diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 00000000..84108c51 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,91 @@ +name: CI + +on: + push: + branches: + - main + - dev + pull_request: + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install numpy + pip install torch + + - name: Install xmake + uses: xmake-io/github-action-setup-xmake@v1 + with: + xmake-version: latest + + - name: configure xmake + run: xmake f --cpu=true -cv + + - name: Set INFINI_ROOT + run: | + export INFINI_ROOT=$GITHUB_WORKSPACE/.infini + mkdir -p $INFINI_ROOT + echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV + + - name: Build with XMake + run: xmake build && xmake install + + - name: Run Python Tests + run: | + GREEN='\033[0;32m' + RED='\033[0;31m' + NC='\033[0m' # No Color + + PASSED_TESTS=() + FAILED_TESTS=() + for script in operatorspy/tests/*.py; do + if [ "$(basename $script)" != "__init__.py" ] && [ "$(basename $script)" != "test_utils.py" ]; then + echo "Running $script" + START_TIME=$(date +%s) + if ! python3 $script --cpu; then + echo "$script failed" + FAILED_TESTS+=($script) + else + echo "$script passed" + PASSED_TESTS+=($script) + fi + END_TIME=$(date +%s) + DURATION=$(( END_TIME - START_TIME )) + MINUTES=$(( DURATION / 60 )) + SECONDS=$(( DURATION % 60 )) + echo "Execution time for $script: ${MINUTES}m ${SECONDS}s" + fi + done + + if [ ${#FAILED_TESTS[@]} -ne 0 ]; then + echo "The following tests passed:" + for test in "${PASSED_TESTS[@]}"; do + echo -e "${GREEN}$test${NC}" + done + echo "The following tests failed:" + for test in "${FAILED_TESTS[@]}"; do + echo -e "${RED}$test${NC}" + done + exit 1 + else + echo "The following tests passed:" + for test in "${PASSED_TESTS[@]}"; do + echo -e "${GREEN}$test${NC}" + done + echo "${GREEN}All tests passed${NC}" + fi + env: + INFINI_ROOT: ${{ env.INFINI_ROOT }} diff --git a/.gitignore b/.gitignore index 45efbbb4..024cd682 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,13 @@ __pycache__/ # Lib lib/ +out/ + +# Log +*.log + +# Cache +cache/ + +# Json +*.json diff --git a/README.md b/README.md index c2778312..674a874f 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,77 @@ -# 算子库 +# InfiniOperators 算子库 -跨平台高性能通用算子库。形式为 C 接口动态库。 +跨平台高性能统一算子库。形式为 C 接口动态库。 -采用二段式算子设计,每个算子都实现并对外暴露以下的 C 接口: +## 简介 -- 第一阶段:构造算子 Descriptor。用户提供的算子名称、硬件、以及算子配置(如计算的数据类型、计算排布等),相应模组会被 load 到硬件上。 +### 算子接口设计 + +采用3+1段式算子设计,每个算子都实现并对外暴露以下的 C 接口: + +- 第一阶段:构造硬件控柄(Handle)。用户提供控柄地址、硬件类型以及硬件序号。控柄所在的内存空间由用户管理。 ```C - void* createOpDescriptor(Device, void *config); + infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, int device, int device_id); ``` -- 第二阶段:计算。根据一阶段的 Descriptor,执行相应计算,用户需要提供输入输出张量,以及硬件计算流(CPU 为 NULL)。 +- 第二阶段:构造算子描述(Descriptor)。用户提供描述符地址、硬件控柄、以及算子涉及的张量描述(含张量数据类型、形状和步长)。这一步会完成算子所需的与张量数据无关的预计算。 ```C - void op(void *descriptor, Tensor output, Tensor input, void *stream); + infiniopStatus_t infiniopCreateOpDescriptor(infiniopHandle_t handle, infiniopOpDescriptor_t *desc_ptr, infiniopTensorDescriptor_t t, ...); ``` -- 销毁 Descriptor。 +- 第三阶段(可选):计算额外工作空间。根据算子描述,计算算子所需的额外工作空间大小,并存储于用户提供的位置。具体空间分配由用户负责。 ```C - void destroyOpDescriptor(void *descriptor); + infiniopStatus_t infiniopGetOpWorkspaceSize(infiniopOpDescriptor_t desc, uint64_t *size); ``` +- 
第四阶段:计算。根据算子描述符,在指定的硬件上执行相应计算,用户需要提供输入输出的数据,以及硬件计算流(CPU 为 NULL)。 + + ```C + infiniopStatus_t infiniopGetOp(infiniopOpDescriptor_t desc, [void *workspace, uint64_t workspace_size,] void *output_data, void *input_data, ..., void *stream); + ``` + +- 销毁描述和硬件控柄。 + + ```C + infiniopStatus_t infiniopDestroyOpDescriptor(infiniopOpDescriptor_t desc); + infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle); + ``` + +### 张量(Tensor)描述设计 + +张量描述由以下几个部分组成: + +1.数据类型,由打包大小(即一个元素代表几个数据)、符号位、元素大小、尾数位数、指数位数共4字节表示。定义如下: + +```C +typedef struct DataLayout { + unsigned short + packed : 8, + sign : 1, + size : 7, + mantissa : 8, + exponent : 8; +} DataLayout; +``` + +2.维度信息。张量有多少个维度。类型为uint64_t。 + +3.张量形状。张量每个维度的大小。类型为uint64_t*。 + +4.张量步长。张量每个维度的步长。类型为uint64_t*。 + +创建和销毁张量描述符的接口: + +```C +infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, DataLayout layout, uint64_t ndim, uint64_t *shape, uint64_t *strides); +infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc); +``` + ## 一、使用说明 -### 配置 +### 1. 配置 #### 查看当前配置 @@ -52,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv xmake f --cambricon-mlu=true -cv ``` -### 编译 +#### 配置 NPU + +````xmake +xmake f --ascend-npu=true -cv +```` + +### 2. 编译安装 ```xmake -xmake +xmake build && xmake install ``` -### 将编译好的算子库添加至环境变量 `INFINI_ROOT` +### 3. 设置环境变量 -```bash -export INFINI_ROOT=[PATH_TO_LIBRARY] -``` +按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。 -### 运行算子测试 +### 4. 运行算子测试 ```bash cd operatorspy/tests -python operator_name.py +python operator_name.py [--cpu | --cuda | --cambricon | --ascend] ``` ## 二、开发说明 @@ -82,6 +133,8 @@ python operator_name.py │   │   ├── [operator_name].h # 对外暴露的算子 C 接口定义,descriptor 定义 │   ├── tensor │   │   ├── tensor_descriptor.h # 对外暴露的张量 descriptor 定义 +│   ├── handle +│   │   ├── handle_export.h # 对外暴露的硬件 handle 定义 │   ├── *.h # 对外暴露的核心结构体定义 ├── src │   ├── devices @@ -105,7 +158,7 @@ python operator_name.py - 在 `src/device.h` 和 `operatorspy/devices.py` 中增加新的硬件类型,注意两者需要一一对应; - 在 `xmake.lua` 中增加新硬件的编译选项以及编译方式; -- 在 `src/ops/devices/[device_name]` 下编写特定硬件的通用代码; +- 在 `src/ops/devices/[device_name]` 下编写特定硬件的handle实现和通用代码; - 实现该硬件的算子; ### 增加新的算子 diff --git a/include/data_type.h b/include/data_type.h index 7767693f..e2f24c4f 100644 --- a/include/data_type.h +++ b/include/data_type.h @@ -8,8 +8,28 @@ typedef struct DataLayout { size : 7, mantissa : 8, exponent : 8; + +#ifdef __cplusplus + bool operator==(const DataLayout &other) const { + union TypePun { + DataLayout layout; + unsigned int i; + } pun; + pun.layout = *this; + auto a_ = pun.i; + pun.layout = other; + auto b_ = pun.i; + return a_ == b_; + } + + bool operator!=(const DataLayout &other) const { + return !(*this == other); + } +#endif } DataLayout; +typedef struct DataLayout DT; + // clang-format off const static struct DataLayout I8 = {1, 1, 1, 7, 0}, diff --git a/include/device.h b/include/device.h index d7f714e0..bdeb1dc9 100644 --- a/include/device.h +++ b/include/device.h @@ -2,9 +2,14 @@ #define __DEVICE_H__ enum DeviceEnum { - DevCpu, - DevNvGpu, - DevCambriconMlu, + DevCpu = 0, + DevNvGpu = 1, + DevCambriconMlu = 2, + DevAscendNpu = 3, + DevMetaxGpu = 4, + DevMthreadsGpu = 5, }; +typedef enum DeviceEnum Device; + #endif// __DEVICE_H__ diff --git a/include/handle.h b/include/handle.h new file mode 100644 index 00000000..d4eeee28 --- /dev/null +++ b/include/handle.h @@ -0,0 +1,12 @@ +#ifndef INFINIOP_HANDLE_H +#define INFINIOP_HANDLE_H + +#include "device.h" + +typedef struct HandleStruct { + Device device; 
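+    // device type associated with this handle (the Device value passed to infiniopCreateHandle)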
+} HandleStruct; + +typedef HandleStruct *infiniopHandle_t; + +#endif diff --git a/include/handle/handle_export.h b/include/handle/handle_export.h new file mode 100644 index 00000000..e6f38cf9 --- /dev/null +++ b/include/handle/handle_export.h @@ -0,0 +1,12 @@ +#ifndef INFINIOP_HANDLE_EXPORT_H +#define INFINIOP_HANDLE_EXPORT_H +#include "../status.h" +#include "../handle.h" +#include "../export.h" +#include "../device.h" + +__C __export infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id); + +__C __export infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle); + +#endif // INFINIOP_HANDLE_EXPORT_H diff --git a/include/infini_operators.h b/include/infini_operators.h index 1167c037..9a5a2555 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -1,6 +1,18 @@ +#include "handle/handle_export.h" +#include "ops/add/add.h" +#include "ops/attention/attention.h" +#include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" +#include "ops/global_avg_pool/global_avg_pool.h" +#include "ops/expand/expand.h" +#include "ops/gemm/gemm.h" +#include "ops/conv/conv.h" #include "ops/matmul/matmul.h" -#include "ops/reform/reform.h" +#include "ops/max_pool/max_pool.h" +#include "ops/mlp/mlp.h" +#include "ops/random_sample/random_sample.h" +#include "ops/rearrange/rearrange.h" +#include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" diff --git a/include/operators.h b/include/operators.h index 1a57a88c..989a1602 100644 --- a/include/operators.h +++ b/include/operators.h @@ -1,11 +1,9 @@ #ifndef __OPERATORS_H__ #define __OPERATORS_H__ -#include "data_type.h" #include "device.h" #include "tensor.h" - -typedef enum DeviceEnum Device; -typedef struct DataLayout DT; +#include "handle.h" +#include "status.h" #endif// __OPERATORS_H__ diff --git a/include/ops/add/add.h b/include/ops/add/add.h new file mode 100644 index 00000000..70da8cd2 --- /dev/null +++ b/include/ops/add/add.h @@ -0,0 +1,27 @@ +#ifndef ADD_H +#define ADD_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct AddDescriptor { + Device device; +} AddDescriptor; + +typedef AddDescriptor *infiniopAddDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); + +#endif diff --git a/include/ops/attention/attention.h b/include/ops/attention/attention.h new file mode 100644 index 00000000..913ca792 --- /dev/null +++ b/include/ops/attention/attention.h @@ -0,0 +1,39 @@ +#ifndef ATTENTION_H +#define ATTENTION_H + +#include "../../export.h" +#include "../../operators.h" +#include "../matmul/matmul.h" +#include "../swiglu/swiglu.h" + +typedef struct AttentionDescriptor { + Device device; +} AttentionDescriptor; + +typedef AttentionDescriptor *infiniopAttentionDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle, + infiniopAttentionDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t q_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, + 
infiniopTensorDescriptor_t k_cache_desc, + infiniopTensorDescriptor_t v_cache_desc, + uint64_t pos); + +__C __export infiniopStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *out, + void const *q, + void const *k, + void const *v, + void *k_cache, + void *v_cache, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc); +#endif diff --git a/include/ops/avg_pool/avg_pool.h b/include/ops/avg_pool/avg_pool.h new file mode 100644 index 00000000..39a4ce3c --- /dev/null +++ b/include/ops/avg_pool/avg_pool.h @@ -0,0 +1,28 @@ +#ifndef AVG_POOL_H +#define AVG_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct AvgPoolDescriptor { + Device device; +} AvgPoolDescriptor; +typedef AvgPoolDescriptor *infiniopAvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); +#endif diff --git a/include/ops/causal_softmax/causal_softmax.h b/include/ops/causal_softmax/causal_softmax.h index 9607374b..86c700f0 100644 --- a/include/ops/causal_softmax/causal_softmax.h +++ b/include/ops/causal_softmax/causal_softmax.h @@ -4,11 +4,25 @@ #include "../../export.h" #include "../../operators.h" -typedef struct CausalSoftmaxDescriptor CausalSoftmaxDescriptor; +typedef struct CausalSoftmaxDescriptor { + Device device; +} CausalSoftmaxDescriptor; -__C __export CausalSoftmaxDescriptor *createCausalSoftmaxDescriptor(Device, void *config); -__C __export void destroyCausalSoftmaxDescriptor(CausalSoftmaxDescriptor *descriptor); -__C __export void causalSoftmax(CausalSoftmaxDescriptor *descriptor, Tensor y, void *stream); +typedef CausalSoftmaxDescriptor *infiniopCausalSoftmaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(infiniopHandle_t handle, + infiniopCausalSoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +__C __export infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc); #endif diff --git a/include/ops/conv/conv.h b/include/ops/conv/conv.h new file mode 100644 index 00000000..12e1b289 --- /dev/null +++ b/include/ops/conv/conv.h @@ -0,0 +1,30 @@ +#ifndef CONV_H +#define CONV_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ConvDescriptor { + Device device; +} ConvDescriptor; + +typedef ConvDescriptor *infiniopConvDescriptor_t; + +__C __export infiniopStatus_t 
infiniopCreateConvDescriptor(infiniopHandle_t handle, + infiniopConvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void *pads, + void *strides, + void *dilations, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream); + +__C __export infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc); + + +#endif diff --git a/include/ops/expand/expand.h b/include/ops/expand/expand.h new file mode 100644 index 00000000..ee28b70c --- /dev/null +++ b/include/ops/expand/expand.h @@ -0,0 +1,25 @@ +#ifndef EXPAND_H +#define EXPAND_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ExpandDescriptor { + Device device; +} ExpandDescriptor; + +typedef ExpandDescriptor *infiniopExpandDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateExpandDescriptor(infiniopHandle_t handle, + infiniopExpandDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniopStatus_t infiniopExpand(infiniopExpandDescriptor_t desc, + void *y, + void const *x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc); + +#endif diff --git a/include/ops/gemm/gemm.h b/include/ops/gemm/gemm.h new file mode 100644 index 00000000..a6eac566 --- /dev/null +++ b/include/ops/gemm/gemm.h @@ -0,0 +1,36 @@ +#ifndef GEMM_H +#define GEMM_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GEMMDescriptor { + Device device; +} GEMMDescriptor; + +typedef GEMMDescriptor *infiniopGEMMDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t handle, + infiniopGEMMDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t c_desc, + float alpha, + float beta, + char transA, + char transB); + +__C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopGEMM(infiniopGEMMDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *a, + void const *b, + void const *c, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGEMMDescriptor(infiniopGEMMDescriptor_t desc); +#endif diff --git a/include/ops/global_avg_pool/global_avg_pool.h b/include/ops/global_avg_pool/global_avg_pool.h new file mode 100644 index 00000000..ba839ecc --- /dev/null +++ b/include/ops/global_avg_pool/global_avg_pool.h @@ -0,0 +1,26 @@ +#ifndef GLOBAL_AVG_POOL_H +#define GLOBAL_AVG_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GlobalAvgPoolDescriptor { + Device device; +} GlobalAvgPoolDescriptor; + +typedef GlobalAvgPoolDescriptor *infiniopGlobalAvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle, + infiniopGlobalAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc, 
+ void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc); + +#endif diff --git a/include/ops/matmul/matmul.h b/include/ops/matmul/matmul.h index 6c80d761..67285683 100644 --- a/include/ops/matmul/matmul.h +++ b/include/ops/matmul/matmul.h @@ -4,12 +4,30 @@ #include "../../export.h" #include "../../operators.h" -typedef struct MatmulDescriptor MatmulDescriptor; +typedef struct MatmulDescriptor { + Device device; +} MatmulDescriptor; -__C __export MatmulDescriptor *createMatmulDescriptor(Device, void *config); +typedef MatmulDescriptor *infiniopMatmulDescriptor_t; -__C __export void destroyMatmulDescriptor(MatmulDescriptor *descriptor); +__C __export infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, + infiniopMatmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); -__C __export void matmul(MatmulDescriptor *descriptor, Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); +__C __export infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc); #endif diff --git a/include/ops/max_pool/max_pool.h b/include/ops/max_pool/max_pool.h new file mode 100644 index 00000000..8828c2c5 --- /dev/null +++ b/include/ops/max_pool/max_pool.h @@ -0,0 +1,28 @@ +#ifndef MAX_POOL_H +#define MAX_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MaxPoolDescriptor { + Device device; +} MaxPoolDescriptor; +typedef MaxPoolDescriptor *infiniopMaxPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); +#endif diff --git a/include/ops/mlp/mlp.h b/include/ops/mlp/mlp.h new file mode 100644 index 00000000..9c4c7dd2 --- /dev/null +++ b/include/ops/mlp/mlp.h @@ -0,0 +1,36 @@ +#ifndef MLP_H +#define MLP_H + +#include "../../export.h" +#include "../../operators.h" +#include "../matmul/matmul.h" +#include "../swiglu/swiglu.h" + +typedef struct MLPDescriptor { + Device device; +} MLPDescriptor; + +typedef MLPDescriptor *infiniopMLPDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle, + infiniopMLPDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w12_desc, + infiniopTensorDescriptor_t w3_desc, + float alpha, + char residual); + +__C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size); + +__C 
__export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w12, + void const *w3, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc); +#endif diff --git a/include/ops/random_sample/random_sample.h b/include/ops/random_sample/random_sample.h new file mode 100644 index 00000000..e48cb7cc --- /dev/null +++ b/include/ops/random_sample/random_sample.h @@ -0,0 +1,31 @@ +#ifndef RANDOM_SAMPLE_H +#define RANDOM_SAMPLE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct RandomSampleDescriptor { + Device device; +} RandomSampleDescriptor; + +typedef RandomSampleDescriptor *infiniopRandomSampleDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); + +__C __export infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc); + + +#endif diff --git a/include/ops/rearrange/rearrange.h b/include/ops/rearrange/rearrange.h new file mode 100644 index 00000000..742c4696 --- /dev/null +++ b/include/ops/rearrange/rearrange.h @@ -0,0 +1,20 @@ +#ifndef REARRANGE_H +#define REARRANGE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct RearrangeDescriptor { + Device device; +} RearrangeDescriptor; +typedef RearrangeDescriptor *infiniopRearrangeDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRearrangeDescriptor(infiniopHandle_t handle, + infiniopRearrangeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +__C __export infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc); +#endif diff --git a/include/ops/reform/reform.h b/include/ops/reform/reform.h deleted file mode 100644 index 1a2af372..00000000 --- a/include/ops/reform/reform.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef REFORM_H -#define REFORM_H - -#include "../../export.h" -#include "../../operators.h" -typedef struct ReformDescriptor ReformDescriptor; - -__C __export ReformDescriptor *createReformDescriptor(Device, void *config); -__C __export void destroyReformDescriptor(ReformDescriptor *descriptor); -__C __export void reform(ReformDescriptor *descriptor, Tensor y, Tensor x, void *stream); - -#endif diff --git a/include/ops/relu/relu.h b/include/ops/relu/relu.h new file mode 100644 index 00000000..9f639b9b --- /dev/null +++ b/include/ops/relu/relu.h @@ -0,0 +1,25 @@ +#ifndef RELU_H +#define RELU_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReluDescriptor { + Device device; +} ReluDescriptor; + +typedef ReluDescriptor *infiniopReluDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle, + infiniopReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t 
x); + +__C __export infiniopStatus_t infiniopRelu(infiniopReluDescriptor_t desc, + void *y, + void const *x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc); + +#endif diff --git a/include/ops/rms_norm/rms_norm.h b/include/ops/rms_norm/rms_norm.h index 71aeffbc..19dc8ad5 100644 --- a/include/ops/rms_norm/rms_norm.h +++ b/include/ops/rms_norm/rms_norm.h @@ -4,10 +4,25 @@ #include "../../export.h" #include "../../operators.h" -typedef struct RMSNormDescriptor RMSNormDescriptor; +typedef struct RMSNormDescriptor { + Device device; +} RMSNormDescriptor; -__C __export void *createRMSNormDescriptor(Device, void *config); -__C __export void destroyRMSNormDescriptor(RMSNormDescriptor *descriptor); -__C __export void rmsNorm(RMSNormDescriptor *descriptor, Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +typedef RMSNormDescriptor *infiniopRMSNormDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRMSNormDescriptor( + infiniopHandle_t handle, + infiniopRMSNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +__C __export infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream); + +__C __export infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc); #endif diff --git a/include/ops/rotary_embedding/rotary_embedding.h b/include/ops/rotary_embedding/rotary_embedding.h index 103b3101..48b85bdd 100644 --- a/include/ops/rotary_embedding/rotary_embedding.h +++ b/include/ops/rotary_embedding/rotary_embedding.h @@ -4,10 +4,29 @@ #include "../../export.h" #include "../../operators.h" -typedef struct RotaryEmbeddingDescriptor RotaryEmbeddingDescriptor; +typedef struct RoPEDescriptor RoPEDescriptor; +typedef RoPEDescriptor *infiniopRoPEDescriptor_t; -__C __export void *createRotaryEmbeddingDescriptor(Device, void *config); -__C __export void destroyRotaryEmbeddingDescriptor(RotaryEmbeddingDescriptor *descriptor); -__C __export void rotaryEmbedding(RotaryEmbeddingDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream); +__C __export infiniopStatus_t infiniopCreateRoPEDescriptor( + infiniopHandle_t handle, + infiniopRoPEDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +__C __export infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRoPE( + infiniopRoPEDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc); #endif diff --git a/include/ops/swiglu/swiglu.h b/include/ops/swiglu/swiglu.h index b181ef87..58ae73b6 100644 --- a/include/ops/swiglu/swiglu.h +++ b/include/ops/swiglu/swiglu.h @@ -4,10 +4,24 @@ #include "../../export.h" #include "../../operators.h" -typedef struct SwigluDescriptor SwigluDescriptor; +typedef struct SwiGLUDescriptor { + Device device; +} SwiGLUDescriptor; -__C __export void *createSwigluDescriptor(Device, void *config); -__C __export 
void destroySwigluDescriptor(SwigluDescriptor *descriptor); -__C __export void swiglu(SwigluDescriptor *descriptor, Tensor gate, Tensor up, void *stream); +typedef SwiGLUDescriptor *infiniopSwiGLUDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, + infiniopSwiGLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +__C __export infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc); #endif diff --git a/include/status.h b/include/status.h new file mode 100644 index 00000000..54acb02a --- /dev/null +++ b/include/status.h @@ -0,0 +1,16 @@ +#ifndef INFINIOP_STATUS_H +#define INFINIOP_STATUS_H + +typedef enum { + STATUS_SUCCESS = 0, + STATUS_EXECUTION_FAILED = 1, + STATUS_BAD_PARAM = 2, + STATUS_BAD_TENSOR_DTYPE = 3, + STATUS_BAD_TENSOR_SHAPE = 4, + STATUS_BAD_TENSOR_STRIDES = 5, + STATUS_MEMORY_NOT_ALLOCATED = 6, + STATUS_INSUFFICIENT_WORKSPACE = 7, + STATUS_BAD_DEVICE = 8, +} infiniopStatus_t; + +#endif diff --git a/include/tensor.h b/include/tensor.h index abe51434..3cc28922 100644 --- a/include/tensor.h +++ b/include/tensor.h @@ -4,20 +4,17 @@ #include "data_type.h" #include -struct TensorLayout { - struct DataLayout dt; +struct TensorDescriptor { + // Datatype + DT dt; + // Number of dimensions uint64_t ndim; + // Shape of the tensor, ndim elements uint64_t *shape; + // Stride of each dimension in elements, ndim elements int64_t *strides; }; -typedef struct TensorLayout *TensorDescriptor; - -struct TensorTuple { - TensorDescriptor const layout; - void *data; -}; - -typedef struct TensorTuple Tensor; +typedef struct TensorDescriptor *infiniopTensorDescriptor_t; #endif// __TENSOR_H__ diff --git a/include/tensor/tensor_descriptor.h b/include/tensor/tensor_descriptor.h index 87b4dd94..2fb9fc1d 100644 --- a/include/tensor/tensor_descriptor.h +++ b/include/tensor/tensor_descriptor.h @@ -3,9 +3,10 @@ #include "../export.h" #include "../tensor.h" +#include "../status.h" -__C __export void createTensorDescriptor(TensorDescriptor* desc_ptr, uint64_t ndim, uint64_t *shape_, int64_t *strides_, DataLayout datatype); +__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, uint64_t ndim, uint64_t const *shape_, int64_t const *strides_, DataLayout datatype); -__C __export void destroyTensorDescriptor(TensorDescriptor desc); +__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc); #endif// TENSOR_DESCRIPTOR_H diff --git a/operatorspy/__init__.py b/operatorspy/__init__.py index f4935b7f..abb67be9 100644 --- a/operatorspy/__init__.py +++ b/operatorspy/__init__.py @@ -1,5 +1,7 @@ import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))) -from .liboperators import open_lib, to_tensor, CTensor +from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t from .devices import DeviceEnum +from .utils import * +from .data_layout import * diff --git a/operatorspy/devices.py b/operatorspy/devices.py index 446bc37f..23bd2a5c 100644 --- a/operatorspy/devices.py +++ b/operatorspy/devices.py @@ -2,3 +2,6 @@ class DeviceEnum: DEVICE_CPU = 0 DEVICE_CUDA = 1 DEVICE_BANG = 2 + DEVICE_ASCEND = 3 + DEVICE_MACA = 4 + DEVICE_MUSA = 5 diff --git 
a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 80bb640f..0909c0cf 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -1,35 +1,50 @@ import os import platform import ctypes -from ctypes import c_void_p, c_int, c_int64, c_uint64, Structure, POINTER +from ctypes import c_int, c_int64, c_uint64, Structure, POINTER from .data_layout import * +from .devices import * Device = c_int Optype = c_int -LIB_OPERATORS_DIR = "INFINI_ROOT" +LIB_OPERATORS_DIR = os.path.join(os.environ.get("INFINI_ROOT"), "lib") -class TensorLayout(Structure): +class TensorDescriptor(Structure): _fields_ = [ ("dt", DataLayout), ("ndim", c_uint64), ("shape", POINTER(c_uint64)), - ("pattern", POINTER(c_int64)), + ("strides", POINTER(c_int64)), ] + def invalidate(self): + for i in range(self.ndim): + self.shape[i] = 0 + self.strides[i] = 0 -TensorDescriptor = ctypes.POINTER(TensorLayout) +infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor) -class CTensor(Structure): - _fields_ = [("layout", TensorDescriptor), ("data", c_void_p)] + +class CTensor: + def __init__(self, desc, data): + self.descriptor = desc + self.data = data + + +class Handle(Structure): + _fields_ = [("device", c_int)] + + +infiniopHandle_t = POINTER(Handle) # Open operators library def open_lib(): def find_library_in_ld_path(library_name): - ld_library_path = os.environ.get(LIB_OPERATORS_DIR, "") + ld_library_path = LIB_OPERATORS_DIR paths = ld_library_path.split(os.pathsep) for path in paths: full_path = os.path.join(path, library_name) @@ -39,64 +54,25 @@ def find_library_in_ld_path(library_name): system_name = platform.system() # Load the library - if system_name == 'Windows': - library_path = find_library_in_ld_path("operators.dll") - elif system_name == 'Linux': - library_path = find_library_in_ld_path("liboperators.so") + if system_name == "Windows": + library_path = find_library_in_ld_path("infiniop.dll") + elif system_name == "Linux": + library_path = find_library_in_ld_path("libinfiniop.so") assert ( library_path is not None - ), f"Cannot find operators.dll or liboperators.so. Check if {LIB_OPERATORS_DIR} is set correctly." + ), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly." 
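+    # Load the shared library; the argtypes/restype registrations that follow tell ctypes the
+    # C signatures from include/, so 64-bit sizes and pointers are marshalled correctly.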
lib = ctypes.CDLL(library_path) - lib.createTensorDescriptor.argtypes = [ - POINTER(POINTER(TensorLayout)), + lib.infiniopCreateTensorDescriptor.argtypes = [ + POINTER(infiniopTensorDescriptor_t), c_uint64, POINTER(c_uint64), POINTER(c_int64), DataLayout, ] - return lib + lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int] + lib.infiniopCreateHandle.restype = c_int + lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t] + lib.infiniopDestroyHandle.restype = c_int - -# Convert PyTorch tensor to library Tensor -def to_tensor(tensor, lib, shape = None, strides = None): - import torch - - ndim = tensor.ndimension() - if shape is None: - shape = (ctypes.c_uint64 * ndim)(*tensor.shape) - else: - shape = (ctypes.c_uint64 * ndim)(*shape) - # Get strides in bytes - if strides is None: - strides = (ctypes.c_int64 * ndim)( - *(s * tensor.element_size() for s in tensor.stride()) - ) - else: - strides = (ctypes.c_int64 * ndim)(*strides) - data_ptr = tensor.data_ptr() - # fmt: off - dt = ( - I8 if tensor.dtype == torch.int8 else - I16 if tensor.dtype == torch.int16 else - I32 if tensor.dtype == torch.int32 else - I64 if tensor.dtype == torch.int64 else - U8 if tensor.dtype == torch.uint8 else - F16 if tensor.dtype == torch.float16 else - BF16 if tensor.dtype == torch.bfloat16 else - F32 if tensor.dtype == torch.float32 else - F64 if tensor.dtype == torch.float64 else - # TODO: These following types may not be supported by older - # versions of PyTorch. - U16 if tensor.dtype == torch.uint16 else - U32 if tensor.dtype == torch.uint32 else - U64 if tensor.dtype == torch.uint64 else - None - ) - # fmt: on - assert dt is not None - # Create TensorDecriptor - tensor_desc = TensorDescriptor() - lib.createTensorDescriptor(ctypes.byref(tensor_desc), ndim, shape, strides, dt) - # Create Tensor - return CTensor(tensor_desc, ctypes.c_void_p(data_ptr)) + return lib diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py new file mode 100644 index 00000000..da9c58c9 --- /dev/null +++ b/operatorspy/tests/add.py @@ -0,0 +1,180 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +class AddDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAddDescriptor_t = POINTER(AddDescriptor) + + +def add(x, y): + return torch.add(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + ans = add(a, b) + + a_tensor = to_tensor(a, lib) + 
b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopAddDescriptor_t() + + check_error( + lib.infiniopCreateAddDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + c_tensor.descriptor.contents.invalidate() + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyAddDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "musa", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "musa", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), + ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateAddDescriptor.restype = c_int32 
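+    # Argument and return types must mirror include/ops/add/add.h so ctypes marshals the calls
+    # correctly (the c_int32 restype carries the infiniopStatus_t value).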
+ lib.infiniopCreateAddDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAddDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopAdd.restype = c_int32 + lib.infiniopAdd.argtypes = [ + infiniopAddDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAddDescriptor.restype = c_int32 + lib.infiniopDestroyAddDescriptor.argtypes = [ + infiniopAddDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/attention.py b/operatorspy/tests/attention.py new file mode 100644 index 00000000..f5449aaa --- /dev/null +++ b/operatorspy/tests/attention.py @@ -0,0 +1,417 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args +import torch +import torch.nn.functional as F + + +class AttentionDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor) + + +def causal_softmax(x): + type = x.dtype + mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) + y = x.clone() + masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32)) + return torch.nn.functional.softmax(masked, dim=-1).to(type) + + +def attention(q, k, v, k_cache, v_cache, pos): + type = q.dtype + + n_q_head = q.shape[0] + n_kv_head = k.shape[0] + + # Concatenate key and value caches + k_cache = k_cache[:, :pos, :] # (n_kv_head, pos, head_dim) + v_cache = v_cache[:, :pos, :] # (n_kv_head, pos, head_dim) + k = torch.cat([k_cache, k], dim=1) # (n_kv_head, total_seq_len, head_dim) + v = torch.cat([v_cache, v], dim=1) # (n_kv_head, total_seq_len, head_dim) + + total_seq_len = k.shape[1] + + head_dim = v.shape[-1] + + if n_q_head != n_kv_head: + q = q.reshape( + n_kv_head, -1, head_dim + ) # (n_kv_head, n_group * seq_len, head_dim) + + # Scaled dot-product attention + attn_scores = ( + torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32)) + .to(type) + .reshape(n_q_head, -1, total_seq_len) + ) # (n_q_head, seq_len, total_seq_len) + attn_scores = attn_scores / (head_dim**0.5) + + attn_weights = causal_softmax(attn_scores).reshape( + n_kv_head, -1, total_seq_len + ) # (n_kv_head, seq_len, total_seq_len) + + # Weighted sum of values + attn_output = ( + torch.einsum( + "hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32) + ) + .to(type) + .reshape(n_q_head, -1, head_dim) + .permute(1, 0, 2) + ) # ([seq_len, n_q_head, head_dim]) + + return attn_output + + +def test( + lib, + handle, + torch_device, + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype=torch.float16, + q_stride=None, + k_stride=None, + v_stride=None, + k_cache_stride=None, + v_cache_stride=None, +): + print( + f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} 
seq_len:{seq_len} head_dim:{head_dim} pos:{pos} " + f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}" + ) + + out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device) + q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + k_cache = ( + torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device) + * 0.1 + ) + v_cache = ( + torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device) + * 0.1 + ) + + ans = attention(q, k, v, k_cache, v_cache, pos) + + if q_stride is not None: + q = rearrange_tensor(q, q_stride) + if k_stride is not None: + k = rearrange_tensor(k, k_stride) + if v_stride is not None: + v = rearrange_tensor(v, v_stride) + if k_cache_stride is not None: + k_cache = rearrange_tensor(k_cache, k_cache_stride) + if v_cache_stride is not None: + v_cache = rearrange_tensor(v_cache, v_cache_stride) + + out_tensor = to_tensor(out, lib) + q_tensor = to_tensor(q, lib) + k_tensor = to_tensor(k, lib) + v_tensor = to_tensor(v, lib) + k_cache_tensor = to_tensor(k_cache, lib) + v_cache_tensor = to_tensor(v_cache, lib) + + descriptor = infiniopAttentionDescriptor_t() + check_error( + lib.infiniopCreateAttentionDescriptor( + handle, + ctypes.byref(descriptor), + out_tensor.descriptor, + q_tensor.descriptor, + k_tensor.descriptor, + v_tensor.descriptor, + k_cache_tensor.descriptor, + v_cache_tensor.descriptor, + pos, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + out_tensor.descriptor.contents.invalidate() + q_tensor.descriptor.contents.invalidate() + k_tensor.descriptor.contents.invalidate() + v_tensor.descriptor.contents.invalidate() + k_cache_tensor.descriptor.contents.invalidate() + v_cache_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, out.device) + + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + + assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2) + + check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + test( + lib, + handle, + "cpu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + 
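+        # same configurations as the CPU run, executed on the CUDA backend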
test( + lib, + handle, + "cuda", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + test( + lib, + handle, + "mlu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # prefill + ( + 32, # n_q_head + 4, # n_kv_head + 5, # seq_len + 64, # head_dim + 0, # pos + 2048, # k_cache_buf_len + 2048, # v_cache_buf_len + torch.float16, # dtype + [64, 2560, 1], # q_stride + [64, 2560, 1], # k_stride + [64, 2560, 1], # v_stride + [64, 11264, 1], # k_cache_stride + [64, 11264, 1], # v_cache_stride + ), + # decode + ( + 32, # n_q_head + 4, # n_kv_head + 1, # seq_len + 64, # head_dim + 3, # pos + 2048, # k_cache_buf_len + 2048, # v_cache_buf_len + torch.float16, # dtype + [64, 2560, 1], # q_stride + [64, 2560, 1], # k_stride + [64, 2560, 1], # v_stride + [64, 11264, 1], # k_cache_stride + [64, 11264, 1], # v_cache_stride + ), + # for test + ( + 8, # n_q_head + 4, # n_kv_head + 2, # seq_len + 16, # head_dim + 1, # pos + 8, # k_cache_buf_len + 8, # v_cache_buf_len + torch.float16, # dtype + None, # q_stride + None, # k_stride + None, # v_stride + None, # k_cache_stride + None, # v_cache_stride + ), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateAttentionDescriptor.restype = c_int32 + lib.infiniopCreateAttentionDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAttentionDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_uint64, + ] + + lib.infiniopGetAttentionWorkspaceSize.restype = c_int32 + lib.infiniopGetAttentionWorkspaceSize.argtypes = [ + infiniopAttentionDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopAttention.restype = c_int32 + lib.infiniopAttention.argtypes = [ + infiniopAttentionDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAttentionDescriptor.restype = c_int32 + lib.infiniopDestroyAttentionDescriptor.argtypes = [ + infiniopAttentionDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py new file mode 100644 index 00000000..9c240789 --- /dev/null +++ b/operatorspy/tests/avg_pool.py @@ -0,0 +1,239 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + 
destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class AvgPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor) + + +def pool(x, k, padding, stride, dilation = 1): + pooling_layers = { + 1: torch.nn.AvgPool1d, + 2: torch.nn.AvgPool2d, + 3: torch.nn.AvgPool3d, + } + + ndim = len(x.shape) - 2 + if ndim not in pooling_layers: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + if ndim == 3 and x.dtype == torch.float16: + ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16) + else: + ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x) + if PROFILE: + torch.cuda.synchronize() + return ans + + +def inferShape(x_shape, kernel_shape, padding, strides): + assert ( + len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides) + ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel" + input_shape = x_shape[2:] + output_shape = [] + + for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides): + output_dim = (dim + 2 * p - k) // s + 1 + output_shape.append(output_dim) + + return x_shape[:2] + tuple(output_shape) + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + k_shape, + padding, + strides, + tensor_dtype=torch.float16, +): + print( + f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pool(x, k_shape, padding, strides) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopAvgPoolDescriptor_t() + + check_error( + lib.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_void_p(k_shape), + tuple_to_void_p(padding), + tuple_to_void_p(strides), + len(k_shape), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + 
y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, kernel_shape, padding, strides + ((1, 1, 10), (3,), (1,), (1,)), + ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), + ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAvgPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopAvgPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopAvgPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopAvgPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/causal_softmax.py b/operatorspy/tests/causal_softmax.py index 09c15fec..b7cabc4a 100644 --- a/operatorspy/tests/causal_softmax.py +++ b/operatorspy/tests/causal_softmax.py @@ -1,20 +1,34 @@ -from ctypes import c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p import ctypes import sys import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + 
check_error, + rearrange_tensor, + create_workspace, ) from operatorspy.tests.test_utils import get_args import torch +class CausalSoftmaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor) + + def causal_softmax(x): type = x.dtype mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) @@ -23,49 +37,142 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test(lib, descriptor, torch_device): - x = torch.rand((32, 20, 512), dtype=torch.float16).to(torch_device) +def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16): + print( + f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}" + ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) + if x_stride is not None: + x = rearrange_tensor(x, x_stride) ans = causal_softmax(x) - lib.causalSoftmax(descriptor, to_tensor(x, lib), None) - assert torch.allclose(x, ans, atol=0, rtol=1e-3) - print("Test passed!") + x_tensor = to_tensor(x, lib) + descriptor = infiniopCausalSoftmaxDescriptor_t() + check_error( + lib.infiniopCreateCausalSoftmaxDescriptor( + handle, ctypes.byref(descriptor), x_tensor.descriptor + ) + ) + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetCausalSoftmaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + workspace = create_workspace(workspace_size.value, x.device) + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + assert torch.allclose(x, ans, atol=0, rtol=1e-2) + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_cpu(lib): + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createCausalSoftmaxDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "cpu", x_shape, x_stride) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createCausalSoftmaxDescriptor(device, config) - test(lib, descriptor, "cuda") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "cuda", x_shape, x_stride) + destroy_handle(lib, handle) + -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu + device = DeviceEnum.DEVICE_BANG - descriptor = lib.createCausalSoftmaxDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "mlu", x_shape, x_stride) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "npu", x_shape, x_stride) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, 
handle, "cuda", x_shape, x_stride) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "musa", x_shape, x_stride) + + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # x_shape, x_stride + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 + ] args = get_args() lib = open_lib() - lib.createCausalSoftmaxDescriptor.restype = c_void_p - lib.destroyCausalSoftmaxDescriptor.argtypes = [c_void_p] - lib.causalSoftmax.argtypes = [ + lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 + lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopCausalSoftmaxDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopCausalSoftmax.restype = c_int32 + lib.infiniopCausalSoftmax.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + c_void_p, + c_uint64, c_void_p, - CTensor, c_void_p, ] + lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32 + lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + ] + if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py new file mode 100644 index 00000000..7e7ea953 --- /dev/null +++ b/operatorspy/tests/conv.py @@ -0,0 +1,297 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +import math +import ctypes +from torch.nn import functional as F +from typing import List, Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ConvDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopConvDescriptor_t = POINTER(ConvDescriptor) + + +def conv(x, w, stride, padding, dilation): + match len(x.shape) - 2: + case 1: + return F.conv1d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case 2: + return F.conv2d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case 3: + return F.conv3d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case _: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + +# infer the shape of the output given the inputs for a N-ary convolution +def inferShape( + x_shape: List[int], + w_shape: List[int], + pads: List[int], + 
strides: List[int], + dilations: List[int], +) -> Tuple[int, ...]: + assert ( + len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2 + ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2" + output_dims = [ + math.floor( + (x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1) + / strides[i] + + 1 + ) + for i in range(len(pads)) + ] + return (x_shape[0], w_shape[0]) + tuple(output_dims) + + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + lib, + handle, + torch_device, + x_shape, + w_shape, + pads, + strides, + dilations, + tensor_stride=None, + tensor_dtype=torch.float16, +): + assert len(pads) == len(strides) == len(dilations) + print( + f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}" + ) + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device) + y = torch.zeros( + inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype + ).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = conv(x, w, strides, pads, dilations) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = conv(x, w, strides, pads, dilations) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + w_tensor = to_tensor(w, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopConvDescriptor_t() + + check_error( + lib.infiniopCreateConvDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w_tensor.descriptor, + tuple_to_void_p(pads), + tuple_to_void_p(strides), + tuple_to_void_p(dilations), + len(pads), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + w_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + else: + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyConvDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, 
dilations, x_strides in test_cases: + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, w_shape, pads, strides, dilations, x_strides + ( + (32, 3, 4), + (32, 3, 5), + (1,), + (1,), + (1,), + None, + ), + ( + (1, 3, 4, 4), + (2, 3, 3, 3), + (1, 1), + (1, 2), + (2, 1), + None, + ), + ( + (32, 3, 128, 128), + (64, 3, 5, 5), + (2, 2), + (2, 2), + (1, 1), + None, + ), + ( + (1, 1, 4, 4, 4), + (1, 1, 5, 5, 5), + (1, 1, 1), + (1, 1, 1), + (1, 1, 1), + None, + ), + ( + (32, 3, 32, 32, 32), + (64, 3, 5, 5, 5), + (3, 2, 2), + (4, 3, 3), + (2, 2, 1), + None, + ), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateConvDescriptor.restype = c_int32 + lib.infiniopCreateConvDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopConvDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopConv.restype = c_int32 + lib.infiniopConv.argtypes = [ + infiniopConvDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyConvDescriptor.restype = c_int32 + lib.infiniopDestroyConvDescriptor.argtypes = [ + infiniopConvDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/expand.py b/operatorspy/tests/expand.py new file mode 100644 index 00000000..87365c05 --- /dev/null +++ b/operatorspy/tests/expand.py @@ -0,0 +1,191 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ExpandDescriptor(Structure): + _fields_ = 
[("device", c_int32)] + + +infiniopExpandDescriptor_t = POINTER(ExpandDescriptor) + + +def expand(x, y): + if PROFILE: + ans = x.expand_as(y).clone() + torch.cuda.synchronize() + return ans + return x.expand_as(y) + + +def test( + lib, + handle, + torch_device, + y_shape, + x_shape, + y_stride=None, + x_stride=None, + tensor_dtype=torch.float16, +): + print( + f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device) + + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = expand(x, y) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = expand(x, y) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopExpandDescriptor_t() + + check_error( + lib.infiniopCreateExpandDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyExpandDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "musa", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "musa", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) 
+ + +if __name__ == "__main__": + test_cases = [ + # y_shape, x_shape, y_stride, x_stride + ((), (), None, None), + ((3, 3), (1,), None, None), + ((5, 4, 3), (4, 3,), None, (6, 1)), + ((99, 111), (111,), None, None), + ((2, 4, 3), (1, 3), None, None), + ((2, 20, 3), (2, 1, 3), None, None), + ((2, 3, 4, 5), (5,), None, None), + ((3, 2, 4, 5), (3, 2, 1, 1), None, None), + ((32, 256, 112, 112), (32, 256, 112, 1), None, None), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateExpandDescriptor.restype = c_int32 + lib.infiniopCreateExpandDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopExpandDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopExpand.restype = c_int32 + lib.infiniopExpand.argtypes = [ + infiniopExpandDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpandDescriptor.restype = c_int32 + lib.infiniopDestroyExpandDescriptor.argtypes = [ + infiniopExpandDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py new file mode 100644 index 00000000..5da99eac --- /dev/null +++ b/operatorspy/tests/gemm.py @@ -0,0 +1,374 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class GEMMDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor) + + +def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32): + A = A.T if transA else A + B = B.T if transB else B + result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype) + if C is not None: + result += beta * C if dtype != torch.float16 else C.to(torch.float32) + if PROFILE: + torch.cuda.synchronize() + return result + + +def test( + lib, + handle, + torch_device, + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride=None, + b_stride=None, + c_stride=None, + y_stride=None, + dtype=torch.float16, +): + print( + f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} " + f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} " + f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}" + ) + + a = torch.rand(a_shape, dtype=dtype).to(torch_device) + b = torch.rand(b_shape, dtype=dtype).to(torch_device) + c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None + y = torch.rand(y_shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = 
rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None and c is not None: + c = rearrange_tensor(c, c_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) if c is not None else None + y_tensor = to_tensor(y, lib) + descriptor = infiniopGEMMDescriptor_t() + check_error( + lib.infiniopCreateGEMMDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + c_tensor.descriptor if c_tensor else None, + alpha, + beta, + transA, + transB, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + if c_tensor is not None: + c_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspace_size = ctypes.c_uint64(0) + check_error( + lib.infiniopGetGEMMWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to( + torch_device + ) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + check_error(lib.infiniopDestroyGEMMDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + 
destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride + ( + 1.0, + 1.0, + False, + False, + (1, 2048), + (2048, 2048), + (1, 2048), + (1, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + True, + (2048, 4), + (2048, 2048), + (4, 2048), + (4, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + False, + True, + (1, 2048), + (1000, 2048), + (1000), + (1, 1000), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + False, + (2048, 4), + (2048, 2048), + (2048), + (4, 2048), + (4096, 1), + (4096, 1), + (2,), + (4096, 1), + ), + ( + 1.0, + 1.0, + False, + False, + (3, 1, 2048), + (3, 2048, 2048), + (1,), + (3, 1, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + False, + (2048, 4), + (2048, 2048), + None, + (4, 2048), + (4096, 1), + (4096, 1), + (2,), + (4096, 1), + ), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateGEMMDescriptor.restype = c_int32 + lib.infiniopCreateGEMMDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGEMMDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + c_bool, + c_bool, + ] + + lib.infiniopGetGEMMWorkspaceSize.restype = c_int32 + lib.infiniopGetGEMMWorkspaceSize.argtypes = [ + infiniopGEMMDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopGEMM.restype = c_int32 + lib.infiniopGEMM.argtypes = [ + infiniopGEMMDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGEMMDescriptor.restype = c_int32 + lib.infiniopDestroyGEMMDescriptor.argtypes = [ + infiniopGEMMDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py new file mode 100644 index 00000000..33f7b64d --- /dev/null +++ b/operatorspy/tests/global_avg_pool.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch, time + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA 
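+# When PROFILE is True, each test performs NUM_PRERUN warm-up runs and then
+# averages the wall-clock time over NUM_ITERATIONS timed runs.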
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class GlobalAvgPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor) + + +def inferShape(x): + return x.shape[:2] + (1,) * (x.dim() - 2) + + +def globalAvgPool(x): + y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True) + if PROFILE: + torch.cuda.synchronize() + return y.view(*inferShape(x)) + + +def test( + lib, + handle, + torch_device, + x_shape, + tensor_dtype=torch.float16, +): + print( + f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = globalAvgPool(x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = globalAvgPool(x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopGlobalAvgPoolDescriptor_t() + + check_error( + lib.infiniopCreateGlobalAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetGlobalAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspaceSize) + ) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to( + torch_device + ) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape + ((1, 3, 3)), + ((1, 3, 1, 1, 3)), + ((1, 3, 1, 1, 257)), + ((1, 2, 1, 1, 514)), + ((1, 3, 1, 1, 1025)), + ((32, 
256, 1, 112, 112)), + ((2, 3, 2048000)), + ((2, 1, 10243)), + ((2, 20, 100)), + ((3, 33, 333)), + ((32, 20, 512)), + ((3, 3, 11, 11, 11, 3, 2)), + ((32, 256, 1, 112, 112)), + ((32, 256, 112, 112)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGlobalAvgPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopGlobalAvgPool.restype = c_int32 + lib.infiniopGlobalAvgPool.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index 9dce5f31..31076fb5 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -1,6 +1,8 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -8,81 +10,416 @@ to_tensor, CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 -def matmul(c, beta, a, b, alpha): +class MatmulDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor) + +def matmul(_c, beta, _a, _b, alpha): + a = _a.clone() + b = _b.clone() + c = _c.clone() input_dtype = c.dtype - return ( + ans = ( alpha * torch.matmul(a.to(torch.float32), b.to(torch.float32)).to(input_dtype) + beta * c ) + return ans -def test(lib, descriptor, torch_device): - c = torch.zeros((1, 2048), dtype=torch.float16).to(torch_device) - a = torch.rand((1, 2048), dtype=torch.float16).to(torch_device) - b = torch.rand((2048, 2048), dtype=torch.float16).to(torch_device) +def test( + lib, + handle, + torch_device, + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride=None, + b_stride=None, + c_stride=None, + dtype=torch.float16, +): + print( + f"Testing Matmul on {torch_device} with a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape}" + f" a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" + ) - beta = 0.0 - alpha = 1.0 + a = torch.rand(a_shape, dtype=dtype).to(torch_device) + b = torch.rand(b_shape, dtype=dtype).to(torch_device) + c = torch.ones(c_shape, dtype=dtype).to(torch_device) ans = matmul(c, beta, a, b, alpha) - lib.matmul( - descriptor, - to_tensor(c, lib), - beta, - to_tensor(a, lib), - to_tensor(b, lib), - alpha, - None, + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if 
b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None: + c = rearrange_tensor(c, c_stride) + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) + descriptor = infiniopMatmulDescriptor_t() + check_error( + lib.infiniopCreateMatmulDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + alpha, + a_tensor.descriptor, + b_tensor.descriptor, + beta + ) ) - assert torch.allclose(c, ans, atol=0, rtol=1e-3) - print("Test passed!") + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, a.device) + + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + + assert torch.allclose(c, ans, atol=0, rtol=1e-2) + + if PROFILE: + for i in range(NUM_PRERUN): + _ = matmul(c, beta, a, b, alpha) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = matmul(c, beta, a, b, alpha) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + + check_error(lib.infiniopDestroyMatmulDescriptor(descriptor)) -def test_cpu(lib): +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroyMatmulDescriptor(descriptor) + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cpu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroyMatmulDescriptor(descriptor) + destroy_handle(lib, handle) -def test_bang(lib): + +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, 
descriptor, "mlu") - lib.destroyMatmulDescriptor(descriptor) + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "mlu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "npu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "musa", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) if __name__ == "__main__": + test_cases = [ + # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float16), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float32), + (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float16), + (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float32), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float16), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float32), + (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float16), + (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float32), + (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float16), + (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float32), + ] args = get_args() lib = open_lib() - lib.createMatmulDescriptor.restype = c_void_p - lib.destroyMatmulDescriptor.argtypes = [c_void_p] - lib.matmul.argtypes = [ - c_void_p, - CTensor, - c_float, - CTensor, - CTensor, + + lib.infiniopCreateMatmulDescriptor.restype = c_int32 + lib.infiniopCreateMatmulDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMatmulDescriptor_t), + infiniopTensorDescriptor_t, c_float, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float + ] + + lib.infiniopGetMatmulWorkspaceSize.restype = c_int32 + lib.infiniopGetMatmulWorkspaceSize.argtypes = [ + infiniopMatmulDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopMatmul.restype = c_int32 + lib.infiniopMatmul.argtypes = [ + infiniopMatmulDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, c_void_p, ] + + lib.infiniopDestroyMatmulDescriptor.restype = c_int32 + 
lib.infiniopDestroyMatmulDescriptor.argtypes = [ + infiniopMatmulDescriptor_t, + ] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py new file mode 100644 index 00000000..ffc0bb19 --- /dev/null +++ b/operatorspy/tests/max_pool.py @@ -0,0 +1,236 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class MaxPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor) + + +def pool(x, k, padding, stride, dilation = 1): + pooling_layers = { + 1: torch.nn.MaxPool1d, + 2: torch.nn.MaxPool2d, + 3: torch.nn.MaxPool3d, + } + + ndim = len(x.shape) - 2 + if ndim not in pooling_layers: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x) + if PROFILE: + torch.cuda.synchronize() + return ans + + +def inferShape(x_shape, kernel_shape, padding, strides): + assert ( + len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides) + ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel" + input_shape = x_shape[2:] + output_shape = [] + + for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides): + output_dim = (dim + 2 * p - k) // s + 1 + output_shape.append(output_dim) + + return x_shape[:2] + tuple(output_shape) + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + k_shape, + padding, + strides, + tensor_dtype=torch.float16, +): + print( + f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pool(x, k_shape, padding, strides) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = 
to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopMaxPoolDescriptor_t() + + check_error( + lib.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_void_p(k_shape), + tuple_to_void_p(padding), + tuple_to_void_p(strides), + len(k_shape), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, kernel_shape, padding, strides + ((1, 1, 10), (3,), (1,), (1,)), + ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), + ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMaxPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopMaxPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopMaxPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + 
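+    # The c_void_p entries above correspond to (workspace, y, x, stream) in the
+    # lib.infiniopMaxPool call made inside test().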
lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopMaxPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py new file mode 100644 index 00000000..668d7861 --- /dev/null +++ b/operatorspy/tests/mlp.py @@ -0,0 +1,316 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args +import torch +import torch.nn as nn + + +class MLPDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMLPDescriptor_t = POINTER(MLPDescriptor) + + +def swiglu(a, b): + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) + + +def mlp(y, x, w12, w3, alpha, residual): + input_dtype = x.dtype + + intermediate_size = w3.shape[0] + + a = torch.matmul( + x.to(torch.float32), w12[:, intermediate_size:].to(torch.float32) + ).to(input_dtype) + b = torch.matmul( + x.to(torch.float32), w12[:, 0:intermediate_size].to(torch.float32) + ).to(input_dtype) + c = swiglu(a, b) + d = torch.matmul(c.to(torch.float32), alpha * w3.to(torch.float32)).to(input_dtype) + out = d + y if residual else d + return out + + +def test( + lib, + handle, + torch_device, + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype=torch.float16, + x_stride=None, + y_stride=None, + w12_stride=None, + w3_stride=None, +): + print( + f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}" + f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}" + ) + + y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01 + x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01 + w12 = ( + torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device) + * 0.01 + ) + w3 = ( + torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device) + * 0.01 + ) + + ans = mlp(y, x, w12, w3, alpha, residual) + + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + if w12_stride is not None: + w12 = rearrange_tensor(w12, w12_stride) + if w3_stride is not None: + w3 = rearrange_tensor(w3, w3_stride) + + y_tensor = to_tensor(y, lib) + x_tensor = to_tensor(x, lib) + w12_tensor = to_tensor(w12, lib) + w3_tensor = to_tensor(w3, lib) + descriptor = infiniopMLPDescriptor_t() + check_error( + lib.infiniopCreateMLPDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w12_tensor.descriptor, + w3_tensor.descriptor, + alpha, + residual, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + y_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + 
w12_tensor.descriptor.contents.invalidate() + w3_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, x.device) + + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + assert torch.allclose(y, ans, atol=0, rtol=2e-2) + + check_error(lib.infiniopDestroyMLPDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "cpu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "cuda", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "mlu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride + (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None), + (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None), + ( + 4, + 4096, + 11008, + 1.0, + True, + torch.float16, + None, + None, + [1, 4096], + [1, 11008], + ), + (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None), + (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateMLPDescriptor.restype = c_int32 + lib.infiniopCreateMLPDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMLPDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_bool, + ] + + lib.infiniopGetMLPWorkspaceSize.restype = c_int32 + lib.infiniopGetMLPWorkspaceSize.argtypes = [ + infiniopMLPDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopMLP.restype = c_int32 + lib.infiniopMLP.argtypes = [ + infiniopMLPDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMLPDescriptor.restype = c_int32 + lib.infiniopDestroyMLPDescriptor.argtypes = [ + infiniopMLPDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, 
test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py new file mode 100644 index 00000000..85a3c681 --- /dev/null +++ b/operatorspy/tests/random_sample.py @@ -0,0 +1,250 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, + U64, +) + +from operatorspy.tests.test_utils import get_args +import torch + + +class RandomSampleDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor) + + +def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): + indices = torch.zeros([topk], dtype = torch.int64) + dataNp = data.clone().detach() + sorted_indices = torch.arange(voc) + + for i in range(topk): + for j in range(i + 1, voc): + if(dataNp[i] < dataNp[j]): + tmp = dataNp[i].clone().detach() + dataNp[i] = dataNp[j].clone().detach() + dataNp[j] = tmp + + tmpInd = sorted_indices[i].clone().detach() + sorted_indices[i] = sorted_indices[j].clone().detach() + sorted_indices[j] = tmpInd + + #sorted_indices = torch.argsort(dataNp, descending=True) + indices = sorted_indices[:topk] + + dataNp = dataNp[sorted_indices] + + globalM = dataNp[0] + dataNp = (dataNp - globalM) / temperature + dataNp = torch.softmax(dataNp.float(), dim = 0) + sum_s = 0 + for end in range(topk): + sum_s += dataNp[end] + if(sum_s >= topp): + break + if(end < topk - 1): + end += 1 + else: + end = topk + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + random_val *= sum_s + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + if(random_val < sum_s): + return indices[i] + +def random_sample_0(data): + return torch.argmax(data) + +def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16): + print( + f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}" + ) + data = torch.arange(voc).float() * 0.0001 + _perm = torch.randperm(voc) + if (torch_device == 'maca'): + data = data[_perm].to(x_dtype).to('cuda') + else: + data = data[_perm].to(x_dtype).to(torch_device) + if(topp > 0 and topk > 1): + ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + ans = random_sample_0(data) + if(torch_device == 'maca'): + indices = torch.zeros([1], dtype = torch.int64).to('cuda') + else: + indices = torch.zeros([1], dtype = torch.int64).to(torch_device) + x_tensor = to_tensor(data, lib) + indices_tensor = to_tensor(indices, lib) + indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 + + descriptor = infiniopRandomSampleDescriptor_t() + check_error( + lib.infiniopCreateRandomSampleDescriptor( + handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRandomSampleWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + 
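+    # Allocate a workspace of the size reported by the library on the test device.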
workspace = create_workspace(workspace_size.value, torch_device) + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + if torch_device == "npu": + torch.npu.synchronize() + + assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "npu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "maca", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "musa", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # voc, random_val, topp, topk, temperature + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 + lib.infiniopCreateRandomSampleDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRandomSampleDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32 + lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [ + infiniopRandomSampleDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopRandomSample.restype = c_int32 + lib.infiniopRandomSample.argtypes = [ + infiniopRandomSampleDescriptor_t, + c_void_p, + c_uint64, + c_uint64, + c_void_p, + c_float, + c_float, + c_int32, + c_float, + c_void_p, + ] + lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32 + lib.infiniopDestroyRandomSampleDescriptor.argtypes = [ + infiniopRandomSampleDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if 
args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py new file mode 100644 index 00000000..9709e6b3 --- /dev/null +++ b/operatorspy/tests/rearrange.py @@ -0,0 +1,181 @@ +import ctypes +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + + +class RerrangeDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor) + + +def test( + lib, + handle, + torch_device, + x_shape, + x_stride, + y_shape, + y_stride, + x_dtype=torch.float16, +): + print( + f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}" + ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) + y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + + descriptor = infiniopRearrangeDescriptor_t() + check_error( + lib.infiniopCreateRearrangeDescriptor( + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None) + ) + assert torch.allclose(x, y, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_bang(lib, test_cases): + import torch_mlu + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_maca(lib, 
test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "musa", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "musa", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +if __name__ == "__main__": + args = get_args() + test_cases = [ + # ((src_shape, src_stride), (dst_shape, dst_stride)) + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), + ] + lib = open_lib() + lib.infiniopCreateRearrangeDescriptor.restype = c_int32 + lib.infiniopCreateRearrangeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRearrangeDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopRearrange.restype = c_int32 + lib.infiniopRearrange.argtypes = [ + infiniopRearrangeDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 + lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reform.py b/operatorspy/tests/reform.py deleted file mode 100644 index d671c003..00000000 --- a/operatorspy/tests/reform.py +++ /dev/null @@ -1,91 +0,0 @@ -import ctypes -from ctypes import c_float, POINTER, c_void_p -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, -) - -from operatorspy.tests.test_utils import get_args -import torch -import time - - -def test(lib, descriptor, torch_device, x = None): - if x is None: - x = torch.rand((10, 10), dtype=torch.float16).to(torch_device) - else: - x = x.to(torch_device) - y = torch.zeros((5, 5), dtype=torch.float16).to(torch_device) - - lib.reform(descriptor, to_tensor(y, lib), to_tensor(x, lib, [5, 5], [20, 2]), None) - - return x, y - -def test_cpu(lib): - device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createReformDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyReformDescriptor(descriptor) - print("Test passed!") - -def run_cpu(lib): - device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createReformDescriptor(device, config) - x, ans = test(lib, 
descriptor, "cpu") - lib.destroyReformDescriptor(descriptor) - return x, ans - -def test_cuda(lib): - device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createReformDescriptor(device, config) - - # compare with cpu results - x, cpu_ans = run_cpu(lib) - _, cuda_ans = test(lib, descriptor, "cuda", x) - - assert torch.allclose(cuda_ans.cpu(), cpu_ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyReformDescriptor(descriptor) - -def test_bang(lib): - import torch_mlu - device = DeviceEnum.DEVICE_BANG - descriptor = lib.createReformDescriptor(device, None) - - # compare with cpu results - x, cpu_ans = run_cpu(lib) - _, bang_ans = test(lib, descriptor, "mlu", x) - - assert torch.allclose(bang_ans.cpu(), cpu_ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyReformDescriptor(descriptor) - - -if __name__ == "__main__": - args = get_args() - lib = open_lib() - lib.createReformDescriptor.restype = c_void_p - lib.destroyReformDescriptor.argtypes = [c_void_p] - lib.reform.argtypes = [ - c_void_p, - CTensor, - CTensor, - c_void_p, - ] - if args.cpu: - test_cpu(lib) - if args.cuda: - test_cuda(lib) - if args.bang: - test_bang(lib) diff --git a/operatorspy/tests/relu.py b/operatorspy/tests/relu.py new file mode 100644 index 00000000..b99706ff --- /dev/null +++ b/operatorspy/tests/relu.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +class ReluDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReluDescriptor_t = POINTER(ReluDescriptor) + + +def relu(x): + if PROFILE: + ans = torch.nn.functional.relu(x).to(x.dtype) + torch.cuda.synchronize() + return ans + return torch.nn.functional.relu(x).to(x.dtype) + + +def test( + lib, + handle, + torch_device, + tensor_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = relu(x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = relu(x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor + descriptor = infiniopReluDescriptor_t() + + check_error( + lib.infiniopCreateReluDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + 
x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReluDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "musa", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "musa", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), Inplace.OUT_OF_PLACE), + ((), Inplace.INPLACE_X), + ((1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), Inplace.INPLACE_X), + ((32, 20, 512), Inplace.INPLACE_X), + ((33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReluDescriptor.restype = c_int32 + lib.infiniopCreateReluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReluDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopRelu.restype = c_int32 + lib.infiniopRelu.argtypes = [ + infiniopReluDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReluDescriptor.restype = c_int32 + lib.infiniopDestroyReluDescriptor.argtypes = [ + infiniopReluDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py index 2442376d..46b1d0f3 100644 --- a/operatorspy/tests/rms_norm.py +++ b/operatorspy/tests/rms_norm.py @@ -1,4 +1,5 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import ctypes import sys import os @@ -6,13 +7,24 @@ from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, ) from operatorspy.tests.test_utils import get_args import torch +class RMSNormDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) def rms_norm(x, w, eps): input_dtype = x.dtype @@ -22,61 +34,156 @@ def rms_norm(x, w, eps): return w * hidden_states.to(input_dtype) -def test(lib, descriptor, torch_device): - y = torch.zeros((16, 13312), dtype=torch.float16).to(torch_device) - x = torch.rand((16, 2048), dtype=torch.float16).to(torch_device) - w = torch.ones((2048,), dtype=torch.float16).to(torch_device) +def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16): + print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}") + + y = torch.zeros(y_shape, dtype=dtype).to(torch_device) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) + w = torch.ones(w_shape, dtype=w_dtype).to(torch_device) eps = 1e-5 ans = rms_norm(x, w, eps) - lib.rmsNorm( - descriptor, to_tensor(y, lib, [16, 2048], [26624, 2]), to_tensor(x, lib), to_tensor(w, lib), eps, None + + y_tensor = to_tensor(y, lib) + x_tensor = to_tensor(x, lib) + w_tensor = to_tensor(w, lib) + + descriptor = infiniopRMSNormDescriptor_t() + w_dataType = 0 if w_dtype==torch.float16 else 1 + + check_error( + lib.infiniopCreateRMSNormDescriptor( + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, + w_tensor.descriptor, eps + ) ) - # print(ans) - # print("=======================================================") - # print(y[:, :2048]) - assert torch.allclose(y[:, :2048], ans, atol=1e-3, rtol=1e-3) - print("Test passed!") + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + w_tensor.descriptor.contents.invalidate() + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRMSNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = create_workspace(workspace_size.value, y.device) + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) -def test_cpu(lib): - device = DeviceEnum.DEVICE_CPU - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroyRMSNormDescriptor(descriptor) + assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroyRMSNormDescriptor(descriptor) + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + 
test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroyRMSNormDescriptor(descriptor) + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "musa", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # y_shape, x_shape, w_shape, dtype, w_dtype + ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), + ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), + ] args = get_args() lib = open_lib() - lib.createRMSNormDescriptor.restype = c_void_p - lib.destroyRMSNormDescriptor.argtypes = [c_void_p] - lib.rmsNorm.argtypes = [ - c_void_p, - CTensor, - CTensor, - CTensor, + lib.infiniopCreateRMSNormDescriptor.restype = c_int32 + lib.infiniopCreateRMSNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRMSNormDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, c_float, + ] + + lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32 + lib.infiniopGetRMSNormWorkspaceSize.argtypes = [ + infiniopRMSNormDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopRMSNorm.restypes = c_int32 + lib.infiniopRMSNorm.argtypes = [ + infiniopRMSNormDescriptor_t, c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRMSNormDescriptor.restype = c_int32 + lib.infiniopDestroyRMSNormDescriptor.argtypes = [ + infiniopRMSNormDescriptor_t, ] + if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rotary_embedding.py b/operatorspy/tests/rotary_embedding.py index bfa4d8db..1c1122a6 100644 --- a/operatorspy/tests/rotary_embedding.py +++ b/operatorspy/tests/rotary_embedding.py @@ -1,20 +1,35 @@ import ctypes -from ctypes import c_float, POINTER, c_void_p +from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref import sys import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 
from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, + U64, ) from operatorspy.tests.test_utils import get_args import torch +class RoPEDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor) + + def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): ndim = x.ndim assert 0 <= 1 < ndim @@ -30,79 +45,204 @@ def rotary_embedding(t, pos, theta, torch_device): ) freqs = torch.outer(pos, freqs) freqs_cis = torch.polar(torch.ones_like(freqs), freqs) - t_ = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2)) freqs_cis = reshape_for_broadcast(freqs_cis, t_) t_out = torch.view_as_real(t_ * freqs_cis).flatten(2).to(t.dtype) return t_out +def sin_cos_table(max_seq_len, dim, torch_device, theta): + pos = torch.arange( + 0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device) + ) + freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to( + torch_device + ) + # (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2) + freqs = torch.repeat_interleave(freqs, repeats=2) + angles = torch.outer(pos, freqs) + return torch.sin(angles), torch.cos(angles) -def test(lib, descriptor, torch_device): - t = torch.rand((1, 32, 128), dtype=torch.float16).to(torch_device) - pos = torch.ones((1,), dtype=torch.int32).to(torch_device) - theta = 1e4 - ans = rotary_embedding(t, pos, theta, torch_device) - lib.rotaryEmbedding( - descriptor, to_tensor(t, lib), to_tensor(pos, lib), c_float(theta, lib), None +def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): + print( + f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) - assert torch.allclose(t, ans, atol=1, rtol=1e-3) - print("Test passed!") + t = torch.rand(shape, dtype=dtype) + if strides is not None: + t = rearrange_tensor(t, strides) + posTmp = torch.arange(0, t.shape[0]) + pos = torch.zeros(2 * posTmp.shape[0], dtype = torch.int32) + for i in range(posTmp.shape[0]): + pos[2 * i] = posTmp[i] + pos[2 * i + 1] = 0 + theta = 1e4 + if torch_device == 'mlu' or torch_device == 'npu' or torch_device == 'musa': + ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + pos = pos.to(torch_device) + t = t.to(torch_device) + elif torch_device == 'maca': + ans = rotary_embedding(t, posTmp, theta, "cpu").to('cuda') + pos = pos.to('cuda') + t = t.to('cuda') + else: + t = t.to(torch_device) + pos = pos.to(torch_device) + ans = rotary_embedding(t, posTmp.to(torch_device), theta, torch_device) + + descriptor = infiniopRoPEDescriptor_t() + # 2x table length for test + sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) + t_tensor = to_tensor(t, lib) + pos_tensor = to_tensor(pos[: t.shape[0]], lib) + pos_tensor.descriptor.contents.dt = U64 + sin_table_tensor = to_tensor(sin_table, lib) + cos_table_tensor = to_tensor(cos_table, lib) + + if torch_device == "npu": + torch.npu.synchronize() + + check_error( + lib.infiniopCreateRoPEDescriptor( + handle, + byref(descriptor), + t_tensor.descriptor, + pos_tensor.descriptor, + sin_table_tensor.descriptor, + cos_table_tensor.descriptor, + ) + ) + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + t_tensor.descriptor.contents.invalidate() + pos_tensor.descriptor.contents.invalidate() + 
sin_table_tensor.descriptor.contents.invalidate() + cos_table_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, t.device) + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + check_error(lib.infiniopDestroyRoPEDescriptor(descriptor)) -def test_cpu(lib): + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "cpu", shape, strides, dtype) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - test(lib, descriptor, "cuda") - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "cuda", shape, strides, dtype) + destroy_handle(lib, handle) + -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - - # Note: BANG does not support complex calculation, compare with cpu results - t = torch.rand((1, 32, 128), dtype=torch.float16) - pos = torch.ones((1,), dtype=torch.int32) - theta = 1e4 - ans = rotary_embedding(t, pos, theta, "cpu") - - t = t.to("mlu") - pos = pos.to("mlu") - lib.rotaryEmbedding( - descriptor, to_tensor(t, lib), to_tensor(pos, lib), c_float(theta), None - ) - assert torch.allclose(t.cpu(), ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "mlu", shape, strides, dtype) + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases) : + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "npu", shape, strides, dtype) + destroy_handle(lib, handle) + +def test_maca(lib, test_cases) : + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "maca", shape, strides, dtype) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases) : + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "musa", shape, strides, dtype) + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + ((1, 32, 128), None, torch.float16), + ((1, 32, 64), None, torch.float16), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None, torch.float16), + ((1, 32, 128), None, torch.float16), + + ((3, 32, 128), (8000, 200, 1), torch.float16), + ] args = get_args() lib = open_lib() - lib.createRotaryEmbeddingDescriptor.restype = c_void_p - lib.destroyRotaryEmbeddingDescriptor.argtypes = 
[c_void_p] - lib.rotaryEmbedding.argtypes = [ + lib.infiniopCreateRoPEDescriptor.restype = c_int32 + lib.infiniopCreateRoPEDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRoPEDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoPEWorkspaceSize.restype = c_int32 + lib.infiniopGetRoPEWorkspaceSize.argtypes = [ + infiniopRoPEDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopRoPE.restype = c_int32 + lib.infiniopRoPE.argtypes = [ + infiniopRoPEDescriptor_t, + c_void_p, + c_uint64, + c_void_p, c_void_p, - CTensor, - CTensor, - c_float, c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoPEDescriptor.restype = c_int32 + lib.infiniopDestroyRoPEDescriptor.argtypes = [ + infiniopRoPEDescriptor_t, ] if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index 1be3c437..9ca07c14 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -1,4 +1,5 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes import sys import os @@ -8,61 +9,318 @@ to_tensor, CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, ) from operatorspy.tests.test_utils import get_args import torch -def swiglu(gate, up): - return up * torch.nn.functional.silu(gate).to(gate.dtype) +class SwiGLUDescriptor(Structure): + _fields_ = [("device", c_int32)] -def test(lib, descriptor, torch_device): - gate = torch.rand((13, 4), dtype=torch.float16).to(torch_device) - up = torch.rand((13, 4), dtype=torch.float16).to(torch_device) - ans = swiglu(gate, up) - lib.swiglu(descriptor, to_tensor(gate, lib), to_tensor(up, lib), None) - assert torch.allclose(gate, ans, atol=1e-3, rtol=1e-3) - print("Test passed!") +infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor) -def test_cpu(lib): + +def swiglu(a, b): + + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) + +def test_out_of_place( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" + ) + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + c = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None: + c = rearrange_tensor(c, c_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # 
Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + print("out-of-place Test passed!") + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_in_place1( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + a_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) + print("in-place1 Test passed!") + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_in_place2( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + b_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - descriptor = lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroySwigluDescriptor(descriptor) + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) - descriptor = 
lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroySwigluDescriptor(descriptor) + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + destroy_handle(lib, handle) -def test_bang(lib): + +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroySwigluDescriptor(descriptor) + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize + ) + test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize) + test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype) + test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "musa", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "musa", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "musa", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # shape, a_stride, b_stride, c_stride, dtype + ((13, 4), None, None, None, torch.float16), + ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16), + ((16, 5632), None, None, None, torch.float16), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16), + ] args = get_args() lib = open_lib() - lib.createSwigluDescriptor.restype = c_void_p - lib.destroySwigluDescriptor.argtypes = [c_void_p] - lib.swiglu.argtypes = [ + + lib.infiniopCreateSwiGLUDescriptor.restype = c_int32 + lib.infiniopCreateSwiGLUDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopSwiGLUDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopSwiGLU.restype = c_int32 + lib.infiniopSwiGLU.argtypes = [ + infiniopSwiGLUDescriptor_t, + c_void_p, + c_void_p, c_void_p, - CTensor, - CTensor, c_void_p, ] + + lib.infiniopDestroySwiGLUDescriptor.restype = c_int32 + lib.infiniopDestroySwiGLUDescriptor.argtypes = [ + infiniopSwiGLUDescriptor_t, + ] + if args.cpu: - test_cpu(lib) 
+ test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/test_utils.py b/operatorspy/tests/test_utils.py index 9a75d15b..6e4960d5 100644 --- a/operatorspy/tests/test_utils.py +++ b/operatorspy/tests/test_utils.py @@ -2,6 +2,11 @@ def get_args(): import argparse parser = argparse.ArgumentParser(description="Test Operator") + parser.add_argument( + "--profile", + action="store_true", + help="Whether profile tests", + ) parser.add_argument( "--cpu", action="store_true", @@ -17,5 +22,30 @@ def get_args(): action="store_true", help="Run BANG test", ) + parser.add_argument( + "--ascend", + action="store_true", + help="Run ASCEND NPU test", + ) + parser.add_argument( + "--maca", + action="store_true", + help="Run ASCEND NPU test", + ) + parser.add_argument( + "--musa", + action="store_true", + help="Run MUSA test", + ) return parser.parse_args() + + +def synchronize_device(torch_device): + import torch + if torch_device == "cuda": + torch.cuda.synchronize() + elif torch_device == "npu": + torch.npu.synchronize() + elif torch_device == "mlu": + torch.mlu.synchronize() diff --git a/operatorspy/utils.py b/operatorspy/utils.py new file mode 100644 index 00000000..bb095658 --- /dev/null +++ b/operatorspy/utils.py @@ -0,0 +1,110 @@ +import ctypes +from .data_layout import * +from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t + + +def check_error(status): + if status != 0: + raise Exception("Error code " + str(status)) + + +def to_tensor(tensor, lib): + """ + Convert a PyTorch tensor to a library Tensor(descriptor, data). + """ + import torch + + ndim = tensor.ndimension() + shape = (ctypes.c_uint64 * ndim)(*tensor.shape) + strides = (ctypes.c_int64 * ndim)(*(tensor.stride())) + data_ptr = tensor.data_ptr() + # fmt: off + dt = ( + I8 if tensor.dtype == torch.int8 else + I16 if tensor.dtype == torch.int16 else + I32 if tensor.dtype == torch.int32 else + I64 if tensor.dtype == torch.int64 else + U8 if tensor.dtype == torch.uint8 else + F16 if tensor.dtype == torch.float16 else + BF16 if tensor.dtype == torch.bfloat16 else + F32 if tensor.dtype == torch.float32 else + F64 if tensor.dtype == torch.float64 else + # TODO: These following types may not be supported by older + # versions of PyTorch. 
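+        # (on such builds, accessing torch.uint16/uint32/uint64 raises AttributeError)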
+ U16 if tensor.dtype == torch.uint16 else + U32 if tensor.dtype == torch.uint32 else + U64 if tensor.dtype == torch.uint64 else + None + ) + # fmt: on + assert dt is not None + # Create TensorDecriptor + tensor_desc = infiniopTensorDescriptor_t() + lib.infiniopCreateTensorDescriptor( + ctypes.byref(tensor_desc), ndim, shape, strides, dt + ) + # Create Tensor + return CTensor(tensor_desc, data_ptr) + +def create_workspace(size, torch_device): + if size == 0: + return None + import torch + if (torch_device == 'maca'): + return torch.zeros(size=(size,), dtype=torch.uint8, device='cuda') + return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device) + +def create_handle(lib, device, id=0): + handle = infiniopHandle_t() + check_error(lib.infiniopCreateHandle(ctypes.byref(handle), device, id)) + return handle + + +def destroy_handle(lib, handle): + check_error(lib.infiniopDestroyHandle(handle)) + + +def rearrange_tensor(tensor, new_strides): + """ + Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides. + """ + import torch + + shape = tensor.shape + + new_size = [0] * len(shape) + left = 0 + right = 0 + for i in range(len(shape)): + if new_strides[i] > 0: + new_size[i] = (shape[i] - 1) * new_strides[i] + 1 + right += new_strides[i] * (shape[i] - 1) + else: # TODO: Support negative strides in the future + # new_size[i] = (shape[i] - 1) * (-new_strides[i]) + 1 + # left += new_strides[i] * (shape[i] - 1) + raise ValueError("Negative strides are not supported yet") + + # Create a new tensor with zeros + new_tensor = torch.zeros( + (right - left + 1,), dtype=tensor.dtype, device=tensor.device + ) + + # Generate indices for original tensor based on original strides + indices = [torch.arange(s) for s in shape] + mesh = torch.meshgrid(*indices, indexing="ij") + + # Flatten indices for linear indexing + linear_indices = [m.flatten() for m in mesh] + + # Calculate new positions based on new strides + new_positions = sum( + linear_indices[i] * new_strides[i] for i in range(len(shape)) + ).to(tensor.device) + offset = -left + new_positions += offset + + # Copy the original data to the new tensor + new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1)) + new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides)) + + return new_tensor diff --git a/src/devices/ascend/CMakeLists.txt b/src/devices/ascend/CMakeLists.txt new file mode 100644 index 00000000..8cc7f7f8 --- /dev/null +++ b/src/devices/ascend/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.16.0) + +# project information +project(Ascend_C) +set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory") +set(RUN_MODE "npu" CACHE STRING "run mode: npu") +set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR 
${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") +endif() + +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascend_kernels STATIC + ../../ops/swiglu/ascend/swiglu_kernel.cpp + ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp + ../../ops/random_sample/ascend/random_sample_kernel.cpp +) + diff --git a/src/devices/ascend/Makefile b/src/devices/ascend/Makefile new file mode 100644 index 00000000..7af26076 --- /dev/null +++ b/src/devices/ascend/Makefile @@ -0,0 +1,10 @@ +.PHONY: build clean + +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MKFILE_DIR := $(dir $(MKFILE_PATH)) + +build: + mkdir -p build && cd build && cmake .. && make -j8 + +clean: + rm -rf build diff --git a/src/devices/ascend/ascend_handle.cc b/src/devices/ascend/ascend_handle.cc new file mode 100644 index 00000000..84b31fd5 --- /dev/null +++ b/src/devices/ascend/ascend_handle.cc @@ -0,0 +1,23 @@ +#include "ascend_handle.h" + +infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id) { + uint32_t device_count; + aclrtGetDeviceCount(&device_count); + if (device_id >= static_cast(device_count)) { + return STATUS_BAD_DEVICE; + } + + auto ret = aclrtSetDevice(device_id); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret)); + + *handle_ptr = new AscendContext{DevAscendNpu, device_id}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr) { + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/ascend/ascend_handle.h b/src/devices/ascend/ascend_handle.h new file mode 100644 index 00000000..fbbeb824 --- /dev/null +++ b/src/devices/ascend/ascend_handle.h @@ -0,0 +1,23 @@ +#ifndef ASCEND_HANDLE_H +#define ASCEND_HANDLE_H + +#include "common_ascend.h" +#include "device.h" +#include "status.h" +#include +#include +#include +#include +#include + +struct AscendContext { + Device device; + int device_id; +}; +typedef struct AscendContext *AscendHandle_t; + +infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr); + +#endif diff --git a/src/devices/ascend/common_ascend.cc b/src/devices/ascend/common_ascend.cc new file mode 100644 index 00000000..fe988e5d --- /dev/null +++ b/src/devices/ascend/common_ascend.cc @@ -0,0 +1,145 @@ +#include "common_ascend.h" + +int64_t numElements(const int64_t *shape, int64_t num) { + int64_t numEle = 1; + for (int i = 0; i < num; i++) { + numEle *= shape[i]; + } + return numEle; +} + +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) { + *workspaceAddr = nullptr; + if (workspaceSize > 0) { + auto ret = aclrtMalloc(workspaceAddr, workspaceSize, + ACL_MEM_MALLOC_HUGE_FIRST); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtMalloc failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t freeWorkspace(void *workspaceAddr) { + if (workspaceAddr != nullptr) { + auto ret = aclrtFree(workspaceAddr); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; +} + +aclDataType toAclDataType(DT dt) { + if (dt == I8) + return aclDataType::ACL_INT8; + else if (dt == I16) + return aclDataType::ACL_INT16; + else if (dt == I32) + return aclDataType::ACL_INT32; + else if (dt == I64) + return aclDataType::ACL_INT64; + else if (dt == U8) + return aclDataType::ACL_UINT8; + else if (dt == U16) + return aclDataType::ACL_UINT16; + else if (dt == U32) + return aclDataType::ACL_UINT32; + else if (dt == U64) + return aclDataType::ACL_UINT64; + else if (dt == F16) + return aclDataType::ACL_FLOAT16; + else if (dt == BF16) + return aclDataType::ACL_BF16; + else if (dt == F32) + return aclDataType::ACL_FLOAT; + else if (dt == F64) + return aclDataType::ACL_DOUBLE; + else + return aclDataType::ACL_DT_UNDEFINED; +} + + +const char *dataTypeToString(aclDataType dtype) { + switch (dtype) { + case ACL_DT_UNDEFINED: + return "ACL_DT_UNDEFINED"; + case ACL_FLOAT: + return "ACL_FLOAT"; + case ACL_FLOAT16: + return "ACL_FLOAT16"; + case ACL_INT8: + return "ACL_INT8"; + case ACL_INT32: + return "ACL_INT32"; + case ACL_UINT8: + return "ACL_UINT8"; + case ACL_INT16: + return "ACL_INT16"; + case ACL_UINT16: + return "ACL_UINT16"; + case ACL_UINT32: + return "ACL_UINT32"; + case ACL_INT64: + return "ACL_INT64"; + case ACL_UINT64: + return "ACL_UINT64"; + case ACL_DOUBLE: + return "ACL_DOUBLE"; + case ACL_BOOL: + return "ACL_BOOL"; + case ACL_STRING: + return "ACL_STRING"; + case ACL_COMPLEX64: + return "ACL_COMPLEX64"; + case ACL_COMPLEX128: + return "ACL_COMPLEX128"; + case ACL_BF16: + return "ACL_BF16"; + case ACL_INT4: + return "ACL_INT4"; + case ACL_UINT1: + return "ACL_UINT1"; + case ACL_COMPLEX32: + return "ACL_COMPLEX32"; + default: + return "UNKNOWN"; + } +} + +const char *formatToString(aclFormat format) { + switch (format) { + case ACL_FORMAT_UNDEFINED: + return "ACL_FORMAT_UNDEFINED"; + case ACL_FORMAT_NCHW: + return "ACL_FORMAT_NCHW"; + case ACL_FORMAT_NHWC: + return "ACL_FORMAT_NHWC"; + case ACL_FORMAT_ND: + return "ACL_FORMAT_ND"; + case ACL_FORMAT_NC1HWC0: + return "ACL_FORMAT_NC1HWC0"; + case ACL_FORMAT_FRACTAL_Z: + return "ACL_FORMAT_FRACTAL_Z"; + case ACL_FORMAT_NC1HWC0_C04: + return "ACL_FORMAT_NC1HWC0_C04"; + case ACL_FORMAT_HWCN: + return "ACL_FORMAT_HWCN"; + case ACL_FORMAT_NDHWC: + return "ACL_FORMAT_NDHWC"; + case ACL_FORMAT_FRACTAL_NZ: + return "ACL_FORMAT_FRACTAL_NZ"; + case ACL_FORMAT_NCDHW: + return "ACL_FORMAT_NCDHW"; + case ACL_FORMAT_NDC1HWC0: + return "ACL_FORMAT_NDC1HWC0"; + case ACL_FRACTAL_Z_3D: + return "ACL_FRACTAL_Z_3D"; + case ACL_FORMAT_NC: + return "ACL_FORMAT_NC"; + case ACL_FORMAT_NCL: + return "ACL_FORMAT_NCL"; + default: + return "UNKNOWN"; + } +} diff --git a/src/devices/ascend/common_ascend.h b/src/devices/ascend/common_ascend.h new file mode 100644 index 00000000..9b23fd35 --- /dev/null +++ b/src/devices/ascend/common_ascend.h @@ -0,0 +1,41 @@ +#ifndef __COMMON_ASCEND_H__ +#define __COMMON_ASCEND_H__ + +#include "operators.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define CHECK_RET(cond, return_expr) \ + do { \ + if (!(cond)) { \ + return_expr; \ + } \ + } while (0) + +#define 
LOG_PRINT(message, ...) \ + do { \ + printf(message, ##__VA_ARGS__); \ + } while (0) + +#ifdef __cplusplus +}; +#endif + +int64_t numElements(const int64_t *shape, int64_t num); +const char *dataTypeToString(aclDataType dtype); +const char *formatToString(aclFormat format); +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize); +infiniopStatus_t freeWorkspace(void *workspaceAddr); +aclDataType toAclDataType(DT dt); + +#endif diff --git a/src/devices/ascend/tensor_aclnn.cc b/src/devices/ascend/tensor_aclnn.cc new file mode 100644 index 00000000..0a0fad74 --- /dev/null +++ b/src/devices/ascend/tensor_aclnn.cc @@ -0,0 +1,137 @@ +#include "tensor_aclnn.h" +#include "../../ops/utils.h" +#include + +infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides) { + if (shape.size() != strides.size()) { + return STATUS_BAD_PARAM; + } + this->ndim = shape.size(); + this->shape = std::vector(shape); + this->strides = std::vector(strides); + this->dataType = dtype; + + // Set format + // TODO: Support other format + aclFormat format = aclFormat::ACL_FORMAT_ND; + this->format = format; + + CHECK_STATUS(this->inferStorageShape(), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + + +/// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size. +/// We don't see why higher dimensional storage shape is ever needed. To change if necesary. +infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() { + auto index = std::max_element(this->strides.begin(), this->strides.end()); + uint64_t max_stride_index = std::distance(this->strides.begin(), index); + this->storageNdim = 1; + this->storageShape = std::vector({this->shape[max_stride_index] * this->strides[max_stride_index]}); + + return STATUS_SUCCESS; +} + +/// @brief Set aclnnTensorDescriptor from infiniopTensorDescriptor +/// @param y infiniopTensorDescriptor +/// @return infiniopStatus_t +infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // Cast shape type + auto shape = std::vector(ndim); + auto strides = std::vector(ndim); + for (uint64_t i = 0; i < ndim; ++i) { + shape[i] = static_cast(y->shape[i]); + strides[i] = y->strides[i]; + } + return setDescriptor(toAclDataType(y->dt), shape, strides); +} + +/// @brief Wrapper of aclCreateTensor. Create aclTensor. +/// See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha001/apiref/appdevgapi/aclcppdevg_03_0168.html +/// @param desc Alias of aclnnTensorDescriptor*. +/// @param data Data ptr on device global mem. +/// @param tensor Pointer of pointer of aclTensor. +/// @return +infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) { + if (this->t) { + return STATUS_SUCCESS; + } + this->t = aclCreateTensor(this->shape.data(), + this->ndim, + this->dataType, + this->strides.data(), + this->offset, + this->format, + this->storageShape.data(), + this->storageNdim, + data); + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnTensorDescriptor::destroyTensor() { + auto ret = aclDestroyTensor(this->t); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclDesctroyTensor failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + t = nullptr; + + return STATUS_SUCCESS; +} + +aclnnTensorDescriptor::~aclnnTensorDescriptor() { + if (this->t) { + destroyTensor(); + } +} + +/// @brief TensorDescriptor's string info +/// @param desc Alias of aclnnTensorDescriptor*. 
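+/// @note The returned buffer is allocated with malloc(); the caller is responsible for free()ing it.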
+/// @return String of aclnnTensorDescriptor. +char *aclnnTensorDescriptor::toString() { + + // Assume bufferSize + size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40; + char *buffer = (char *) malloc(bufferSize); + if (!buffer) return NULL; + + // Write info into buffer + char *ptr = buffer; + ptr += sprintf(ptr, "ndim: %" PRId64 "\n", this->ndim); + + ptr += sprintf(ptr, "shape: ["); + for (uint64_t i = 0; i < this->ndim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->shape[i]); + if (i < this->ndim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "stride: ["); + for (uint64_t i = 0; i < this->ndim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->strides[i]); + if (i < this->ndim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "offset: %" PRId64 "\n", this->offset); + ptr += sprintf(ptr, "dataType: %s\n", dataTypeToString(this->dataType)); + ptr += sprintf(ptr, "format: %s\n", formatToString(this->format)); + + ptr += sprintf(ptr, "storageShape: ["); + for (int64_t i = 0; i < this->storageNdim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->storageShape[i]); + if (i < this->storageNdim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "storageNdim: %" PRId64 "\n", this->storageNdim); + + return buffer; +} diff --git a/src/devices/ascend/tensor_aclnn.h b/src/devices/ascend/tensor_aclnn.h new file mode 100644 index 00000000..4aa25074 --- /dev/null +++ b/src/devices/ascend/tensor_aclnn.h @@ -0,0 +1,41 @@ +#ifndef __ACLNN_TENSOR__ +#define __ACLNN_TENSOR__ + +#include "./common_ascend.h" +#include "operators.h" +#include "tensor.h" +#include "tensor/tensor_descriptor.h" +#include +#include +#include +#include + +// Aclnn tensor descriptor, +// used to build aclTensor +struct aclnnTensorDescriptor { + uint64_t ndim; + std::vector shape; + std::vector strides; + int64_t offset; + aclDataType dataType; + aclFormat format; + std::vector storageShape; + int64_t storageNdim; + + aclTensor *t; + + // Transfer from infiniOp DT to aclDataType + infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides); + infiniopStatus_t inferStorageShape(); + // Convert form InfiniOpTensorDescriptor + infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc); + infiniopStatus_t createTensor(void *data = nullptr); + infiniopStatus_t destroyTensor(); + ~aclnnTensorDescriptor(); + + char *toString(); +}; + +typedef aclnnTensorDescriptor *aclnnTensorDescriptor_t; + +#endif diff --git a/src/devices/bang/bang_handle.cc b/src/devices/bang/bang_handle.cc new file mode 100644 index 00000000..1625181e --- /dev/null +++ b/src/devices/bang/bang_handle.cc @@ -0,0 +1,21 @@ +#include "bang_handle.h" + +infiniopStatus_t createBangHandle(BangHandle_t *handle_ptr, int device_id) { + unsigned int device_count; + cnrtGetDeviceCount(&device_count); + if (device_id >= static_cast(device_count)) { + return STATUS_BAD_DEVICE; + } + + auto pool = std::make_shared>(); + if (cnrtSetDevice(device_id) != cnrtSuccess){ + return STATUS_BAD_DEVICE; + } + cnnlHandle_t handle; + cnnlCreate(&handle); + pool->push(std::move(handle)); + + *handle_ptr = new BangContext{DevCambriconMlu, device_id, std::move(pool)}; + + return STATUS_SUCCESS; +} diff --git a/src/devices/bang/bang_handle.h b/src/devices/bang/bang_handle.h new file mode 100644 index 00000000..cc149678 --- /dev/null +++ b/src/devices/bang/bang_handle.h @@ -0,0 
+1,32 @@ +#ifndef BANG_HANDLE_H +#define BANG_HANDLE_H + +#include "../pool.h" +#include "cnnl.h" +#include "cnrt.h" +#include "device.h" +#include "status.h" +#include + +struct BangContext { + Device device; + int device_id; + std::shared_ptr> cnnl_handles; +}; +typedef struct BangContext *BangHandle_t; + +infiniopStatus_t createBangHandle(BangHandle_t *handle_ptr, int device_id); + +template +void use_cnnl(std::shared_ptr> &pool, int device_id, cnrtQueue_t queue, T const &f) { + auto handle = pool->pop(); + if (!handle) { + cnrtSetDevice(device_id); + cnnlCreate(&(*handle)); + } + cnnlSetQueue(*handle, (cnrtQueue_t) queue); + f(*handle); + pool->push(std::move(*handle)); +} + +#endif diff --git a/src/devices/bang/common_bang.h b/src/devices/bang/common_bang.h index 6be9bfc3..b855a41f 100644 --- a/src/devices/bang/common_bang.h +++ b/src/devices/bang/common_bang.h @@ -3,13 +3,14 @@ #include "cnnl.h" #include "tensor.h" +#include "../../ops/utils.h" #include const int NRAM_MAX_SIZE = 1024 * 256;//the maximum NRAM memory is 1024 * 768 const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024; // set cnnl tensor descriptor without strides11 -inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorLayout* layout) { +inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) { std::vector dims(layout->ndim); for (uint64_t i = 0; i < layout->ndim; i++) { dims[i] = static_cast(layout->shape[i]); @@ -19,7 +20,7 @@ inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorLayout* layou } // set cnnl tensor descriptor with strides -inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorLayout *layout) { +inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) { std::vector dim_size(layout->ndim), dim_stride(layout->ndim); for (uint64_t i = 0; i < layout->ndim; i++) { dim_size[i] = static_cast(layout->shape[i]); @@ -29,4 +30,26 @@ inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorLayout *lay dim_size.size(), dim_size.data(), dim_stride.data()); } -#endif // __COMMON_BANG_H__ +inline cnnlDataType_t cnnlDataTypeConvert(DT dataType) { + if (dtype_eq(dataType, F32)) { + return CNNL_DTYPE_FLOAT; + } else if (dtype_eq(dataType, F64)) { + return CNNL_DTYPE_DOUBLE; + } else if (dtype_eq(dataType, F16)) { + return CNNL_DTYPE_HALF; + } else if (dtype_eq(dataType, I8)) { + return CNNL_DTYPE_INT8; + } else if (dtype_eq(dataType, I32)) { + return CNNL_DTYPE_INT32; + } else if (dtype_eq(dataType, U8)) { + return CNNL_DTYPE_UINT8; + } else if (dtype_eq(dataType, BF16)) { + return CNNL_DTYPE_BFLOAT16; + } else if (dtype_eq(dataType, I64)) { + return CNNL_DTYPE_INT64; + } else { + return CNNL_DTYPE_INVALID; + } +} + +#endif// __COMMON_BANG_H__ diff --git a/src/devices/bang/handle_pool.cc b/src/devices/bang/handle_pool.cc deleted file mode 100644 index 4b712c1f..00000000 --- a/src/devices/bang/handle_pool.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include -#include "handle_pool.h" - -const Pool &get_cnnl_pool() { - int device_id; - cnrtGetDevice(&device_id); - static std::once_flag flag; - static std::vector> cnnl_pool; - std::call_once(flag, [&]() { - unsigned int device_count; - cnrtGetDeviceCount(&device_count); - for (auto i = 0; i < static_cast(device_count); i++) { - auto pool = Pool(); - cnnlHandle_t handle; - cnnlCreate(&handle); - pool.push(std::move(handle)); - cnnl_pool.emplace_back(std::move(pool)); - } - }); - return cnnl_pool[device_id]; -} diff --git a/src/devices/bang/handle_pool.h 
b/src/devices/bang/handle_pool.h deleted file mode 100644 index e30d8768..00000000 --- a/src/devices/bang/handle_pool.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __BANG_HANDLE_POOL_H__ -#define __BANG_HANDLE_POOL_H__ - -#include "cnnl.h" -#include "cnrt.h" -#include "../pool.h" - -const Pool &get_cnnl_pool(); - -template -void use_cnnl(cnrtQueue_t queue, T const &f) { - auto &pool = get_cnnl_pool(); - auto handle = pool.pop(); - if (!handle) { - cnnlCreate(&(*handle)); - } - cnnlSetQueue(*handle, (cnrtQueue_t) queue); - f(*handle); - pool.push(std::move(*handle)); -} - -#endif // __BANG_HANDLE_POOL_H__ diff --git a/src/devices/cpu/common_cpu.cc b/src/devices/cpu/common_cpu.cc index 13228dd4..7fb9e5d8 100644 --- a/src/devices/cpu/common_cpu.cc +++ b/src/devices/cpu/common_cpu.cc @@ -1,22 +1,97 @@ #include "common_cpu.h" -float f16_to_f32(uint16_t code) { - union { - uint32_t u32; - float f32; - } ans{0}; - ans.u32 = ((static_cast(code) << 16) & (1 << 31)) | - ((((code >> 10) & mask_low(5)) - 15 + 127) << 23) | - ((code & mask_low(10)) << 13); - return ans.f32; +float f16_to_f32(uint16_t h) { + uint32_t sign = (h & 0x8000) << 16; // Extract the sign bit + int32_t exponent = (h >> 10) & 0x1F;// Extract the exponent + uint32_t mantissa = h & 0x3FF; // Extract the mantissa (fraction part) + + if (exponent == 31) {// Special case for Inf and NaN + if (mantissa != 0) { + // NaN: Set float32 NaN + uint32_t f32 = sign | 0x7F800000 | (mantissa << 13); + return *(float *) &f32; + } else { + // Infinity + uint32_t f32 = sign | 0x7F800000; + return *(float *) &f32; + } + } else if (exponent == 0) {// Subnormal float16 or zero + if (mantissa == 0) { + // Zero (positive or negative) + uint32_t f32 = sign;// Just return signed zero + return *(float *) &f32; + } else { + // Subnormal: Convert to normalized float32 + exponent = -14; // Set exponent for subnormal numbers + while ((mantissa & 0x400) == 0) {// Normalize mantissa + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF;// Clear the leading 1 bit + uint32_t f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + return *(float *) &f32; + } + } else { + // Normalized float16 + uint32_t f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + return *(float *) &f32; + } } uint16_t f32_to_f16(float val) { - union { - float f32; - uint32_t u32; - } x{val}; - return (static_cast(x.u32 >> 16) & (1 << 15)) | - (((static_cast(x.u32 >> 23) - 127 + 15) & mask_low(5)) << 10) | - (static_cast(x.u32 >> 13) & mask_low(10)); + uint32_t f32 = *(uint32_t *) &val; // Read the bits of the float32 + uint16_t sign = (f32 >> 16) & 0x8000; // Extract the sign bit + int32_t exponent = ((f32 >> 23) & 0xFF) - 127;// Extract and de-bias the exponent + uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part) + + if (exponent >= 31) {// Special cases for Inf and NaN + // NaN + if (exponent == 128 && mantissa != 0) { + return sign | 0x7E00; + } + // Infinity + return sign | 0x7C00; + } else if (exponent >= -14) {// Normalized case + return sign | ((exponent + 15) << 10) | (mantissa >> 13); + } else if (exponent >= -24) { + mantissa |= 0x800000;// Add implicit leading 1 + mantissa >>= (-14 - exponent); + return sign | (mantissa >> 13); + } else { + // Too small for subnormal: return signed zero + return sign; + } +} + +uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + 
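+        // src_strides is expected to describe a contiguous (row-major) layout, so the
+        // division above recovers the coordinate along dimension i; the remainder kept
+        // below is the part of the flat index belonging to the lower dimensions.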
flat_index %= src_strides[i]; + } + return res; +} + +uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +uint64_t getPaddedSize(uint64_t ndim, uint64_t *shape, uint64_t const *pads) { + uint64_t total_size = 1; + for (size_t i = 0; i < ndim; ++i) { + total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]); + } + return total_size; +} + +void getPaddedShape(uint64_t ndim, uint64_t const *shape, uint64_t const *pads, uint64_t *padded_shape) { + memcpy(padded_shape, shape, ndim * sizeof(uint64_t)); + for (size_t i = 2; i < ndim; ++i) { + padded_shape[i] += 2 * pads[i - 2]; + } } diff --git a/src/devices/cpu/common_cpu.h b/src/devices/cpu/common_cpu.h index 20f1a2d8..c3139d69 100644 --- a/src/devices/cpu/common_cpu.h +++ b/src/devices/cpu/common_cpu.h @@ -3,6 +3,7 @@ #include #include +#include // return a mask with the specified number of low bits set to 1 constexpr static uint16_t mask_low(int bits) noexcept { @@ -15,4 +16,19 @@ float f16_to_f32(uint16_t code); // convert single-precision float to half-precision float uint16_t f32_to_f16(float val); -#endif // __COMMON_CPU_H__ +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides); + +// get the memory offset of the given element in a tensor given its flat index +uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides); + +/** + * get the total array size (element count) after applying padding for a + * ndim-ary tensor with the given shape + */ +uint64_t getPaddedSize(uint64_t ndim, uint64_t *shape, uint64_t const *pads); + +// calculate the padded shape and store the result in padded_shape +void getPaddedShape(uint64_t ndim, uint64_t const *shape, uint64_t const *pads, uint64_t *padded_shape); + +#endif// __COMMON_CPU_H__ diff --git a/src/devices/cpu/cpu_handle.cc b/src/devices/cpu/cpu_handle.cc new file mode 100644 index 00000000..fbbf09b7 --- /dev/null +++ b/src/devices/cpu/cpu_handle.cc @@ -0,0 +1,7 @@ +#include "device.h" +#include "cpu_handle.h" + +infiniopStatus_t createCpuHandle(CpuHandle_t* handle_ptr){ + *handle_ptr = new CpuContext{DevCpu}; + return STATUS_SUCCESS; +} diff --git a/src/devices/cpu/cpu_handle.h b/src/devices/cpu/cpu_handle.h new file mode 100644 index 00000000..1be72724 --- /dev/null +++ b/src/devices/cpu/cpu_handle.h @@ -0,0 +1,14 @@ +#ifndef CPU_HANDLE_H +#define CPU_HANDLE_H + +#include "device.h" +#include "status.h" + +struct CpuContext { + Device device; +}; +typedef struct CpuContext *CpuHandle_t; + +infiniopStatus_t createCpuHandle(CpuHandle_t *handle_ptr); + +#endif diff --git a/src/devices/cuda/common_cuda.h b/src/devices/cuda/common_cuda.h index a85e7994..d46d45c4 100644 --- a/src/devices/cuda/common_cuda.h +++ b/src/devices/cuda/common_cuda.h @@ -1,8 +1,92 @@ #ifndef __COMMON_CUDA_H__ #define __COMMON_CUDA_H__ +#ifdef ENABLE_SUGON_DCU +#define MAX_THREADS_PER_BLOCK 512 +#else #define MAX_THREADS_PER_BLOCK 1024 +#endif + #define MAX_WARP_PER_BLOCK 32 #define WARP_SIZE 32 -#endif // __COMMON_CUDA_H__ +#include + +#define checkCudaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != cudaSuccess) { \ + std::cerr << "CUDA error: " << 
cudaGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkCudaError(call) checkCudaErrorWithCode(call, STATUS_BAD_DEVICE) + +#define checkCudnnError(call) \ + do { \ + if (auto status = call; status != CUDNN_STATUS_SUCCESS) { \ + std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return STATUS_EXECUTION_FAILED; \ + } \ + } while (0) + +#include "data_type.h" +#include + +typedef struct DTCudnnMapping { + DT layout; + cudnnDataType_t cudnn_type; +} DTCudnnMapping; + +// DT cudnnDataType_t mapping table +const DTCudnnMapping dtMappings[] = { + {F16, CUDNN_DATA_HALF}, + {F32, CUDNN_DATA_FLOAT}, + {F64, CUDNN_DATA_DOUBLE}, + {BF16, CUDNN_DATA_BFLOAT16}, + {I8, CUDNN_DATA_INT8}, + {I32, CUDNN_DATA_INT32}, + {I64, CUDNN_DATA_INT64}, + {U8, CUDNN_DATA_UINT8}, +}; + +typedef struct DataLayoutMap { + int operator[](const DataLayout &layout) const { + for (const auto &mapping : dtMappings) { + if (mapping.layout == layout) { + return mapping.cudnn_type; + } + } + return -1; + } +} DTMap; + +constexpr DTMap dataTypeMap; + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif// __COMMON_CUDA_H__ diff --git a/src/devices/cuda/cuda_handle.cc b/src/devices/cuda/cuda_handle.cc new file mode 100644 index 00000000..7d7db662 --- /dev/null +++ b/src/devices/cuda/cuda_handle.cc @@ -0,0 +1,55 @@ +#include "cuda_handle.h" + +infiniopStatus_t createCudaHandle(CudaHandle_t *handle_ptr, int device_id) { + // Check if device_id is valid + int device_count; + cudaGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + // Create a new cublas handle pool + auto pool = std::make_shared>(); + if (cudaSetDevice(device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + cublasHandle_t handle; + cublasCreate(&handle); + pool->push(std::move(handle)); + + // create a cudnn handle pool + auto cudnn_pool = std::make_shared>(); + cudnnHandle_t cudnn_handle; + checkCudnnError(cudnnCreate(&cudnn_handle)); + cudnn_pool->push(std::move(cudnn_handle)); + + // set CUDA device property + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + + // set device compute capability numbers + int capability_major; + int capability_minor; + cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id); + cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id); + + *handle_ptr = new CudaContext{ + DevNvGpu, + device_id, + std::move(pool), + std::move(cudnn_pool), + std::move(prop), + capability_major, + capability_minor, + }; + + return 
STATUS_SUCCESS; +} + +infiniopStatus_t deleteCudaHandle(CudaHandle_t handle_ptr) { + handle_ptr->cublas_handles_t = nullptr; + handle_ptr->cudnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/cuda/cuda_handle.h b/src/devices/cuda/cuda_handle.h new file mode 100644 index 00000000..f935ed5f --- /dev/null +++ b/src/devices/cuda/cuda_handle.h @@ -0,0 +1,52 @@ +#ifndef CUDA_HANDLE_H +#define CUDA_HANDLE_H + +#include "../pool.h" +#include "common_cuda.h" +#include "device.h" +#include "status.h" +#include +#include +#include + +struct CudaContext { + Device device; + int device_id; + std::shared_ptr> cublas_handles_t; + std::shared_ptr> cudnn_handles_t; + cudaDeviceProp prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct CudaContext *CudaHandle_t; + +infiniopStatus_t createCudaHandle(CudaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteCudaHandle(CudaHandle_t handle_ptr); + +template +void use_cublas(std::shared_ptr> cublas_handles_t, int device_id, cudaStream_t stream, T const &f) { + auto handle = cublas_handles_t->pop(); + if (!handle) { + cudaSetDevice(device_id); + cublasCreate(&(*handle)); + } + cublasSetStream(*handle, (cudaStream_t) stream); + f(*handle); + cublas_handles_t->push(std::move(*handle)); +} + +template +cudnnStatus_t use_cudnn(std::shared_ptr> cudnn_handles_t, int device_id, cudaStream_t stream, T const &f) { + auto handle = cudnn_handles_t->pop(); + if (!handle) { + cudaSetDevice(device_id); + cudnnCreate(&(*handle)); + } + cudnnSetStream(*handle, stream); + cudnnStatus_t status = f(*handle); + cudnn_handles_t->push(std::move(*handle)); + return status; +} + +#endif diff --git a/src/devices/cuda/handle_pool.cc b/src/devices/cuda/handle_pool.cc deleted file mode 100644 index fe89340c..00000000 --- a/src/devices/cuda/handle_pool.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "handle_pool.h" -#include -#include - -const Pool &get_cublas_pool() { - int device_id; - cudaGetDevice(&device_id); - static std::once_flag flag; - static std::vector> cublas_pool; - std::call_once(flag, [&]() { - int device_count; - cudaGetDeviceCount(&device_count); - for (int i = 0; i < device_count; i++) { - auto pool = Pool(); - cublasHandle_t handle; - cublasCreate(&handle); - pool.push(std::move(handle)); - cublas_pool.emplace_back(std::move(pool)); - } - }); - return cublas_pool[device_id]; -} diff --git a/src/devices/cuda/handle_pool.h b/src/devices/cuda/handle_pool.h deleted file mode 100644 index 4165902b..00000000 --- a/src/devices/cuda/handle_pool.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __CUDA_HANDLE_POOL_H__ -#define __CUDA_HANDLE_POOL_H__ - -#include -#include "../pool.h" - -const Pool &get_cublas_pool(); - -template -void use_cublas(cudaStream_t stream, T const &f) { - auto &pool = get_cublas_pool(); - auto handle = pool.pop(); - if (!handle) { - cublasCreate(&(*handle)); - } - cublasSetStream(*handle, (cudaStream_t) stream); - f(*handle); - pool.push(std::move(*handle)); -} - -#endif // __CUDA_HANDLE_POOL_H__ diff --git a/src/devices/handle.cc b/src/devices/handle.cc new file mode 100644 index 00000000..6b7f54a8 --- /dev/null +++ b/src/devices/handle.cc @@ -0,0 +1,101 @@ +#include "handle/handle_export.h" +#ifdef ENABLE_CPU +#include "./cpu/cpu_handle.h" +#endif +#ifdef ENABLE_NV_GPU +#include "./cuda/cuda_handle.h" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "./bang/bang_handle.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "./ascend/ascend_handle.h" +#endif +#ifdef ENABLE_METAX_GPU 
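+/*
+ * Illustrative usage of the handle API defined below (a sketch only; it assumes a
+ * CPU-enabled build and abbreviates error handling):
+ *
+ *     infiniopHandle_t handle = NULL;
+ *     if (infiniopCreateHandle(&handle, DevCpu, 0) == STATUS_SUCCESS) {
+ *         // ... create operator descriptors and run computations with `handle` ...
+ *         infiniopDestroyHandle(handle);
+ *     }
+ */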
+#include "./maca/maca_handle.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "./musa/musa_handle.h" +#endif + + +__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) { + if (handle_ptr == nullptr) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + if (device_id < 0) { + return STATUS_BAD_PARAM; + } + + switch (device) { +#ifdef ENABLE_CPU + case DevCpu: + return createCpuHandle((CpuHandle_t *) handle_ptr); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return createCudaHandle((CudaHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return createBangHandle((BangHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return createAscendHandle((AscendHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return createMacaHandle((MacaHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return createMusaHandle((MusaHandle_t *) handle_ptr, device_id); + } +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + delete handle; + return STATUS_SUCCESS; +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return deleteCudaHandle((CudaHandle_t) handle); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + delete (BangHandle_t) handle; + return STATUS_SUCCESS; + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return deleteAscendHandle((AscendHandle_t) handle); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return deleteMacaHandle((MacaHandle_t) handle); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + deleteMusaHandle((MusaHandle_t) handle); + return STATUS_SUCCESS; + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/devices/maca/common_maca.h b/src/devices/maca/common_maca.h new file mode 100644 index 00000000..9fa82e78 --- /dev/null +++ b/src/devices/maca/common_maca.h @@ -0,0 +1,87 @@ +#ifndef __COMMON_MACA_H__ +#define __COMMON_MACA_H__ + +#define MAX_THREADS_PER_BLOCK 1024 +#define MAX_WARP_PER_BLOCK 32 +#define WARP_SIZE 32 + +#include + +#define checkMacaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != hcSuccess) { \ + std::cerr << "MACA error: " << hcGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkMacaError(call) checkMacaErrorWithCode(call, STATUS_BAD_DEVICE) + +#define checkMcdnnError(call) \ + do { \ + if (auto status = call; status != HCDNN_STATUS_SUCCESS) { \ + std::cerr << "MCDNN error: " << hcdnnGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return STATUS_EXECUTION_FAILED; \ + } \ + } while (0) + +#include "data_type.h" +#include + +typedef struct DTMcdnnMapping { + DT layout; + hcdnnDataType_t hcdnn_type; +} DTMcdnnMapping; + +// DT mcdnnDataType_t mapping table +const DTMcdnnMapping dtMappings[] = { + {F16, HCDNN_DATA_HALF}, + {F32, HCDNN_DATA_FLOAT}, + {F64, HCDNN_DATA_DOUBLE}, + {BF16, HCDNN_DATA_BFLOAT16}, + {I8, HCDNN_DATA_INT8}, + {I32, HCDNN_DATA_INT32}, + {I64, HCDNN_DATA_INT64}, + {U8, HCDNN_DATA_UINT8}, +}; + +typedef struct DataLayoutMap { + int operator[](const DataLayout &layout) const { + for 
(const auto &mapping : dtMappings) { + if (mapping.layout == layout) { + return mapping.hcdnn_type; + } + } + return -1; + } +} DTMap; + +constexpr DTMap dataTypeMap; + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif// __COMMON_MACA_H__ diff --git a/src/devices/maca/maca_handle.cc b/src/devices/maca/maca_handle.cc new file mode 100644 index 00000000..9b1b52b8 --- /dev/null +++ b/src/devices/maca/maca_handle.cc @@ -0,0 +1,55 @@ +#include "maca_handle.h" + +infiniopStatus_t createMacaHandle(MacaHandle_t *handle_ptr, int device_id) { + // Check if device_id is valid + int device_count; + hcGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + // Create a new mcblas handle pool + auto pool = std::make_shared>(); + if (hcSetDevice(device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + hcblasHandle_t handle; + hcblasCreate(&handle); + pool->push(std::move(handle)); + + // create a mcdnn handle pool + auto mcdnn_pool = std::make_shared>(); + hcdnnHandle_t mcdnn_handle; + checkMcdnnError(hcdnnCreate(&mcdnn_handle)); + mcdnn_pool->push(std::move(mcdnn_handle)); + + // set MACA device property + hcDeviceProp_t prop; + hcGetDeviceProperties(&prop, device_id); + + // set device compute capability numbers + int capability_major; + int capability_minor; + hcDeviceGetAttribute(&capability_major, hcDeviceAttributeComputeCapabilityMajor, device_id); + hcDeviceGetAttribute(&capability_minor, hcDeviceAttributeComputeCapabilityMinor, device_id); + + *handle_ptr = new MacaContext{ + DevMetaxGpu, + device_id, + std::move(pool), + std::move(mcdnn_pool), + std::move(prop), + capability_major, + capability_minor, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteMacaHandle(MacaHandle_t handle_ptr) { + handle_ptr->mcblas_handles_t = nullptr; + handle_ptr->mcdnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/maca/maca_handle.h b/src/devices/maca/maca_handle.h new file mode 100644 index 00000000..41485099 --- /dev/null +++ b/src/devices/maca/maca_handle.h @@ -0,0 +1,52 @@ +#ifndef MACA_HANDLE_H +#define MACA_HANDLE_H + +#include "../pool.h" +#include "common_maca.h" +#include "device.h" +#include "status.h" +#include +#include +#include + +struct MacaContext { + Device device; + int device_id; + std::shared_ptr> mcblas_handles_t; + std::shared_ptr> mcdnn_handles_t; + hcDeviceProp_t prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct MacaContext *MacaHandle_t; + +infiniopStatus_t createMacaHandle(MacaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteMacaHandle(MacaHandle_t handle_ptr); + +template +void use_mcblas(std::shared_ptr> mcblas_handles_t, int device_id, hcStream_t stream, T const &f) { + auto handle = 
mcblas_handles_t->pop(); + if (!handle) { + hcSetDevice(device_id); + hcblasCreate(&(*handle)); + } + hcblasSetStream(*handle, (hcStream_t) stream); + f(*handle); + mcblas_handles_t->push(std::move(*handle)); +} + +template +hcdnnStatus_t use_mcdnn(std::shared_ptr> mcdnn_handles_t, int device_id, hcStream_t stream, T const &f) { + auto handle = mcdnn_handles_t->pop(); + if (!handle) { + hcSetDevice(device_id); + hcdnnCreate(&(*handle)); + } + hcdnnSetStream(*handle, stream); + hcdnnStatus_t status = f(*handle); + mcdnn_handles_t->push(std::move(*handle)); + return status; +} + +#endif diff --git a/src/devices/musa/common_musa.h b/src/devices/musa/common_musa.h new file mode 100644 index 00000000..c42b5197 --- /dev/null +++ b/src/devices/musa/common_musa.h @@ -0,0 +1,77 @@ +#ifndef __COMMON_MUSA_H__ +#define __COMMON_MUSA_H__ + +#define MAX_THREADS_PER_BLOCK 1024 +#define MAX_WARP_PER_BLOCK 32 +#define WARP_SIZE 32 + +#include +#include "data_type.h" +#include +#include +#include + +enum class Type { + QINT4, + QINT8, + INT8, + INT16, + INT32, + INT64, + UINT8, + UINT16, + UINT32, + UINT64, + HALF, + BFLOAT16, + FLOAT, + DOUBLE, + BOOL, +}; + +enum class Format { + UNKNOWN, + NCW, + NWC, + NCHW, + NHWC, + HWCN, + NCDHW, + NDHWC, + DHWCN, +}; + +#define checkMusaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != musaSuccess) { \ + std::cerr << "MUSA error: " << musaGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkMusaError(call) checkMusaErrorWithCode(call, STATUS_BAD_DEVICE) + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif // __COMMON_MUSA_H__ diff --git a/src/devices/musa/musa_handle.cc b/src/devices/musa/musa_handle.cc new file mode 100644 index 00000000..3a7f8174 --- /dev/null +++ b/src/devices/musa/musa_handle.cc @@ -0,0 +1,57 @@ +#include "musa_handle.h" +#include + +infiniopStatus_t createMusaHandle(MusaHandle_t* handle_ptr, int device_id) { + int device_count; + musaGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != device_id && musaSetDevice(device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + + // set MUSA device property + musaDeviceProp prop; + musaGetDeviceProperties(&prop, device_id); + + // create a mublas handle pool + auto mublas_pool = std::make_shared>(); + mublasHandle_t *mublas_handle = new mublasHandle_t; + mublasCreate(mublas_handle); + mublas_pool->push(mublas_handle); + + // create a mudnn handle pool + auto mudnn_pool = std::make_shared>(); + musa::dnn::Handle *mudnn_handle = new 
musa::dnn::Handle; + mudnn_pool->push(mudnn_handle); + + int capability_major; + int capability_minor; + musaDeviceGetAttribute(&capability_major, musaDevAttrComputeCapabilityMajor, device_id); + musaDeviceGetAttribute(&capability_minor, musaDevAttrComputeCapabilityMinor, device_id); + + *handle_ptr = new MusaContext{ + DevMthreadsGpu, + device_id, + std::move(mublas_pool), + std::move(mudnn_pool), + std::move(prop), + capability_major, + capability_minor,}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteMusaHandle(MusaHandle_t handle_ptr) { + handle_ptr->mublas_handles_t = nullptr; + handle_ptr->mudnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/musa/musa_handle.h b/src/devices/musa/musa_handle.h new file mode 100644 index 00000000..6de2c2d3 --- /dev/null +++ b/src/devices/musa/musa_handle.h @@ -0,0 +1,64 @@ +#ifndef __MUSA_HANDLE_H__ +#define __MUSA_HANDLE_H__ + +#include "pool.h" +#include "device.h" +#include "status.h" +#include "ops/matmul/matmul.h" +#include +#include +#include +#include +#include + +struct MusaContext { + Device device; + int device_id; + std::shared_ptr> mublas_handles_t; + std::shared_ptr> mudnn_handles_t; + musaDeviceProp prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct MusaContext *MusaHandle_t; + +infiniopStatus_t createMusaHandle(MusaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteMusaHandle(MusaHandle_t handle_ptr); + +template +void use_mublas(std::shared_ptr> mublas_handles_t, int device_id, MUstream stream, T const &f) { + mublasHandle_t *handle = mublas_handles_t->pop(); + if (!handle) { + int current_device; + musaGetDevice(¤t_device); + if (current_device != device_id) { + musaSetDevice(device_id); + } + mublasHandle_t *handle = new mublasHandle_t; + mublasCreate(handle); + } + mublasSetStream(*handle, (MUstream) stream); + f(*handle); + mublas_handles_t->push(handle); +} + +template +void use_mudnn(std::shared_ptr> mudnn_handles_t, int device_id, musaStream_t stream, T const &f) { + musa::dnn::Handle* handle = mudnn_handles_t->pop(); + if (!handle) { + int current_device; + musaGetDevice(¤t_device); + if (current_device != device_id) { + musaSetDevice(device_id); + } + handle = new musa::dnn::Handle(device_id); + // mudnnCreate(handle); + } + // mudnnSetStream(*handle, (MUstream) stream); + handle->SetStream(stream); + f(handle); + mudnn_handles_t->push(handle); +} + +#endif // __MUSA_HANDLE_H__ diff --git a/src/devices/musa/pool.h b/src/devices/musa/pool.h new file mode 100644 index 00000000..2cfb5e32 --- /dev/null +++ b/src/devices/musa/pool.h @@ -0,0 +1,50 @@ +#ifndef __POOL_MUSA_H__ +#define __POOL_MUSA_H__ + +#include +#include +#include + +template +class Pool { +public: + Pool() : _head(nullptr) {} + + Pool(const Pool &) = delete; + + Pool(Pool &&pool) noexcept : _head(pool._head.exchange(nullptr)) {} + + ~Pool() { + while (this->pop()) {} + } + + void push(T *val) const { + Node *new_node = new Node(val); + new_node->next = _head.load(); + while (!_head.compare_exchange_weak(new_node->next, new_node)); + } + + T* pop() const { + Node *top = _head.load(); + Node *new_head = nullptr; + do { + if (!top) { + return nullptr; + } + new_head = top->next; + } while (!_head.compare_exchange_weak(top, new_head)); + return top->data; + } + +private: + template + struct Node { + U *data; + Node *next; + Node(U *data) : data(data), next(nullptr) {} + }; + + mutable std::atomic *> _head; +}; + +#endif // __POOL_MUSA_H__ diff --git 
a/src/main.c b/src/main.c deleted file mode 100644 index 721159e4..00000000 --- a/src/main.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "ops/rotary_embedding/rotary_embedding.h" -#include "tensor.h" -#include - -void test_rms_norm() { - void *descriptor = createRotaryEmbeddingDescriptor(DevNvGpu, NULL); - struct TensorLayout l; - Tensor t = {&l, NULL}; - Tensor t2 = {&l, NULL}; - rotaryEmbedding(descriptor, t, t2, 10000.0, NULL); - destroyRotaryEmbeddingDescriptor(descriptor); -} - -int main(int argc, char **argv) { - test_rms_norm(); - return 0; -} diff --git a/src/ops/add/cpu/add_cpu.cc b/src/ops/add/cpu/add_cpu.cc new file mode 100644 index 00000000..ce859b1a --- /dev/null +++ b/src/ops/add/cpu/add_cpu.cc @@ -0,0 +1,104 @@ +#include "add_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, + AddCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + uint64_t *a_strides = new uint64_t[ndim]; + uint64_t *b_strides = new uint64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; + } + + uint64_t *c_indices = new uint64_t[ndim]; + std::fill(c_indices, c_indices + ndim, 0); + uint64_t *c_shape = new uint64_t[ndim]; + std::copy(c->shape, c->shape + ndim, c_shape); + + *desc_ptr = new AddCpuDescriptor{ + DevCpu, + c->dt, + ndim, + c_data_size, + c_shape, + a_strides, + b_strides, + c_indices, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc) { + delete[] desc->c_shape; + delete[] desc->a_strides; + delete[] desc->b_strides; + delete[] desc->c_indices; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t add_cpu(AddCpuDescriptor_t desc, void *c, void const *a, void const *b) { + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + const auto &indices = desc->c_indices; + + for (uint64_t i = 0; i < desc->c_data_size; ++i, incrementOne(indices, desc->c_shape, desc->ndim)) { + auto a_index = compactToFlat(indices, desc->a_strides, desc->ndim); + auto b_index = compactToFlat(indices, desc->b_strides, desc->ndim); + if constexpr (std::is_same::value) { + c_[i] = f32_to_f16(f16_to_f32(a_[a_index]) + f16_to_f32(b_[b_index])); + } else { + c_[i] = a_[a_index] + b_[b_index]; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + if (desc->dtype == F16) { + return add_cpu(desc, c, a, b); + } + if (desc->dtype == F32) { + return add_cpu(desc, c, a, b); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/cpu/add_cpu.h b/src/ops/add/cpu/add_cpu.h new file mode 100644 index 00000000..42e62435 --- /dev/null +++ b/src/ops/add/cpu/add_cpu.h @@ -0,0 +1,33 @@ +#ifndef __CPU_ADD_H__ +#define __CPU_ADD_H__ + +#include "operators.h" +#include +#include + +struct AddCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t c_data_size; + uint64_t const *c_shape; + uint64_t const *a_strides; + uint64_t const *b_strides; + uint64_t *c_indices; +}; + +typedef struct AddCpuDescriptor *AddCpuDescriptor_t; + +infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, + AddCpuDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc); + +#endif diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc new file mode 100644 index 00000000..eebcf4be --- /dev/null +++ b/src/ops/add/cuda/add.cc @@ -0,0 +1,81 @@ +#include "add.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, + AddCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; 
+ } + } + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + int64_t *a_strides = new int64_t[ndim]; + int64_t *b_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + } + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, handle->device_id); + + int64_t *a_strides_d, *b_strides_d, *c_strides_d; + checkCudaErrorWithCode(cudaMalloc((void **) &a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new AddCudaDescriptor{ + DevNvGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(prop.maxGridSize[0]), + a_strides_d, + b_strides_d, + c_strides_d, + broadcasted, + }; + + delete[] a_strides; + delete[] b_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/add/cuda/add.cu b/src/ops/add/cuda/add.cu new file mode 100644 index 00000000..9d9aefcb --- /dev/null +++ b/src/ops/add/cuda/add.cu @@ -0,0 +1,116 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "add.cuh" + +/** + * @brief A templated vector struct that supports element-wise addition on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. 
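+ *
+ * When T and TComp are the same type the sum is a plain per-element addition; otherwise
+ * each T is reinterpreted as a nested vecN of sizeof(T) / sizeof(TComp) TComp values,
+ * so one aligned access of T carries several TComp elements.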
+ */ +template +struct vecN { + T data[N]; + + __device__ __forceinline__ vecN operator+(const vecN &other) const { + vecN result; + + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result.data[i] = data[i] + other.data[i]; + } else { + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + auto data_ = reinterpret_cast *>(result.data); + data_[i] = std::move(reinterpret_cast const *>(data)[i] + + reinterpret_cast const *>(other.data)[i]); + } + } + + return result; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void add( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *a_strides, + const int64_t *b_strides, + const int64_t *c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = a_[a_idx] + b_[b_idx]; + } + return; + } + c[idx] = a[idx] + b[idx]; + } +} + +template +void _add_nv_gpu(AddCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + add<<>>( + c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); + } +} + +template +infiniopStatus_t add_nv_gpu(AddCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return add_nv_gpu, half>(desc, c, a, b, stream, 8); + } + if (desc->dtype == F32) { + return add_nv_gpu, float>(desc, c, a, b, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/cuda/add.cuh b/src/ops/add/cuda/add.cuh new file mode 100644 index 00000000..03a181eb --- /dev/null +++ b/src/ops/add/cuda/add.cuh @@ -0,0 +1,37 @@ +#ifndef __CUDA_ADD_H__ +#define __CUDA_ADD_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct AddCudaDescriptor { + Device device; + DT dtype; + 
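+    // a_strides, b_strides and c_strides are device pointers allocated in
+    // cudaCreateAddDescriptor and released by cudaDestroyAddDescriptor;
+    // `broadcasted` records whether a or b must be broadcast to c's shape.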
int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; +}; + +typedef struct AddCudaDescriptor *AddCudaDescriptor_t; + +infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t, + AddCudaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc); + +#endif diff --git a/src/ops/add/musa/add_musa.cc b/src/ops/add/musa/add_musa.cc new file mode 100644 index 00000000..8c4475fe --- /dev/null +++ b/src/ops/add/musa/add_musa.cc @@ -0,0 +1,81 @@ +#include "add_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateAddDescriptor(MusaHandle_t handle, + AddMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + int64_t *a_strides = new int64_t[ndim]; + int64_t *b_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; + } + + musaDeviceProp prop; + musaGetDeviceProperties(&prop, handle->device_id); + + int64_t *a_strides_d, *b_strides_d, *c_strides_d; + checkMusaErrorWithCode(musaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new AddMusaDescriptor{ + DevMthreadsGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(prop.maxGridSize[0]), + a_strides_d, + b_strides_d, + c_strides_d, + broadcasted, + }; + + delete[] a_strides; + delete[] b_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyAddDescriptor(AddMusaDescriptor_t desc) { + checkMusaErrorWithCode(musaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/add/musa/add_musa.h b/src/ops/add/musa/add_musa.h new file mode 100644 index 00000000..c492c45c --- /dev/null +++ b/src/ops/add/musa/add_musa.h @@ -0,0 +1,37 @@ +#ifndef __MUSA_ADD_H__ +#define __MUSA_ADD_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct AddMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; +}; + +typedef struct AddMusaDescriptor *AddMusaDescriptor_t; + +infiniopStatus_t musaCreateAddDescriptor(MusaHandle_t, + AddMusaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t musaAdd(AddMusaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t musaDestroyAddDescriptor(AddMusaDescriptor_t desc); + +#endif diff --git a/src/ops/add/musa/add_musa.mu b/src/ops/add/musa/add_musa.mu new file mode 100644 index 00000000..0766aa7c --- /dev/null +++ b/src/ops/add/musa/add_musa.mu @@ -0,0 +1,116 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "add_musa.h" + +/** + * @brief A templated vector struct that supports element-wise addition on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + + __device__ __forceinline__ vecN operator+(const vecN &other) const { + vecN result; + + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result.data[i] = data[i] + other.data[i]; + } else { + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + auto data_ = reinterpret_cast *>(result.data); + data_[i] = std::move(reinterpret_cast const *>(data)[i] + + reinterpret_cast const *>(other.data)[i]); + } + } + + return result; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void add( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *a_strides, + const int64_t *b_strides, + const int64_t *c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = a_[a_idx] + b_[b_idx]; + } + return; + } + c[idx] = a[idx] + b[idx]; + } +} + +template +void _add_nv_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + add<<>>( + c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); + } +} + +template +infiniopStatus_t add_mt_gpu(AddMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaAdd(AddMusaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return add_mt_gpu, half>(desc, c, a, b, stream, 8); + } + if (desc->dtype == F32) { + return add_mt_gpu, float>(desc, c, a, b, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc new file mode 100644 index 00000000..de97dc94 --- /dev/null +++ b/src/ops/add/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/add/add.h" + +#ifdef ENABLE_CPU +#include "cpu/add_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/add.cuh" +#endif +#ifdef 
ENABLE_MTHREADS_GPU +#include "musa/add_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateAddDescriptor( + infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateAddDescriptor(handle, (AddCpuDescriptor_t *) desc_ptr, c, a, b); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateAddDescriptor((CudaHandle_t) handle, (AddCudaDescriptor_t *) desc_ptr, c, a, b); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateAddDescriptor((MusaHandle_t) handle, (AddMusaDescriptor_t *) desc_ptr, c, a, b); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuAdd((AddCpuDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaAdd((AddCudaDescriptor_t) desc, c, a, b, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaAdd((AddMusaDescriptor_t) desc, c, a, b, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyAddDescriptor((AddCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyAddDescriptor((AddCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyAddDescriptor((AddMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/attention/operator.cc b/src/ops/attention/operator.cc new file mode 100644 index 00000000..fc3ee9b3 --- /dev/null +++ b/src/ops/attention/operator.cc @@ -0,0 +1,320 @@ +#include "../utils.h" +#include "ops/attention/attention.h" +#include "ops/causal_softmax/causal_softmax.h" +#include "ops/matmul/matmul.h" +#include "ops/rearrange/rearrange.h" +#include "tensor/tensor_descriptor.h" +#include + +struct _AttentionDescriptor { + Device device; + infiniopRearrangeDescriptor_t rearrange_desc_k; + infiniopRearrangeDescriptor_t rearrange_desc_v; + infiniopRearrangeDescriptor_t rearrange_desc_q; + infiniopRearrangeDescriptor_t rearrange_desc_out; + infiniopMatmulDescriptor_t matmul_desc1; + infiniopMatmulDescriptor_t matmul_desc2; + infiniopCausalSoftmaxDescriptor_t softmax_desc; + uint64_t workspace_size; + uint64_t rearranged_q_size; + uint64_t matmul1_workspace_size; + uint64_t matmul1_tensor_size; + uint64_t matmul2_workspace_size; + uint64_t matmul2_tensor_size; + uint64_t softmax_workspace_size; + uint64_t k_cache_offset; + uint64_t v_cache_offset; +}; + +typedef struct _AttentionDescriptor *_AttentionDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle, + infiniopAttentionDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t q_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, + infiniopTensorDescriptor_t k_cache_desc, + infiniopTensorDescriptor_t v_cache_desc, + uint64_t pos) { + if (out_desc->ndim != 3 || q_desc->ndim != 3 || k_desc->ndim != 
3 || + v_desc->ndim != 3 || k_cache_desc->ndim != 3 || v_cache_desc->ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (!is_contiguous(out_desc, 0, 2)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (q_desc->strides[2] != 1 || k_desc->strides[2] != 1 || v_desc->strides[2] != 1 || + k_cache_desc->strides[2] != 1 || v_cache_desc->strides[2] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t n_q_head = q_desc->shape[0]; + uint64_t seq_len = q_desc->shape[1]; + uint64_t head_dim = q_desc->shape[2]; + uint64_t hidden_size = n_q_head * head_dim; + uint64_t n_kv_head = k_desc->shape[0]; + uint64_t total_seq_len = seq_len + pos; + uint64_t n_group = n_q_head / n_kv_head; + + // out: [seq_len, n_q_head, head_dim] + if (out_desc->shape[0] != seq_len || out_desc->shape[1] != n_q_head || out_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // k: [n_kv_head, seq_len, head_dim] + if (k_desc->shape[0] != n_kv_head || k_desc->shape[1] != seq_len || k_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // v: [n_kv_head, seq_len, head_dim] + if (v_desc->shape[0] != n_kv_head || v_desc->shape[1] != seq_len || v_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // k_cache: [n_kv_head, _, head_dim] + if (k_cache_desc->shape[0] != n_kv_head || k_cache_desc->shape[1] < total_seq_len || k_cache_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // v_cache: [n_kv_head, _, head_dim] + if (v_cache_desc->shape[0] != n_kv_head || v_cache_desc->shape[1] < total_seq_len || v_cache_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // Rearrange k into k_cache + infiniopTensorDescriptor_t dst_k_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&dst_k_desc, 3, k_desc->shape, k_cache_desc->strides, k_cache_desc->dt), STATUS_SUCCESS); + infiniopRearrangeDescriptor_t rearrange_desc_k; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_k, dst_k_desc, k_desc), STATUS_SUCCESS); + + // Rearrange v into v_cache + infiniopTensorDescriptor_t dst_v_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&dst_v_desc, 3, v_desc->shape, v_cache_desc->strides, v_cache_desc->dt), STATUS_SUCCESS); + infiniopRearrangeDescriptor_t rearrange_desc_v; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_v, dst_v_desc, v_desc), STATUS_SUCCESS); + + // Rearrange q into contiguous + infiniopRearrangeDescriptor_t rearrange_desc_q = nullptr; + uint64_t rearranged_q_size = 0; + if (!is_contiguous(q_desc, 0, 1)) { + infiniopTensorDescriptor_t rearranged_q_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&rearranged_q_desc, 3, q_desc->shape, nullptr, q_desc->dt), STATUS_SUCCESS); + rearranged_q_size = get_byte_size(rearranged_q_desc); + rearrange_desc_q = new RearrangeDescriptor; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_q, rearranged_q_desc, q_desc), STATUS_SUCCESS); + } + + // Matmul1: q * full_k + // q: [n_q_head, seq_len, head_dim] -> [n_kv_head, n_group *seq_len, head_dim] + infiniopTensorDescriptor_t reshaped_q_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&reshaped_q_desc, 3, q_desc->shape, nullptr, q_desc->dt), STATUS_SUCCESS); + reshaped_q_desc = dim_split(reshaped_q_desc, 0, {n_kv_head, n_group}); + if (!reshaped_q_desc) { + return STATUS_BAD_PARAM; + } + reshaped_q_desc = dim_merge(reshaped_q_desc, 1, 2); + if (!reshaped_q_desc) { + return STATUS_BAD_PARAM; + } + // full_k: [n_kv_head, head_dim, total_seq_len] + infiniopTensorDescriptor_t full_k_desc; + uint64_t 
full_k_shape[3] = {n_kv_head, total_seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&full_k_desc, 3, full_k_shape, k_cache_desc->strides, k_cache_desc->dt), STATUS_SUCCESS); + full_k_desc = permute(full_k_desc, {0, 2, 1}); + if (!full_k_desc) { + return STATUS_BAD_PARAM; + } + // qk: [n_kv_head, n_group * seq_len, total_seq_len] + infiniopTensorDescriptor_t qk_desc; + uint64_t qk_shape[3] = {n_kv_head, n_group * seq_len, total_seq_len}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&qk_desc, 3, qk_shape, nullptr, q_desc->dt), STATUS_SUCCESS); + // matmul1_desc + // qk_alpha + float qk_alpha = 1 / sqrt(head_dim); + infiniopMatmulDescriptor_t matmul1_desc; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul1_desc, qk_desc, qk_alpha, reshaped_q_desc, full_k_desc, 0.0), STATUS_SUCCESS); + // matmul1 workspace size + uint64_t matmul1_workspace_size; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul1_desc, &matmul1_workspace_size), STATUS_SUCCESS); + // matmul1 tensor size + uint64_t matmul1_tensor_size = get_byte_size(qk_desc); + + // CausalSoftmax: softmax(qk) + // qk: [n_kv_head, n_group * seq_len, total_seq_len] -> [n_q_head, seq_len, total_seq_len] + qk_desc = dim_split(qk_desc, 1, {n_group, seq_len}); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + qk_desc = dim_merge(qk_desc, 0, 1); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + infiniopCausalSoftmaxDescriptor_t softmax_desc; + CHECK_STATUS(infiniopCreateCausalSoftmaxDescriptor(handle, &softmax_desc, qk_desc), STATUS_SUCCESS); + // softmax workspace size + uint64_t softmax_workspace_size; + CHECK_STATUS(infiniopGetCausalSoftmaxWorkspaceSize(softmax_desc, &softmax_workspace_size), STATUS_SUCCESS); + + // Matmul2: softmax(qk) * full_v + // softmax(qk): [n_q_head, seq_len, total_seq_len] -> [n_kv_head, n_group * seq_len, total_seq_len] + // full_v: [n_kv_head, total_seq_len, head_dim] + qk_desc = dim_split(qk_desc, 0, {n_kv_head, n_group}); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + qk_desc = dim_merge(qk_desc, 1, 2); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + infiniopTensorDescriptor_t full_v_desc; + uint64_t full_v_shape[3] = {n_kv_head, total_seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&full_v_desc, 3, full_v_shape, v_cache_desc->strides, v_cache_desc->dt), STATUS_SUCCESS); + // temp_out: [n_kv_head, n_group * seq_len, head_dim] + infiniopTensorDescriptor_t temp_out_desc; + uint64_t temp_out_shape[3] = {n_kv_head, n_group * seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&temp_out_desc, 3, temp_out_shape, nullptr, q_desc->dt), STATUS_SUCCESS); + // matmul2_desc + infiniopMatmulDescriptor_t matmul2_desc; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul2_desc, temp_out_desc, 1.0, qk_desc, full_v_desc, 0.0), STATUS_SUCCESS); + // matmul2 workspace size + uint64_t matmul2_workspace_size; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul2_desc, &matmul2_workspace_size), STATUS_SUCCESS); + // matmul2 tensor size + uint64_t matmul2_tensor_size = get_byte_size(temp_out_desc); + + // Rearrange temp_out into out + // out: [seq_len, n_q_head, head_dim] + // temp_out: [n_kv_head, n_group * seq_len, head_dim] -> [n_q_head, seq_len, head_dim] -> [seq_len, n_q_head, head_dim] + temp_out_desc = dim_split(temp_out_desc, 1, {n_group, seq_len}); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + temp_out_desc = dim_merge(temp_out_desc, 0, 1); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + temp_out_desc = permute(temp_out_desc, {1, 0, 
2}); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + infiniopRearrangeDescriptor_t rearrange_desc_out; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_out, out_desc, temp_out_desc), STATUS_SUCCESS); + + // workspace size + uint64_t workspace_size = rearranged_q_size + std::max(std::max(matmul1_workspace_size + matmul1_tensor_size, + matmul1_tensor_size + softmax_workspace_size), + matmul1_tensor_size + matmul2_workspace_size + matmul2_tensor_size); + + // k_cache_offset + uint64_t k_cache_offset = 0; + if (pos > 0) { + k_cache_offset = pos * get_byte_strides(k_cache_desc)[1]; + } + + // v_cache_offset + uint64_t v_cache_offset = 0; + if (pos > 0) { + v_cache_offset = pos * get_byte_strides(v_cache_desc)[1]; + } + + // create attention descriptor + *(_AttentionDescriptor_t *) desc_ptr = new _AttentionDescriptor{ + handle->device, + rearrange_desc_k, + rearrange_desc_v, + rearrange_desc_q, + rearrange_desc_out, + matmul1_desc, + matmul2_desc, + softmax_desc, + workspace_size, + rearranged_q_size, + matmul1_workspace_size, + matmul1_tensor_size, + matmul2_workspace_size, + matmul2_tensor_size, + softmax_workspace_size, + k_cache_offset, + v_cache_offset, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, uint64_t *size) { + *size = ((_AttentionDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *out, + void const *q, + void const *k, + void const *v, + void *k_cache, + void *v_cache, + void *stream) { + auto _desc = (_AttentionDescriptor_t) desc; + void *_workspace = workspace; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + // concat k and v to k_cache and v_cache + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_k, + (char *) k_cache + _desc->k_cache_offset, k, stream), + STATUS_SUCCESS); + + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_v, + (char *) v_cache + _desc->v_cache_offset, v, stream), + STATUS_SUCCESS); + + // rearrange q into contiguous + void const *_q = q; + if (_desc->rearrange_desc_q) { + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_q, (char *) _workspace, q, stream), STATUS_SUCCESS); + _q = _workspace; + _workspace = (char *) _workspace + _desc->rearranged_q_size; + } + + // matmul1: q * full_k + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc1, + (char *) _workspace + _desc->matmul1_tensor_size, _desc->workspace_size - _desc->matmul1_tensor_size, + _workspace, _q, k_cache, stream), + STATUS_SUCCESS); + // softmax(qk) + CHECK_STATUS(infiniopCausalSoftmax(_desc->softmax_desc, + (char *) _workspace + _desc->matmul1_tensor_size, _desc->workspace_size - _desc->matmul1_tensor_size, + _workspace, stream), + STATUS_SUCCESS); + // matmul2: softmax(qk) * full_v + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc2, + (char *) _workspace + _desc->matmul1_tensor_size + _desc->matmul2_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size - _desc->matmul2_tensor_size, + (char *) _workspace + _desc->matmul1_tensor_size, _workspace, v_cache, stream), + STATUS_SUCCESS); + // rearrange out + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_out, out, (char *) _workspace + _desc->matmul1_tensor_size, stream), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t 
infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc) { + if (((_AttentionDescriptor_t) desc)->rearrange_desc_q) { + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_q), STATUS_SUCCESS); + } + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_k), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_v), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_out), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_AttentionDescriptor_t) desc)->matmul_desc1), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_AttentionDescriptor_t) desc)->matmul_desc2), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyCausalSoftmaxDescriptor(((_AttentionDescriptor_t) desc)->softmax_desc), STATUS_SUCCESS); + delete (_AttentionDescriptor_t) desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/avg_pool/operator.cc b/src/ops/avg_pool/operator.cc new file mode 100644 index 00000000..29c1a332 --- /dev/null +++ b/src/ops/avg_pool/operator.cc @@ -0,0 +1,54 @@ +#include "../pooling/pooling.h" +#include "../utils.h" +#include "ops/avg_pool/avg_pool.h" + +struct _AvgPoolDescriptor { + Device device; + infiniopPoolingDescriptor_t pooling_desc; + uint64_t workspace_size; +}; + +typedef struct _AvgPoolDescriptor *_AvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n) { + infiniopPoolingDescriptor_t pooling_desc; + CHECK_STATUS(infiniopCreatePoolingDescriptor(handle, &pooling_desc, y, x, kernel_shape, pads, strides, n, 1), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetPoolingWorkspaceSize(pooling_desc, &workspace_size), STATUS_SUCCESS); + + *(_AvgPoolDescriptor_t *) desc_ptr = new _AvgPoolDescriptor{ + handle->device, + pooling_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, uint64_t *size) { + *size = ((_AvgPoolDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_AvgPoolDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopPooling(_desc->pooling_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyPoolingDescriptor(((_AvgPoolDescriptor_t) desc)->pooling_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc new file mode 100644 index 00000000..26ed34c1 --- /dev/null +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc @@ -0,0 +1,187 @@ +#include "causal_softmax_aclnn.h" +#include "../../utils.h" + +CausalSoftmaxAclnnDescriptor::CausalSoftmaxAclnnDescriptor(Device _device) { + device = _device; + 
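+    // The constructor only pre-allocates the aclnn tensor descriptors and zero-initializes
+    // the executor, workspace size and mask buffer; aclnnCreateCausalSoftmaxDescriptor
+    // fills them in afterwards.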
device_id = 0; + aDesc = new aclnnTensorDescriptor(); + maskDesc = new aclnnTensorDescriptor(); + outDesc = new aclnnTensorDescriptor(); + executor = nullptr; + workspaceSize = 0; + maskAddr = nullptr; +} + +infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, + CausalSoftmaxAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->ndim >= 4) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (!is_contiguous(y, 0, y->ndim - 1)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + // Construct CausalSoftmaxAclnnDescriptor + *desc_ptr = new CausalSoftmaxAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + // Set value from infiniopTensorDescriptor + auto &aDesc = (*desc_ptr)->aDesc; + auto &outDesc = (*desc_ptr)->outDesc; + + uint64_t ndim = y->ndim; + uint64_t *shape = y->shape; + int64_t *strides = y->strides; + int64_t total_seq_len = static_cast(shape[ndim - 1]); + int64_t seq_len = static_cast(shape[ndim - 2]); + + if (total_seq_len < seq_len) { + return STATUS_BAD_TENSOR_SHAPE; + } + + // Change input shape and stride + auto aclnn_shape = std::vector(4); + auto aclnn_strides = std::vector(4); + for (uint64_t i = 0; i < ndim; ++i) { + aclnn_shape[4 - i - 1] = shape[ndim - i - 1]; + aclnn_strides[4 - i - 1] = strides[ndim - i - 1]; + } + // Add padding to input shape and stride if ndim < 4 + for (uint64_t i = 0; i < 4 - ndim; ++i) { + aclnn_shape[i] = 1; + aclnn_strides[i] = aclnn_shape[i + 1] * aclnn_strides[i + 1]; + } + + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(outDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); + + // Set mask Desc + auto &maskDesc = (*desc_ptr)->maskDesc; + auto mask_shape = std::vector(3); + + mask_shape[2] = total_seq_len; + mask_shape[1] = seq_len; + if (ndim == 2) { + mask_shape[0] = 1; + } else { + mask_shape[0] = static_cast(shape[0]); + } + auto mask_strides = std::vector{total_seq_len * seq_len, total_seq_len, 1}; + + CHECK_STATUS(maskDesc->setDescriptor(toAclDataType(y->dt), mask_shape, mask_strides), STATUS_SUCCESS); + + // Create aclTensor + CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(maskDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(outDesc->createTensor(), STATUS_SUCCESS); + + // Get Tensor + aclTensor *ta = aDesc->t; + aclTensor *tmask = maskDesc->t; + aclTensor *tout = outDesc->t; + + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + auto ret = aclnnMaskedSoftmaxWithRelPosBiasGetWorkspaceSize(ta, + nullptr, + tmask, + 1.0, 0, + tout, + &workspaceSize, + &executor); + aclSetAclOpExecutorRepeatable(executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnMaskedSoftmaxWithRelPosBiasGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + // Fill upgrade matrix + uint16_t mask_matrix[maskDesc->shape[0]][maskDesc->shape[1]][maskDesc->shape[2]]; + auto &dims = maskDesc->shape; + auto ele_size = aclDataTypeSize(maskDesc->dataType); + + // float neg_inf = -100000000; + for (int i = 0; i < dims[0]; ++i) { + for (int m = 0; m < dims[1]; ++m) { + for (int n = 0; n < dims[2]; ++n) { + if (n - m > dims[2] - dims[1]) { + // 0xF939 = -10240 half + mask_matrix[i][m][n] = 0xF880; + } else { + mask_matrix[i][m][n] = 0; + } + } + } + } + + // malloc mask space + auto &maskAddr = (*desc_ptr)->maskAddr; + auto mask_size = numElements(maskDesc->shape.data(), maskDesc->ndim) * ele_size; + CHECK_STATUS(mallocWorkspace(&maskAddr, mask_size), STATUS_SUCCESS); + + // copy mask matrix to device mem + ret = aclrtMemcpy(maskAddr, + mask_size, + mask_matrix, + mask_size, + ACL_MEMCPY_HOST_TO_DEVICE); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetCausalSoftmaxWorkspaceSize(CausalSoftmaxAclnnDescriptor_t desc, uint64_t *size) { + + *size = desc->workspaceSize; + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnCausalSoftmax(CausalSoftmaxAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + auto &aDesc = desc->aDesc; + auto &maskDesc = desc->maskDesc; + auto &outDesc = desc->outDesc; + + + // Get aclTensor pt + aclTensor *ta = aDesc->t; + aclTensor *tmask = maskDesc->t; + aclTensor *tout = outDesc->t; + + auto &executor = desc->executor; + auto &workspaceSize = desc->workspaceSize; + auto &maskAddr = desc->maskAddr; + + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + AclSetTensorAddr(executor, 0, ta, data); + AclSetTensorAddr(executor, 2, tmask, maskAddr); + AclSetTensorAddr(executor, 3, tout, data); + + auto ret = aclnnMaskedSoftmaxWithRelPosBias(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnMaskedSoftmaxWithRelPosBias failed. 
ERROR: %d\n", ret)); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescriptor_t desc) { + delete desc->aDesc; + delete desc->maskDesc; + delete desc->outDesc; + aclDestroyAclOpExecutor(desc->executor); + CHECK_STATUS(freeWorkspace(desc->maskAddr), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h new file mode 100644 index 00000000..f6b6d320 --- /dev/null +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h @@ -0,0 +1,38 @@ +#ifndef __ACLNN_CAUSAL_SOFTMAX_H__ +#define __ACLNN_CAUSAL_SOFTMAX_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "operators.h" +#include +#include +#include + +struct CausalSoftmaxAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclnnTensorDescriptor_t aDesc, maskDesc, outDesc; + uint64_t workspaceSize; + void *maskAddr; + + CausalSoftmaxAclnnDescriptor(Device device); +}; + +typedef CausalSoftmaxAclnnDescriptor *CausalSoftmaxAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, + CausalSoftmaxAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t aclnnGetCausalSoftmaxWorkspaceSize(CausalSoftmaxAclnnDescriptor_t desc, uint64_t *size); + +infiniopStatus_t aclnnCausalSoftmax(CausalSoftmaxAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.cc b/src/ops/causal_softmax/bang/causal_softmax_bang.cc new file mode 100644 index 00000000..cc9b6d37 --- /dev/null +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.cc @@ -0,0 +1,50 @@ +#include "causal_softmax_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->shape[y->ndim - 1] < y->shape[y->ndim - 2]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + int ndim = y->ndim; + int *stride = new int[ndim]; + int *shape = new int[ndim]; + + int n = 1; + for (int i = 0; i < ndim; i++) { + stride[i] = static_cast(y->strides[i]); + shape[i] = static_cast(y->shape[i]); + if (i < ndim - 1) { + n *= shape[i]; + } + } + + *desc_ptr = new CausalSoftmaxBangDescriptor{ + handle->device, + handle->device_id, + y->dt, + ndim, + stride, + shape, + n}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size) { + if (desc->ndim > 3) { + *size = desc->ndim * sizeof(int) * 2; + } else { + *size = 0; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyCausalSoftmaxDescriptor(CausalSoftmaxBangDescriptor_t desc) { + delete[] desc->stride; + delete[] desc->shape; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.h b/src/ops/causal_softmax/bang/causal_softmax_bang.h index e7a33a5f..c9e09921 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.h +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.h @@ -1,11 +1,35 @@ #ifndef __BANG_CAUSAL_SOFTMAX_H__ #define __BANG_CAUSAL_SOFTMAX_H__ +#include "../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" 
#include "operators.h" -void causal_softmax_bang_f16(Tensor y, void *stream); +struct CausalSoftmaxBangDescriptor { + Device device; + int device_id; + DT dtype; + int ndim; + int *stride; + int *shape; + int n; +}; -#endif// __BANG_CAUSAL_SOFTMAX_H__ +typedef struct CausalSoftmaxBangDescriptor *CausalSoftmaxBangDescriptor_t; +infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t bangDestroyCausalSoftmaxDescriptor(CausalSoftmaxBangDescriptor_t desc); + + +#endif diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu index 10304324..12b3e610 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu @@ -1,221 +1,212 @@ +#include "../../../devices/bang/common_bang.h" #include "bang.h" #include "bang_device_functions.h" -#include "cnrt.h" #include "causal_softmax_bang.h" -#include "../../../devices/bang/common_bang.h" +#include "cnrt.h" + const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 -__nram__ char nram_buffer[NRAM_MAX_SIZE]; -template -__mlu_device__ void causal_softmaxKernel(T *destination, T *source, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_device__ void causal_softmaxKernel(T *destination, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { int indd = 0; int indi = i; - int lastI = indi%shape[ndim - 2]; + int lastI = indi % shape[ndim - 2]; for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; + indd += (indi % shape[j]) * strideDest[j]; indi /= shape[j]; } - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); 
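+                    // Online-softmax rescaling: destSum was accumulated with the previous running
+                    // max (destOldMax), so it is multiplied by exp(destOldMax - destNewMax) below,
+                    // using exp(x - destNewMax) == exp(x - destOldMax) * exp(destOldMax - destNewMax).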
- - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } __bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - + for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; + indd += (indi % shape[j]) * strideDest[j]; indi /= shape[j]; } __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % shape[ndim - 2]; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, dimS); __bang_write_value(destSum, dimS, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); @@ -224,33 +215,31 @@ __mlu_device__ void causal_softmaxKernel(T *destination, T *source, int *strideS __bang_write_zero(src, dimS); __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSum[0]保存的就是当前maxNum长度数据的数值和 T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -__mlu_global__ void causal_softmaxUnion1(T *destination, T *source, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { +__mlu_global__ void causal_softmaxUnion1(T *destination, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { - causal_softmaxKernel(destination, source, strideSrc, strideDest, shape, othersize, dimsize, dimS, mask, ndim); + causal_softmaxKernel(destination, strideDest, shape, othersize, dimsize, dimS, mask, ndim); } + template -void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { +void causal_softmax(cnrtQueue_t queue, void *destination, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); + int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -261,7 +250,7 @@ void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *s if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -270,218 +259,205 @@ void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *s k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxUnion1<<>>(y_, x_, strideSrc, strideDest, shape, othersize, dimsize, dimS, mask, ndim); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxUnion1<<>>(y_, strideDest, shape, othersize, dimsize, dimS, mask, ndim); + cnrtQueueSync(queue); } -void causal_softmax_fp16(cnrtQueue_t queue, 
void *destination, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { - causal_softmax(queue, destination, strideSrc, strideDest, shape, othersize, dimsize, mask, ndim); -} -template -__mlu_global__ void causal_softmaxDim_2(T *destination, T *source, int strideS_f, int strideD_f, int othersize, int dimsize, int dimS, int mask){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); + +template +__mlu_global__ void causal_softmaxDim_2(T *destination, int strideD_f, int othersize, int dimsize, int dimS, int mask) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - int lastI = indi%othersize; - inds += (indi % othersize) * strideS_f; + int lastI = indi % othersize; + indd += (indi % othersize) * strideD_f; - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, 
wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } 
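+                    // Positions beyond (mask + 1 + lastI) are causally masked: src is cleared and
+                    // zeros are written back for the remaining chunks of the row, so the masked
+                    // logits end up with probability 0.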
__bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - - inds += (indi % othersize) * strideS_f; + + indd += (indi % othersize) * strideD_f; __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % othersize; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, dimS); __bang_write_value(destSum, dimS, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); @@ -490,28 +466,24 @@ __mlu_global__ void causal_softmaxDim_2(T *destination, T *source, int strideS_f __bang_write_zero(src, dimS); __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSum[0]保存的就是当前maxNum长度数据的数值和 T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_f, int strideD_f, int othersize, int dimsize, int mask) { +void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideD_f, int othersize, int dimsize, int mask) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -522,7 +494,7 @@ void 
causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_ if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -531,250 +503,237 @@ void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_ k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxDim_2<<>>(y_, x_, strideS_f, strideD_f, othersize, dimsize, dimS, mask); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxDim_2<<>>(y_, strideD_f, othersize, dimsize, dimS, mask); + cnrtQueueSync(queue); } -template -__mlu_global__ void causal_softmaxDim_3(T *destination, T *source, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, int mask){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); + +template +__mlu_global__ void causal_softmaxDim_3(T *destination, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, int mask) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; int startDim = othersize / middle; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - int lastI = indi%middle; - inds += (indi % middle) * strideS_m; + int lastI = indi % middle; + indd += (indi % middle) * strideD_m; indi /= middle; - inds += (indi % startDim) * strideS_f; + indd += (indi % startDim) * strideD_f; - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = 
segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / 
destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } __bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - - inds += (indi % middle) * strideS_m; + + indd += (indi % middle) * strideD_m; indi /= middle; - inds += (indi % startDim) * strideS_f; + indd += (indi % startDim) * strideD_f; __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % middle; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//长度为dimsize的向量,只考虑前面mask + 1 + lastI部分的softmax __bang_argmax(srcMax, src, dimS); - __bang_write_value(destSum, dimS, srcMax[0]); - __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); - __bang_sub_scalar(destSum, destSum, srcMax[0], dimS); - __bang_active_exp_less_0(destSum, destSum, dimS); - __bang_write_zero(src, dimS); - __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); + __bang_write_zero(destSum, dimS); + __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//初始化destSum为0,前面mask + 1 + lastI部分元素和src保持一致 + __bang_sub_scalar(destSum, destSum, srcMax[0], mask + 1 + lastI);//前面mask + 1 + lastI元素减去最大值M,后面的元素还是0 + __bang_active_exp_less_0(destSum, destSum, mask + 1 + lastI);//前面mask + 1 + lastI元素做指数变换,后面的元素还是0 + __memcpy(src, destSum, dimS * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - 
__bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSumFinal[0]存储的是前面mask + 1 + lastI的sum + T globalSumInv = 1.0 / destSumFinal[0]; __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int mask) { +void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int mask) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); + int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -785,7 +744,7 @@ void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_ if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -794,61 +753,48 @@ void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_ k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxDim_3<<>>(y_, x_, strideS_f, strideS_m, strideD_f, strideD_m, othersize, middle, dimsize, dimS, mask); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxDim_3<<>>(y_, strideD_f, strideD_m, othersize, middle, dimsize, dimS, mask); + cnrtQueueSync(queue); } -void causal_softmax_bang_f16(Tensor y, void *stream) { - - ASSERT(y.layout->ndim >= 2); - ASSERT(y.layout->shape[y.layout->ndim - 1] >= y.layout->shape[y.layout->ndim - 2]); - int n = 1; - - int ndim = y.layout->ndim; - - int x_stride[ndim], y_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - if(i < ndim - 1){ - n *= shape[i]; - } - } - int d = shape[ndim - 1]; - int mask = shape[ndim - 1] - shape[ndim - 2]; - + +void causal_softmax_bang_f16(CausalSoftmaxBangDescriptor_t desc, void *workspace, void *y, void *stream) { + int n = desc->n; + int d = desc->shape[desc->ndim - 1]; + int mask = desc->shape[desc->ndim - 1] - desc->shape[desc->ndim - 2]; auto queue = reinterpret_cast(stream); - if(ndim == 2){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - - causal_softmaxUnionDim_2(queue, y.data, strideS_f, strideD_f, n, d, mask); + + if (desc->ndim == 2) { + int strideD_f = desc->stride[0]; + causal_softmaxUnionDim_2(queue, y, strideD_f, n, d, mask); + + } else if (desc->ndim == 3) { + int strideD_f = desc->stride[0]; + int strideD_m = desc->stride[1]; + int middle = desc->shape[1]; + causal_softmaxUnionDim_3(queue, y, strideD_f, strideD_m, n, middle, d, mask); + + } else { + int *mlu_strideY = reinterpret_cast(workspace); + int *mlu_shape = mlu_strideY + desc->ndim; + + CNRT_CHECK(cnrtMemcpy(mlu_strideY, desc->stride, desc->ndim * sizeof(int), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_shape, desc->shape, desc->ndim * sizeof(int), cnrtMemcpyHostToDev)); + + causal_softmax(queue, y, mlu_strideY, mlu_shape, n, d, mask, desc->ndim); } - - else if(ndim == 3){ - 
int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - - causal_softmaxUnionDim_3(queue, y.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, mask); +} + +infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; } - - else{ - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - causal_softmax_fp16(queue, y.data, mlu_strideX, mlu_strideY, mlu_shape, n, d, mask, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_bang_f16(desc, workspace, data, stream); + return STATUS_SUCCESS; } - -} + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc index 54443e9a..c1ef405d 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc @@ -1,36 +1,71 @@ #include "causal_softmax_cnnl.h" +#include "../../../devices/bang/bang_handle.h" #include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" #include "../../utils.h" -#include "cnrt.h" +#include "cnnl_extra.h" -CausalSoftmaxBangDescriptor::CausalSoftmaxBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void causal_softmax_cnnl_f16(Tensor t, void *stream) { - ASSERT(t.layout->ndim >= 2); - ASSERT(t.layout->shape[t.layout->ndim - 1] >= t.layout->shape[t.layout->ndim - 2]); - cnnlTensorDescriptor_t tDesc, maskDesc; - cnnlCreateTensorDescriptor(&maskDesc); - cnnlCreateTensorDescriptor(&tDesc); +infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxCnnlDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->shape[y->ndim - 1] < y->shape[y->ndim - 2]) { + return STATUS_BAD_TENSOR_SHAPE; + } - int ndim_ = std::max(int(t.layout->ndim), 4); + // cnnlMaskedSoftmax only support 4D or 5D tensors + int ndim_ = std::max(static_cast(y->ndim), 4); std::vector dims(ndim_, 1); - for (uint64_t i = 0; i < t.layout->ndim; i++) { - dims[ndim_ - 1 - i] = static_cast(t.layout->shape[t.layout->ndim - i - 1]); + for (uint64_t i = 0; i < y->ndim; i++) { + dims[ndim_ - 1 - i] = static_cast(y->shape[y->ndim - i - 1]); } - // 创建 mask - bool mask_matrix[dims[0]][dims[1]][dims[2]][dims[3]]; + cnnlTensorDescriptor_t yDesc, maskDesc; + cnnlCreateTensorDescriptor(&yDesc); + cnnlCreateTensorDescriptor(&maskDesc); + cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(y->dt), + dims.size(), dims.data()); + cnnlSetTensorDescriptor(maskDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_BOOL, + dims.size(), dims.data()); + + *desc_ptr = new CausalSoftmaxCnnlDescriptor{ + handle->device, + handle->device_id, + handle->cnnl_handles, + y->dt, + std::move(yDesc), + std::move(maskDesc), + std::move(dims)}; 
+ + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size) { + *size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_t desc) { + cnnlDestroyTensorDescriptor(desc->yDesc); + cnnlDestroyTensorDescriptor(desc->maskDesc); + delete desc; + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + bool mask_matrix[desc->dims[0]][desc->dims[1]][desc->dims[2]][desc->dims[3]]; // 填充上三角矩阵(右上角为 false) - for (int i = 0; i < dims[0]; ++i) { - for (int j = 0; j < dims[1]; ++j) { - for (int m = 0; m < dims[2]; ++m) { - for (int n = 0; n < dims[3]; ++n) { - if (n - m > dims[3] - dims[2]) { + for (int i = 0; i < desc->dims[0]; ++i) { + for (int j = 0; j < desc->dims[1]; ++j) { + for (int m = 0; m < desc->dims[2]; ++m) { + for (int n = 0; n < desc->dims[3]; ++n) { + if (n - m > desc->dims[3] - desc->dims[2]) { mask_matrix[i][j][m][n] = true; } else { mask_matrix[i][j][m][n] = false; @@ -39,24 +74,16 @@ void causal_softmax_cnnl_f16(Tensor t, void *stream) { } } } + size_t mask_size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; + cnrtMemcpyAsync(workspace, mask_matrix, mask_size, (cnrtQueue_t) stream, cnrtMemcpyHostToDev); - void *mask; - cnrtMalloc((void **) &mask, sizeof(bool) * dims[0] * dims[1] * dims[2] * dims[3]); - cnrtMemcpy(mask, mask_matrix, sizeof(bool) * dims[0] * dims[1] * dims[2] * dims[3], cnrtMemcpyHostToDev); - - // 不支持 stride - cnnlSetTensorDescriptor(tDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, - dims.size(), dims.data()); - cnnlSetTensorDescriptor(maskDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_BOOL, - dims.size(), dims.data()); - - use_cnnl((cnrtQueue_t) stream, + use_cnnl(desc->pool, desc->device_id, (cnrtQueue_t) stream, [&](cnnlHandle_t handle) { cnnlMaskedSoftmax(handle, CNNL_MASKED_SOFTMAX_MASKED_FILL, - -1, 1.0, tDesc, t.data, maskDesc, mask, - tDesc, t.data); + -1, 1.0, desc->yDesc, data, desc->maskDesc, workspace, + desc->yDesc, data); }); + cnrtQueueSync((cnrtQueue_t)stream); - cnnlDestroyTensorDescriptor(tDesc); - cnnlDestroyTensorDescriptor(maskDesc); + return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h index 5f0b2adc..feaf274e 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h @@ -1,15 +1,35 @@ #ifndef __CNNL_CAUSAL_SOFTMAX_H__ #define __CNNL_CAUSAL_SOFTMAX_H__ +#include "../../../devices/bang/bang_handle.h" #include "cnnl.h" -#include "cnnl_extra.h" #include "operators.h" +#include -struct CausalSoftmaxBangDescriptor { +struct CausalSoftmaxCnnlDescriptor { Device device; - CausalSoftmaxBangDescriptor(Device device); + int device_id; + std::shared_ptr> pool; + DT dtype; + cnnlTensorDescriptor_t yDesc; + cnnlTensorDescriptor_t maskDesc; + std::vector dims; }; -void causal_softmax_cnnl_f16(Tensor t, void *stream); +typedef struct CausalSoftmaxCnnlDescriptor *CausalSoftmaxCnnlDescriptor_t; -#endif// __CNNL_CAUSAL_SOFTMAX_H__ +infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxCnnlDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + 
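For orientation, here is a minimal host-side sketch of how the cnnl entry points in this file are meant to be chained on the Cambricon path: create the descriptor, query the workspace (which holds the boolean causal mask that `cnnlCausalSoftmax` uploads for `cnnlMaskedSoftmax`), run the in-place softmax, and destroy the descriptor. The handle, the F16 tensor descriptor `y_desc`, and the `cnrtQueue_t` are assumed to exist already, and the helper name `run_cnnl_causal_softmax` is illustrative, not part of the API.

```C
#include "causal_softmax_cnnl.h"
#include "cnrt.h"
#include <stdint.h>

/* Illustrative helper chaining the cnnl causal-softmax entry points. */
infiniopStatus_t run_cnnl_causal_softmax(BangHandle_t handle,
                                         infiniopTensorDescriptor_t y_desc,
                                         void *y_data,
                                         cnrtQueue_t queue) {
    CausalSoftmaxCnnlDescriptor_t desc;
    infiniopStatus_t status = cnnlCreateCausalSoftmaxDescriptor(handle, &desc, y_desc);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    cnnlGetCausalSoftmaxWorkspaceSize(desc, &workspace_size);

    void *workspace = NULL;
    cnrtMalloc(&workspace, workspace_size);   /* device buffer for the causal mask */

    /* In-place masked softmax over y_data; the queue is passed as the stream. */
    status = cnnlCausalSoftmax(desc, workspace, workspace_size, y_data, (void *) queue);

    cnrtFree(workspace);
    cnnlDestroyCausalSoftmaxDescriptor(desc);
    return status;
}
```

The hand-written BANG kernel path (`bangCausalSoftmax`) follows the same four-step contract; its workspace is instead used to stage the stride and shape arrays for the `ndim > 3` case.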
+infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc b/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc index 0650601e..ed2a2a82 100644 --- a/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc +++ b/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc @@ -3,21 +3,60 @@ #include "../../utils.h" #include -void causal_softmax_cpu_f16(Tensor y) { - uint64_t ndim = y.layout->ndim; - ASSERT(ndim == 2 || ndim == 3); - uint64_t total_seq_len = y.layout->shape[ndim - 1]; - uint64_t seq_len = y.layout->shape[ndim - 2]; +infiniopStatus_t cpuCreateCausalSoftmaxDescriptor(infiniopHandle_t, + CausalSoftmaxCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; uint64_t batch_size = 1; - uint64_t stride_j = y.layout->strides[ndim - 1] / 2; - uint64_t stride_i = y.layout->strides[ndim - 2] / 2; + uint64_t stride_j = y->strides[ndim - 1]; + uint64_t stride_i = y->strides[ndim - 2]; uint64_t stride_b = 0; if (ndim == 3) - stride_b = y.layout->strides[ndim - 3] / 2; + stride_b = y->strides[ndim - 3]; for (size_t i = 0; i < ndim - 2; i++) { - batch_size *= y.layout->shape[i]; + batch_size *= y->shape[i]; } - auto y_ptr = reinterpret_cast(y.data); + + *desc_ptr = new CausalSoftmaxCpuDescriptor{ + DevCpu, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyCausalSoftmaxDescriptor(CausalSoftmaxCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + + +void causal_softmax_cpu_f16(CausalSoftmaxCpuDescriptor_t desc, void* y) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_j = desc->stride_j; + uint64_t stride_i = desc->stride_i; + uint64_t stride_b = desc->stride_b; + auto y_ptr = reinterpret_cast(y); for (size_t b = 0; b < batch_size; b++) { for (size_t i = 0; i < seq_len; i++) { uint64_t offset = b * stride_b + i * stride_i; @@ -41,3 +80,16 @@ void causal_softmax_cpu_f16(Tensor y) { } } } + +infiniopStatus_t cpuCausalSoftmax(CausalSoftmaxCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if(dtype_eq(desc->dtype, F16)){ + causal_softmax_cpu_f16(desc, data); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/cpu/causal_softmax_cpu.h b/src/ops/causal_softmax/cpu/causal_softmax_cpu.h index e77a159f..e85bc598 100644 --- a/src/ops/causal_softmax/cpu/causal_softmax_cpu.h +++ b/src/ops/causal_softmax/cpu/causal_softmax_cpu.h @@ -2,10 +2,31 @@ #define __CPU_CAUSAL_SOFTMAX_H__ #include "operators.h" -typedef struct CausalSoftmaxCpuDescriptor { +struct CausalSoftmaxCpuDescriptor { Device device; -} CausalSoftmaxCpuDescriptor; + DT dtype; + uint64_t batch_size; + 
uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; +}; -void causal_softmax_cpu_f16(Tensor); +typedef struct CausalSoftmaxCpuDescriptor *CausalSoftmaxCpuDescriptor_t; + +infiniopStatus_t cpuCreateCausalSoftmaxDescriptor(infiniopHandle_t, + CausalSoftmaxCpuDescriptor_t *, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t cpuGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuCausalSoftmax(CausalSoftmaxCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cpuDestroyCausalSoftmaxDescriptor(CausalSoftmaxCpuDescriptor_t desc); #endif diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cc b/src/ops/causal_softmax/cuda/causal_softmax.cc new file mode 100644 index 00000000..c7f4d5ed --- /dev/null +++ b/src/ops/causal_softmax/cuda/causal_softmax.cc @@ -0,0 +1,55 @@ +#include "causal_softmax.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, + CausalSoftmaxCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + for (int i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxCudaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyCausalSoftmaxDescriptor(CausalSoftmaxCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cu b/src/ops/causal_softmax/cuda/causal_softmax.cu index dd65aef8..7f937edc 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cu +++ b/src/ops/causal_softmax/cuda/causal_softmax.cu @@ -16,6 +16,12 @@ struct AttentionCausualMask { } }; +struct MaxOp { + __device__ float operator()(const float a, const float b) const { + return a > b ? a: b; + } +}; + template static __device__ void block_padding( Tdata *__restrict__ att, @@ -33,7 +39,12 @@ static __device__ void block_padding( __shared__ float max; { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(thread_data, max_op, total_seq_len); +#else auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); +#endif if (threadIdx.x == 0) { max = acc; } } __syncthreads(); @@ -67,7 +78,12 @@ static __device__ void block_folding( thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) ? 
float(att[i]) : -__FLT_MAX__; +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + thread_max = max_op(thread_max, thread_data[i]); +#else thread_max = cub::Max()(thread_max, thread_data[i]); +#endif } using BlockOp = cub::BlockReduce; @@ -76,7 +92,12 @@ static __device__ void block_folding( __shared__ float max; { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(thread_max, max_op); +#else auto acc = block_op.Reduce(thread_max, cub::Max()); +#endif if (threadIdx.x == 0) { max = acc; } } __syncthreads(); @@ -130,7 +151,7 @@ static __forceinline__ __device__ void folding( } template -__global__ void fused_softmax_padding( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_padding( Tdata *__restrict__ att, unsigned int const stride_x, unsigned int const stride_y, @@ -140,7 +161,7 @@ __global__ void fused_softmax_padding( } template -__global__ void fused_softmax_folding( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_folding( Tdata *__restrict__ att, unsigned int const stride_x, unsigned int const stride_y, @@ -152,7 +173,7 @@ __global__ void fused_softmax_folding( } template -__global__ void fused_softmax_standard( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_standard( Tdata *__restrict__ att_, unsigned int const stride_x, unsigned int const stride_y, @@ -183,7 +204,12 @@ __global__ void fused_softmax_standard( __syncthreads(); // Block reduce max { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(partial, max_op); +#else auto acc = block_op.Reduce(partial, cub::Max()); +#endif if (threadIdx.x == 0) { max_ = acc; } } __syncthreads(); @@ -200,7 +226,11 @@ __global__ void fused_softmax_standard( // Block reduce sum { +#ifdef ENABLE_SUGON_DCU + auto acc = block_op.Sum(partial); +#else auto acc = block_op.Reduce(partial, cub::Sum()); +#endif if (threadIdx.x == 0) { sum_ = acc; } } __syncthreads(); @@ -218,31 +248,41 @@ __global__ void fused_softmax_standard( } -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor *desc, Tensor y, void *stream) { - // TODO: only support 2d or 3d tensor - ASSERT(y.layout->ndim == 2 || y.layout->ndim == 3); - uint64_t total_seq_len = y.layout->shape[y.layout->ndim - 1]; - uint64_t seq_len = y.layout->shape[y.layout->ndim - 2]; - uint64_t batch_size = 1; - uint64_t stride_x = 1; - uint64_t stride_y = y.layout->strides[y.layout->ndim - 2] / 2; - uint64_t stride_z = y.layout->strides[y.layout->ndim - 1] / 2; - ASSERT(stride_z == 1); // the last dimension should be contiguous - for (size_t i = 0; i < y.layout->ndim - 2; i++) { - batch_size *= y.layout->shape[i]; - stride_x *= y.layout->strides[i]; - } - stride_x /= 2; // covert byte strides to element strides +void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + dim3 grid(batch_size, seq_len); - auto max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + if (max_items_per_thread == 1) { fused_softmax_padding - <<>>((half *) (y.data), stride_x, stride_y, stride_z); + <<>>((half *) (y), stride_x, stride_y, stride_z); } else if (max_items_per_thread <= 16) { fused_softmax_folding - <<>>((half *) (y.data), 
stride_x, stride_y, stride_z, total_seq_len); + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); } else { fused_softmax_standard - <<>>((half *) (y.data), stride_x, stride_y, stride_z, total_seq_len); + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); } } + +infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_nv_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cuh b/src/ops/causal_softmax/cuda/causal_softmax.cuh index 0aafab57..30516bee 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cuh +++ b/src/ops/causal_softmax/cuda/causal_softmax.cuh @@ -1,12 +1,36 @@ -#ifndef __NV_CPU_CAUSAL_SOFTMAX_H__ -#define __NV_CPU_CAUSAL_SOFTMAX_H__ +#ifndef __CUDA_CAUSAL_SOFTMAX_H__ +#define __CUDA_CAUSAL_SOFTMAX_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" -typedef struct CausalSoftmaxCudaDescriptor { +struct CausalSoftmaxCudaDescriptor { Device device; -} CausalSoftmaxCudaDescriptor; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + unsigned int max_items_per_thread; +}; -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor *, Tensor, void *stream); +typedef struct CausalSoftmaxCudaDescriptor *CausalSoftmaxCudaDescriptor_t; + +infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, + CausalSoftmaxCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cudaDestroyCausalSoftmaxDescriptor(CausalSoftmaxCudaDescriptor_t desc); #endif diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.cc b/src/ops/causal_softmax/maca/causal_softmax_maca.cc new file mode 100644 index 00000000..5a3803e7 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.cc @@ -0,0 +1,55 @@ +#include "causal_softmax_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateCausalSoftmaxDescriptor(MacaHandle_t handle, + CausalSoftmaxMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + for (int i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxMacaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + 
stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.h b/src/ops/causal_softmax/maca/causal_softmax_maca.h new file mode 100644 index 00000000..daa198b7 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.h @@ -0,0 +1,36 @@ +#ifndef __MACA_CAUSAL_SOFTMAX_H__ +#define __MACA_CAUSAL_SOFTMAX_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct CausalSoftmaxMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + unsigned int max_items_per_thread; +}; + +typedef struct CausalSoftmaxMacaDescriptor *CausalSoftmaxMacaDescriptor_t; + +infiniopStatus_t macaCreateCausalSoftmaxDescriptor(MacaHandle_t handle, + CausalSoftmaxMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t macaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaCausalSoftmax(CausalSoftmaxMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t macaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMacaDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.maca b/src/ops/causal_softmax/maca/causal_softmax_maca.maca new file mode 100644 index 00000000..94b884e8 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.maca @@ -0,0 +1,259 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "causal_softmax_maca.h" +#include + +struct AttentionCausualMask { + __forceinline__ __device__ bool + operator()(int tok_id, int seq_len, + int pos_id, int total_seq_len) { + // tok_id ↓ |<-total_seq_len->| + // 0 | * * * ... * | + // 1 | * * * ... * * | + // 2 | * * * ... * * * | + // seq_len: 3 pos_id-> + return total_seq_len + tok_id >= pos_id + seq_len; + } +}; + +template +static __device__ void block_padding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len) { + auto att_idx = threadIdx.x; + auto total_seq_len = blockDim.x; + auto thread_data = mask(token_idx, seq_len, att_idx, total_seq_len) + ? 
float(att[att_idx]) + : -__FLT_MAX__; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + auto acc = block_op.Sum(thread_data = expf(thread_data - max), total_seq_len); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + att[att_idx] = Tdata(thread_data * mean); +} + +template +static __device__ void block_folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len, + unsigned int const total_seq_len) { + + auto local = (total_seq_len + blockDim.x - 1) / blockDim.x; + + auto thread_offset = threadIdx.x * local; + att += thread_offset; + + float thread_data[ITEMS_PER_THREAD], thread_max = -__FLT_MAX__, thread_sum = 0; + for (unsigned int i = 0; i < local; ++i) { + auto att_idx = thread_offset + i; + thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) + ? float(att[i]) + : -__FLT_MAX__; + thread_max = cub::Max()(thread_max, thread_data[i]); + } + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_max, cub::Max()); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + for (unsigned int i = 0; i < local; ++i) { + thread_data[i] = expf(thread_data[i] - max); + thread_sum += thread_data[i]; + } + auto acc = block_op.Sum(thread_sum); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + for (unsigned int i = 0; i < local; ++i) { + if (auto att_idx = thread_offset + i; att_idx < total_seq_len) { + att[i] = Tdata(thread_data[i] * mean); + } + } +} + +// assert BLOCK_SIZE >= blockDim.x +template +static __forceinline__ __device__ void padding( + Tdata *__restrict__ att, + Tmask mask, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_padding( + att + offset, mask, token_idx, seq_len); +} + +template +static __forceinline__ __device__ void folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const total_seq_len, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_folding( + att + offset, mask, token_idx, seq_len, total_seq_len); +} + +template +__global__ void fused_softmax_padding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z) { + + padding(att, AttentionCausualMask(), stride_x, stride_y, stride_z); +} + +template +__global__ void fused_softmax_folding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + folding(att, AttentionCausualMask(), total_seq_len, stride_x, stride_y, stride_z); + } +} + +template +__global__ void fused_softmax_standard( + Tdata *__restrict__ att_, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + auto offset = blockIdx.x * 
stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + + auto att = att_ + offset; + auto att_idx = threadIdx.x; + + float partial; + __shared__ float max_; + __shared__ float sum_; + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + // Partial max + partial = -__FLT_MAX__; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + partial = max(partial, float(att[i])); + } + } + __syncthreads(); + // Block reduce max + { + auto acc = block_op.Reduce(partial, cub::Max()); + if (threadIdx.x == 0) { max_ = acc; } + } + __syncthreads(); + + // Partial sum + partial = 0.; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + partial += e; + } + } + __syncthreads(); + + // Block reduce sum + { + auto acc = block_op.Reduce(partial, cub::Sum()); + if (threadIdx.x == 0) { sum_ = acc; } + } + __syncthreads(); + + // Softmax + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + att[i] = e / sum_; + } else { + att[i] = half(0); + } + } + } +} + + +void causal_softmax_nv_gpu_f16(CausalSoftmaxMacaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + + dim3 grid(batch_size, seq_len); + + if (max_items_per_thread == 1) { + fused_softmax_padding + <<>>((half *) (y), stride_x, stride_y, stride_z); + } else if (max_items_per_thread <= 16) { + fused_softmax_folding + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } else { + fused_softmax_standard + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } +} + +infiniopStatus_t macaCausalSoftmax(CausalSoftmaxMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_nv_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.cc b/src/ops/causal_softmax/musa/causal_softmax_musa.cc new file mode 100644 index 00000000..6ff55d65 --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.cc @@ -0,0 +1,55 @@ +#include "causal_softmax_musa.h" +#include "../../utils.h" +#include "../../../devices/musa/common_musa.h" + +infiniopStatus_t musaCreateCausalSoftmaxDescriptor(MusaHandle_t handle, + CausalSoftmaxMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return 
STATUS_BAD_TENSOR_STRIDES; + } + for (uint64_t i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxMusaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.h b/src/ops/causal_softmax/musa/causal_softmax_musa.h new file mode 100644 index 00000000..c6f81afc --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.h @@ -0,0 +1,35 @@ +#ifndef __MUSA_CAUSAL_SOFTMAX_H__ +#define __MUSA_CAUSAL_SOFTMAX_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct CausalSoftmaxMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + uint64_t max_items_per_thread; +}; + +typedef struct CausalSoftmaxMusaDescriptor *CausalSoftmaxMusaDescriptor_t; + +infiniopStatus_t musaCreateCausalSoftmaxDescriptor(MusaHandle_t handle, + CausalSoftmaxMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t musaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaCausalSoftmax(CausalSoftmaxMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t musaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMusaDescriptor_t desc); +#endif diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.mu b/src/ops/causal_softmax/musa/causal_softmax_musa.mu new file mode 100644 index 00000000..5eb5c8d9 --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.mu @@ -0,0 +1,262 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "causal_softmax_musa.h" +#include + +struct AttentionCausualMask { + __forceinline__ __device__ bool + operator()(int tok_id, int seq_len, + int pos_id, int total_seq_len) { + // tok_id ↓ |<-total_seq_len->| + // 0 | * * * ... * | + // 1 | * * * ... * * | + // 2 | * * * ... * * * | + // seq_len: 3 pos_id-> + return total_seq_len + tok_id >= pos_id + seq_len; + } +}; + +template +static __device__ void block_padding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len) { + auto att_idx = threadIdx.x, total_seq_len = blockDim.x; + auto thread_data = mask(token_idx, seq_len, att_idx, total_seq_len) + ? 
float(att[att_idx]) + : -__FLT_MAX__; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + auto acc = block_op.Sum(thread_data = expf(thread_data - max), total_seq_len); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + att[att_idx] = Tdata(thread_data * mean); +} + +template +static __device__ void block_folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len, + unsigned int const total_seq_len) { + + auto local = (total_seq_len + blockDim.x - 1) / blockDim.x; + + auto thread_offset = threadIdx.x * local; + att += thread_offset; + + float thread_data[ITEMS_PER_THREAD], thread_max = -__FLT_MAX__, thread_sum = 0; + for (unsigned int i = 0; i < local; ++i) { + auto att_idx = thread_offset + i; + thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) + ? float(att[i]) + : -__FLT_MAX__; + thread_max = cub::Max()(thread_max, thread_data[i]); + } + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_max, cub::Max()); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + for (unsigned int i = 0; i < local; ++i) { + thread_data[i] = expf(thread_data[i] - max); + thread_sum += thread_data[i]; + } + auto acc = block_op.Sum(thread_sum); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + for (unsigned int i = 0; i < local; ++i) { + if (auto att_idx = thread_offset + i; att_idx < total_seq_len) { + att[i] = Tdata(thread_data[i] * mean); + } + } +} + +// assert BLOCK_SIZE >= blockDim.x +template +static __forceinline__ __device__ void padding( + Tdata *__restrict__ att, + Tmask mask, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_padding( + att + offset, mask, token_idx, seq_len); +} + +template +static __forceinline__ __device__ void folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const total_seq_len, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_folding( + att + offset, mask, token_idx, seq_len, total_seq_len); +} + +template +__global__ void fused_softmax_padding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z) { + + padding(att, AttentionCausualMask(), stride_x, stride_y, stride_z); +} + +template +__global__ void fused_softmax_folding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + folding(att, AttentionCausualMask(), total_seq_len, stride_x, stride_y, stride_z); + } +} + +template +__global__ void fused_softmax_standard( + Tdata *__restrict__ att_, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + auto offset = blockIdx.x * 
stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + + auto att = att_ + offset; + auto att_idx = threadIdx.x; + + float partial; + __shared__ float max_; + __shared__ float sum_; + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + // Partial max + partial = -__FLT_MAX__; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + partial = max(partial, float(att[i])); + } + } + __syncthreads(); + // Block reduce max + { + auto acc = block_op.Reduce(partial, cub::Max()); + if (threadIdx.x == 0) { max_ = acc; } + } + __syncthreads(); + + // Partial sum + partial = 0.; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + partial += e; + } + } + __syncthreads(); + + // Block reduce sum + { + auto acc = block_op.Reduce(partial, cub::Sum()); + if (threadIdx.x == 0) { sum_ = acc; } + } + __syncthreads(); + + // Softmax + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + att[i] = e / sum_; + } else { + att[i] = half(0); + } + } + } +} + + +void causal_softmax_mt_gpu_f16(CausalSoftmaxMusaDescriptor_t desc, void* y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + + dim3 grid(batch_size, seq_len); + + if (max_items_per_thread == 1) { + fused_softmax_padding + <<>>((half *) (y), stride_x, stride_y, stride_z); + } else if (max_items_per_thread <= 16) { + fused_softmax_folding + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } else { + fused_softmax_standard + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } +} + +infiniopStatus_t musaCausalSoftmax(CausalSoftmaxMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_mt_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/operator.cc b/src/ops/causal_softmax/operator.cc index 3b1f6b97..92498dca 100644 --- a/src/ops/causal_softmax/operator.cc +++ b/src/ops/causal_softmax/operator.cc @@ -1,4 +1,5 @@ #include "../utils.h" +#include "operators.h" #include "ops/causal_softmax/causal_softmax.h" #ifdef ENABLE_CPU @@ -7,81 +8,171 @@ #ifdef ENABLE_NV_GPU #include "../../devices/cuda/common_cuda.h" #include "cuda/causal_softmax.cuh" +#include "../../devices/cuda/cuda_handle.h" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/causal_softmax_cnnl.h" +#include "../../devices/bang/bang_handle.h" #include "bang/causal_softmax_bang.h" +#include "bang/causal_softmax_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/causal_softmax_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/causal_softmax_maca.h" +#endif 
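The dispatcher below routes each public causal-softmax call to the backend selected when the handle was created. As a hedged caller-side sketch of the sequence these entry points support: the tensor descriptor, device buffers, and stream are assumed to be prepared elsewhere, and both the helper name and the use of `STATUS_BAD_PARAM` for an undersized workspace are illustrative only.

```C
#include "ops/causal_softmax/causal_softmax.h"
#include <stdint.h>

/* Illustrative caller-side flow: create the operator descriptor, query the
 * workspace requirement, run the in-place causal softmax on the device the
 * handle was created for, then destroy the descriptor. */
infiniopStatus_t causal_softmax_example(infiniopHandle_t handle,
                                        infiniopTensorDescriptor_t y_desc,
                                        void *y_data,
                                        void *workspace,
                                        uint64_t workspace_capacity,
                                        void *stream) {
    infiniopCausalSoftmaxDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateCausalSoftmaxDescriptor(handle, &desc, y_desc);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size);
    if (workspace_size > workspace_capacity) {
        /* Caller-provided device buffer is too small; the status code here is illustrative. */
        infiniopDestroyCausalSoftmaxDescriptor(desc);
        return STATUS_BAD_PARAM;
    }

    status = infiniopCausalSoftmax(desc, workspace, workspace_size, y_data, stream);

    infiniopDestroyCausalSoftmaxDescriptor(desc);
    return status;
}
```

Keeping the `switch` on the descriptor's device in this single translation unit means each backend only has to expose these four functions behind its own `ENABLE_*` guard.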
+#ifdef ENABLE_MTHREADS_GPU +#include "musa/causal_softmax_musa.h" +#include "../../devices/musa/common_musa.h" #endif -struct CausalSoftmaxDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor( + infiniopHandle_t handle, + infiniopCausalSoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc); + // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C CausalSoftmaxDescriptor *createCausalSoftmaxDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxCpuDescriptor{device}); + return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: { - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxCudaDescriptor{device}); + return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size); } #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxBangDescriptor(device)); + return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size); + // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyCausalSoftmaxDescriptor(CausalSoftmaxDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (CausalSoftmaxCpuDescriptor *) (descriptor); - break; + return 
cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (CausalSoftmaxCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (CausalSoftmaxBangDescriptor *) (descriptor); - break; + return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream); + // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void causalSoftmax(CausalSoftmaxDescriptor *descriptor, Tensor y, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - causal_softmax_cpu_f16(y); - break; + return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - causal_softmax_nv_gpu_f16((CausalSoftmaxCudaDescriptor *) descriptor, y, stream); - break; + case DevNvGpu: { + return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // causal_softmax_bang_f16(y, y, stream); - causal_softmax_cnnl_f16(y, stream); - break; + case DevCambriconMlu: { + return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc); + // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc); #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/conv/cpu/conv_cpu.cc b/src/ops/conv/cpu/conv_cpu.cc new file mode 100644 index 00000000..2646c482 --- /dev/null +++ b/src/ops/conv/cpu/conv_cpu.cc @@ -0,0 +1,242 @@ +#include "conv_cpu.h" +#include "../../utils.h" + +// get the total number of elements in arr +inline uint64_t getTotalSize(const uint64_t *arr, uint64_t ndim) { + return std::accumulate(arr, arr + ndim, 1ULL, std::multiplies()); +} + +// check if padding is needed +inline bool requirePadding(uint64_t const *pads, uint64_t ndim) { + return std::any_of(pads, pads + ndim - 2, + [](uint64_t pad) { return pad > 0; }); +} + +infiniopStatus_t cpuCreateConvDescriptor(infiniopHandle_t, + ConvCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + 
infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt || y->dt != w->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_size = getTotalSize(y->shape, ndim); + const auto pads_ = reinterpret_cast(pads); + uint64_t padded_x_size = requirePadding(pads_, ndim) ? getPaddedSize(ndim, x->shape, pads_) : 0; + uint64_t *x_shape = new uint64_t[ndim]; + uint64_t *w_shape = new uint64_t[ndim]; + uint64_t *y_shape = new uint64_t[ndim]; + uint64_t *pad_ = new uint64_t[n]; + int64_t *strides_ = new int64_t[n]; + uint64_t *dilations_ = new uint64_t[n]; + memcpy(x_shape, x->shape, ndim * sizeof(uint64_t)); + memcpy(w_shape, w->shape, ndim * sizeof(uint64_t)); + memcpy(y_shape, y->shape, ndim * sizeof(uint64_t)); + for (size_t i = 0; i < n; ++i) { + pad_[i] = pads_[i]; + strides_[i] = reinterpret_cast(strides)[i]; + dilations_[i] = reinterpret_cast(dilations)[i]; + } + + *desc_ptr = new ConvCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_size, + padded_x_size, + x_shape, + w_shape, + y_shape, + pad_, + strides_, + dilations_, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetConvWorkspaceSize(ConvCpuDescriptor_t desc, uint64_t *size) { + *size = desc->padded_x_size * desc->dtype.size; + if (desc->dtype == F16) { + *size += desc->y_size * sizeof(float); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyConvDescriptor(ConvCpuDescriptor_t desc) { + delete[] desc->x_shape; + delete[] desc->w_shape; + delete[] desc->y_shape; + delete[] desc->pads; + delete[] desc->strides; + delete[] desc->dilations; + delete desc; + return STATUS_SUCCESS; +} + +// initialize the padded input with the data from the original input +template +void fillPaddedInput(ConvCpuDescriptor_t desc, uint64_t const *padded_x_shape, + Tdata *padded_x, Tdata const *x, + uint64_t const *pads, uint64_t x_index, + uint64_t padded_x_index, uint64_t ndim) { + const auto x_shape = desc->x_shape[ndim]; + const auto padded_x_shape_ = padded_x_shape[ndim]; + const auto x_base_index = x_index * x_shape; + const auto padded_x_base_index = padded_x_index * padded_x_shape_ + + (x_shape == padded_x_shape_ ? 
0 : pads[ndim - 2]); + + for (size_t i = 0; i < x_shape; ++i) { + // base case (last dimension) + if (ndim == desc->ndim - 1) { + padded_x[padded_x_base_index + i] = x[x_base_index + i]; + } + // recursive case + else { + fillPaddedInput(desc, padded_x_shape, padded_x, x, pads, x_base_index + i, + padded_x_base_index + i, ndim + 1); + } + } +} + +// Recursive convolution function +template +void _applyConv(ConvCpuDescriptor_t desc, Ydata *y, Xdata const *x, + Xdata const *w, uint64_t const *x_shape, + uint64_t x_index, uint64_t w_index, uint64_t y_index, + uint64_t ndim) { + const auto dim_size = x_shape[ndim]; + const auto kernel_size = desc->w_shape[ndim]; + const auto dilation = desc->dilations[ndim - 2]; + const auto stride = desc->strides[ndim - 2]; + const auto steps = + (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1; + x_index *= dim_size; + w_index *= kernel_size; + y_index *= desc->y_shape[ndim]; + + // perform all the convolutions along this axis + for (size_t i = 0; i < steps; ++i, ++y_index) { + // perform a single convolution + for (size_t k = 0; k < kernel_size; ++k) { + // calculate the current indices + const auto curr_x_index = x_index + i * stride + k * dilation; + const auto curr_w_index = w_index + k; + + // base case (last dimension) + if (ndim == desc->ndim - 1) { + if (desc->dtype == F16) { + y[y_index] += f16_to_f32(x[curr_x_index]) * f16_to_f32(w[curr_w_index]); + } else { + y[y_index] += x[curr_x_index] * w[curr_w_index]; + } + } + // recursive case + else { + _applyConv(desc, y, x, w, x_shape, curr_x_index, curr_w_index, + y_index, ndim + 1); + } + } + } +} + +template +void applyConv(ConvCpuDescriptor_t desc, Ydata *y, Xdata const *x, + Xdata const *w, uint64_t const *x_shape) { + const auto y_num_channel_elements = + getTotalSize(desc->y_shape + 2, desc->ndim - 2); + +#pragma omp parallel for collapse(2) + // batch + for (size_t i = 0; i < x_shape[0]; ++i) { + + // output channel + for (size_t j = 0; j < desc->w_shape[0]; ++j) { + uint64_t y_index = i * desc->y_shape[1] + j; + + // input channel + for (size_t k = 0; k < x_shape[1]; ++k) { + uint64_t x_index = i * x_shape[1] + k; + uint64_t w_index = j * desc->w_shape[1] + k; + _applyConv(desc, y, x, w, x_shape, x_index, w_index, y_index, 2); + } + } + } +} + +template +void _conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + Ydata *y, Xdata const *x, Xdata const *w) { + if (desc->padded_x_size > 0) { + auto padded_x = reinterpret_cast(workspace); + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_x, padded_x + desc->padded_x_size, 0); + getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); + fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); + applyConv(desc, y, padded_x, w, padded_shape); + } else { + applyConv(desc, y, x, w, desc->x_shape); + } +} + +// Convolution function +template +infiniopStatus_t conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w) { + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto w_ = reinterpret_cast(w); + std::fill(y_, y_ + desc->y_size, 0); + _conv_cpu(desc, workspace, workspace_size, y_, x_, w_); + return STATUS_SUCCESS; +} + +// sepcial case for fp16 (uint16_t) +template<> +infiniopStatus_t conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w) { + auto y_ = reinterpret_cast(workspace); + auto x_ = 
reinterpret_cast(x); + auto w_ = reinterpret_cast(w); + std::fill(y_, y_ + desc->y_size, 0); + + _conv_cpu(desc, y_ + desc->y_size, workspace_size, y_, x_, w_); + + // copy data from y_ to y + auto y_16 = reinterpret_cast(y); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y_16[i] = f32_to_f16(y_[i]); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuConv(ConvCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (desc->dtype == F16) { + return conv_cpu(desc, workspace, workspace_size, y, x, w); + } + if (desc->dtype == F32) { + return conv_cpu(desc, workspace, workspace_size, y, x, w); + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/conv/cpu/conv_cpu.h b/src/ops/conv/cpu/conv_cpu.h new file mode 100644 index 00000000..48a91990 --- /dev/null +++ b/src/ops/conv/cpu/conv_cpu.h @@ -0,0 +1,45 @@ +#ifndef __CPU_CONV_H__ +#define __CPU_CONV_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include + +struct ConvCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t y_size; + uint64_t padded_x_size; + uint64_t const *x_shape; + uint64_t const *w_shape; + uint64_t const *y_shape; + uint64_t const *pads; + int64_t const *strides; + uint64_t const *dilations; +}; + +typedef struct ConvCpuDescriptor *ConvCpuDescriptor_t; + +infiniopStatus_t cpuCreateConvDescriptor(infiniopHandle_t, + ConvCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n); + +infiniopStatus_t cpuGetConvWorkspaceSize(ConvCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuConv(ConvCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cpuDestroyConvDescriptor(ConvCpuDescriptor_t desc); + +#endif diff --git a/src/ops/conv/cuda/conv.cc b/src/ops/conv/cuda/conv.cc new file mode 100644 index 00000000..2ccabfda --- /dev/null +++ b/src/ops/conv/cuda/conv.cc @@ -0,0 +1,163 @@ +#include "conv.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, + ConvCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt || y->dt != w->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); + // convert pads, strides, dilations into int32[] + int32_t *pad = new int32_t[new_ndim]; + int32_t *stride = new int32_t[new_ndim]; + int32_t *dilation = new int32_t[new_ndim]; + int32_t *x_shape = new int32_t[new_ndim]; + int32_t *w_shape = new int32_t[new_ndim]; + int32_t *y_shape = new int32_t[new_ndim]; + auto pads_ = reinterpret_cast(pads); + auto strides_ = reinterpret_cast(strides); + auto dilations_ = reinterpret_cast(dilations); + for (size_t i = 0; i < new_ndim; ++i) { + pad[i] = i < ndim - 2 ? 
static_cast(pads_[i]) : 0; + stride[i] = i < ndim - 2 ? static_cast(strides_[i]) : 1; + dilation[i] = i < ndim - 2 ? static_cast(dilations_[i]) : 1; + x_shape[i] = i < ndim ? static_cast(x->shape[i]) : 1; + w_shape[i] = i < ndim ? static_cast(w->shape[i]) : 1; + y_shape[i] = i < ndim ? static_cast(y->shape[i]) : 1; + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + cudnnDataType_t conv_op_dt = [&] { + switch (tensor_dt) { + case CUDNN_DATA_HALF: + if (ndim >= 5) { + return CUDNN_DATA_FLOAT; + } + if (handle->compute_capability_major > 5 || (handle->compute_capability_major == 5 && handle->compute_capability_minor >= 3)) { + return CUDNN_DATA_HALF; + } + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_BFLOAT16: + case CUDNN_DATA_FLOAT: + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_DOUBLE: + return CUDNN_DATA_DOUBLE; + default: + return CUDNN_DATA_INT32; + } + }(); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptorEx(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, x_shape)); + + // create and set tensor descriptors for w + cudnnFilterDescriptor_t w_desc; + checkCudnnError(cudnnCreateFilterDescriptor(&w_desc)); + checkCudnnError(cudnnSetFilterNdDescriptor(w_desc, static_cast(tensor_dt), CUDNN_TENSOR_NCHW, new_ndim, w_shape)); + + + // create and set conv operator descriptor + cudnnConvolutionDescriptor_t op_desc; + checkCudnnError(cudnnCreateConvolutionDescriptor(&op_desc)); + checkCudnnError(cudnnSetConvolutionNdDescriptor( + op_desc, new_ndim - 2, pad, stride, dilation, CUDNN_CROSS_CORRELATION, + conv_op_dt)); + + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + std::vector outDim_(new_ndim); + auto outDim = outDim_.data(); + checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(op_desc, x_desc, w_desc, new_ndim, outDim)); + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, y_shape)); + + // tuning: get the best algorithm + int requestedAlgoCount = 1; + checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &requestedAlgoCount); })); + int algoCounts = 0; + int chosenAlgoIndex = 0; + bool chosen = false; + size_t workspace_size = 0; + std::vector perf_results_(requestedAlgoCount); + auto perf_results = perf_results_.data(); + checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc, op_desc, y_desc, requestedAlgoCount, &algoCounts, perf_results); })); + if (algoCounts < 1) { + return STATUS_EXECUTION_FAILED; + } + for (int i = 0; i < algoCounts; ++i) { + if (use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, op_desc, y_desc, perf_results[i].algo, &workspace_size); }) == CUDNN_STATUS_SUCCESS) { + chosenAlgoIndex = i; + chosen = true; + break; + } + } + if (!chosen) { + return STATUS_EXECUTION_FAILED; + } + + const float alpha = 1.0f; + const float beta = 0.0f; + + *desc_ptr = new ConvCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + 
x_desc, + w_desc, + y_desc, + op_desc, + perf_results[chosenAlgoIndex].algo, + alpha, + beta, + workspace_size}; + + delete[] pad; + delete[] stride; + delete[] dilation; + delete[] x_shape; + delete[] w_shape; + delete[] y_shape; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetConvWorkspaceSize(ConvCudaDescriptor_t desc, uint64_t *size) { + *size = desc->workspace_size; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyConvDescriptor(ConvCudaDescriptor_t desc) { + checkCudnnError(cudnnDestroyConvolutionDescriptor(desc->op_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyFilterDescriptor(desc->w_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/conv/cuda/conv.cu b/src/ops/conv/cuda/conv.cu new file mode 100644 index 00000000..3f15843b --- /dev/null +++ b/src/ops/conv/cuda/conv.cu @@ -0,0 +1,23 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "conv.cuh" + +infiniopStatus_t conv_nv_gpu(ConvCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnConvolutionForward(handle, &desc->alpha, + desc->x_desc, x, desc->w_desc, w, desc->op_desc, desc->algo, workspace, workspace_size, + &desc->beta, desc->y_desc, y); })); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaConv(ConvCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (desc->dtype == F16 || desc->dtype == F32) { + return conv_nv_gpu(desc, workspace, workspace_size, y, x, w, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/conv/cuda/conv.cuh b/src/ops/conv/cuda/conv.cuh new file mode 100644 index 00000000..36f22e90 --- /dev/null +++ b/src/ops/conv/cuda/conv.cuh @@ -0,0 +1,45 @@ +#ifndef __CUDA_CONV_H__ +#define __CUDA_CONV_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +struct ConvCudaDescriptor { + Device device; + DT dtype; + int device_id; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnFilterDescriptor_t const w_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnConvolutionDescriptor_t const op_desc; + cudnnConvolutionFwdAlgo_t algo; + const float alpha; + const float beta; + uint64_t workspace_size; +}; + +typedef struct ConvCudaDescriptor *ConvCudaDescriptor_t; + +infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t, + ConvCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n); + +infiniopStatus_t cudaGetConvWorkspaceSize(ConvCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaConv(ConvCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cudaDestroyConvDescriptor(ConvCudaDescriptor_t desc); + +#endif diff --git a/src/ops/conv/operator.cc b/src/ops/conv/operator.cc new file mode 100644 index 00000000..306527e5 --- /dev/null +++ b/src/ops/conv/operator.cc @@ -0,0 +1,96 @@ +#include "../utils.h" 
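The operator.cc dispatch layer that follows exposes the conv entry points (create descriptor, query workspace, compute, destroy) and routes them to the CPU or CUDA backend. Below is a minimal caller-side sketch of the CPU path; it is not part of this patch. It assumes the handle and tensor-descriptor helpers from the public headers (`infiniopCreateHandle`, `infiniopCreateTensorDescriptor`, `infiniopDestroyHandle`) and that a null strides pointer yields a contiguous layout; shapes and values are illustrative only.

```C++
#include <cstdint>
#include <cstdlib>
#include <vector>

// Illustrative only: drive the exported conv interface on the CPU backend.
void conv_cpu_usage_sketch() {
    // Handle for the CPU backend, device id 0.
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle, DevCpu, 0);

    // NCHW tensors: x = [1, 3, 8, 8], w = [4, 3, 3, 3] -> y = [1, 4, 8, 8]
    // with pads = 1, strides = 1, dilations = 1 on both spatial dims.
    uint64_t x_shape[4] = {1, 3, 8, 8}, w_shape[4] = {4, 3, 3, 3}, y_shape[4] = {1, 4, 8, 8};
    infiniopTensorDescriptor_t x_desc, w_desc, y_desc;
    infiniopCreateTensorDescriptor(&x_desc, F32, 4, x_shape, nullptr);// null strides: assumed contiguous
    infiniopCreateTensorDescriptor(&w_desc, F32, 4, w_shape, nullptr);
    infiniopCreateTensorDescriptor(&y_desc, F32, 4, y_shape, nullptr);

    uint64_t pads[2] = {1, 1}, dilations[2] = {1, 1};
    int64_t strides[2] = {1, 1};

    // Create the conv descriptor, query the workspace, run, then tear down.
    infiniopConvDescriptor_t conv_desc;
    infiniopCreateConvDescriptor(handle, &conv_desc, y_desc, x_desc, w_desc,
                                 pads, strides, dilations, 2);
    uint64_t workspace_size = 0;
    infiniopGetConvWorkspaceSize(conv_desc, &workspace_size);
    void *workspace = workspace_size ? std::malloc(workspace_size) : nullptr;

    std::vector<float> x(1 * 3 * 8 * 8, 1.0f), w(4 * 3 * 3 * 3, 1.0f), y(1 * 4 * 8 * 8);
    infiniopConv(conv_desc, workspace, workspace_size, y.data(), x.data(), w.data(), /*stream=*/nullptr);

    std::free(workspace);
    infiniopDestroyConvDescriptor(conv_desc);
    infiniopDestroyTensorDescriptor(x_desc);
    infiniopDestroyTensorDescriptor(w_desc);
    infiniopDestroyTensorDescriptor(y_desc);
    infiniopDestroyHandle(handle);
}
```

On GPU backends the same sequence applies, with device buffers and a real compute stream in place of the null pointer.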
+#include "operators.h" +#include "ops/conv/conv.h" + +#ifdef ENABLE_CPU +#include "cpu/conv_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/conv.cuh" +#endif + +__C infiniopStatus_t infiniopCreateConvDescriptor( + infiniopHandle_t handle, + infiniopConvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void *pads, + void *strides, + void *dilations, + uint64_t n) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateConvDescriptor(handle, (ConvCpuDescriptor_t *) desc_ptr, y, x, w, pads, strides, dilations, n); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateConvDescriptor((CudaHandle_t) handle, (ConvCudaDescriptor_t *) desc_ptr, y, x, w, pads, strides, dilations, n); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetConvWorkspaceSize((ConvCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetConvWorkspaceSize((ConvCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuConv((ConvCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaConv((ConvCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyConvDescriptor((ConvCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyConvDescriptor((ConvCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/expand/cpu/expand_cpu.cc b/src/ops/expand/cpu/expand_cpu.cc new file mode 100644 index 00000000..d3bcb866 --- /dev/null +++ b/src/ops/expand/cpu/expand_cpu.cc @@ -0,0 +1,69 @@ +#include "expand_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateExpandDescriptor(infiniopHandle_t, + ExpandCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + int64_t *y_strides = new int64_t[ndim]; +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 
0 : x->strides[i + x->ndim - ndim]; + } + memcpy(y_strides, y->strides, ndim * sizeof(int64_t)); + + *desc_ptr = new ExpandCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_data_size, + x_strides, + y_strides, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyExpandDescriptor(ExpandCpuDescriptor_t desc) { + delete[] desc->x_strides; + delete[] desc->y_strides; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t expand_cpu(ExpandCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->y_data_size; ++i) { + y_[i] = x_[getDstOffset(i, desc->ndim, desc->y_strides, desc->x_strides)]; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuExpand(ExpandCpuDescriptor_t desc, + void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return expand_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return expand_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/cpu/expand_cpu.h b/src/ops/expand/cpu/expand_cpu.h new file mode 100644 index 00000000..868fefe8 --- /dev/null +++ b/src/ops/expand/cpu/expand_cpu.h @@ -0,0 +1,29 @@ +#ifndef __CPU_EXPAND_H__ +#define __CPU_EXPAND_H__ + +#include "operators.h" +#include +#include + +struct ExpandCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t y_data_size; + int64_t const *x_strides; + int64_t const *y_strides; +}; + +typedef struct ExpandCpuDescriptor *ExpandCpuDescriptor_t; + +infiniopStatus_t cpuCreateExpandDescriptor(infiniopHandle_t, + ExpandCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuExpand(ExpandCpuDescriptor_t desc, + void *y, void const *x, void *stream); + +infiniopStatus_t cpuDestroyExpandDescriptor(ExpandCpuDescriptor_t desc); + +#endif diff --git a/src/ops/expand/cuda/expand.cc b/src/ops/expand/cuda/expand.cc new file mode 100644 index 00000000..d0467c01 --- /dev/null +++ b/src/ops/expand/cuda/expand.cc @@ -0,0 +1,51 @@ +#include "expand.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t handle, + ExpandCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 
0 : x->strides[i + x->ndim - ndim]; + } + + int64_t *x_strides_d, *y_strides_d; + char *strides_and_shape_d; + checkCudaErrorWithCode(cudaMalloc((void **) &strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new ExpandCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + y_data_size, + static_cast(handle->prop.maxGridSize[0]), + strides_and_shape_d, + }; + + delete[] x_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyExpandDescriptor(ExpandCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->strides_and_shape_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/expand/cuda/expand.cu b/src/ops/expand/cuda/expand.cu new file mode 100644 index 00000000..6d75e651 --- /dev/null +++ b/src/ops/expand/cuda/expand.cu @@ -0,0 +1,58 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "expand.cuh" + +template +__global__ void expand( + Tdata *y, + const Tdata *x, + const int64_t *y_strides, + const int64_t *x_strides, + const uint64_t *y_shape, + uint64_t y_data_size, + uint64_t ndim, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < y_data_size) { + uint64_t y_idx = getOffset(idx, ndim, y_shape, y_strides); + y[y_idx] = x[getDstOffset(y_idx, ndim, y_strides, x_strides)]; + } +} + +template +infiniopStatus_t expand_nv_gpu(ExpandCudaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->y_data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->y_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->y_data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const auto x_strides = reinterpret_cast(desc->strides_and_shape_d); + const auto y_strides = reinterpret_cast(desc->strides_and_shape_d + desc->ndim * sizeof(int64_t)); + const auto y_shape = reinterpret_cast(desc->strides_and_shape_d + 2 * desc->ndim * sizeof(int64_t)); + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->y_data_size; i += step) { + expand<<>>( + y_, x_, y_strides, x_strides, y_shape, i + desc->y_data_size, desc->ndim, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaExpand(ExpandCudaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return expand_nv_gpu(desc, y, x, stream); + } + if (desc->dtype == F32) { + return expand_nv_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/cuda/expand.cuh b/src/ops/expand/cuda/expand.cuh new file mode 100644 index 00000000..17cc1337 --- /dev/null +++ b/src/ops/expand/cuda/expand.cuh @@ -0,0 +1,33 @@ +#ifndef __CUDA_EXPAND_H__ +#define __CUDA_EXPAND_H__ + +#include 
"../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ExpandCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t y_data_size; + uint64_t max_grid_size; + char const *strides_and_shape_d; +}; + +typedef struct ExpandCudaDescriptor *ExpandCudaDescriptor_t; + +infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t, + ExpandCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaExpand(ExpandCudaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyExpandDescriptor(ExpandCudaDescriptor_t desc); + +#endif diff --git a/src/ops/expand/musa/expand_musa.cc b/src/ops/expand/musa/expand_musa.cc new file mode 100644 index 00000000..0e2e4581 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.cc @@ -0,0 +1,51 @@ +#include "expand_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateExpandDescriptor(MusaHandle_t handle, + ExpandMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 0 : x->strides[i + x->ndim - ndim]; + } + + int64_t *x_strides_d, *y_strides_d; + char *strides_and_shape_d; + checkMusaErrorWithCode(musaMalloc(&strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new ExpandMusaDescriptor{ + DevMthreadsGpu, + y->dt, + handle->device_id, + ndim, + y_data_size, + static_cast(handle->prop.maxGridSize[0]), + strides_and_shape_d, + }; + + delete[] x_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyExpandDescriptor(ExpandMusaDescriptor_t desc) { + checkMusaErrorWithCode(musaFree((void *) desc->strides_and_shape_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/expand/musa/expand_musa.h b/src/ops/expand/musa/expand_musa.h new file mode 100644 index 00000000..8e4651e1 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.h @@ -0,0 +1,33 @@ +#ifndef __MUSA_EXPAND_H__ +#define __MUSA_EXPAND_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ExpandMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t y_data_size; + uint64_t max_grid_size; + char const *strides_and_shape_d; +}; + +typedef struct ExpandMusaDescriptor *ExpandMusaDescriptor_t; + +infiniopStatus_t 
musaCreateExpandDescriptor(MusaHandle_t, + ExpandMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaExpand(ExpandMusaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyExpandDescriptor(ExpandMusaDescriptor_t desc); + +#endif diff --git a/src/ops/expand/musa/expand_musa.mu b/src/ops/expand/musa/expand_musa.mu new file mode 100644 index 00000000..4b549541 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.mu @@ -0,0 +1,58 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "expand_musa.h" + +template +__global__ void expand( + Tdata *y, + const Tdata *x, + const int64_t *y_strides, + const int64_t *x_strides, + const uint64_t *y_shape, + uint64_t y_data_size, + uint64_t ndim, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < y_data_size) { + uint64_t y_idx = getOffset(idx, ndim, y_shape, y_strides); + y[y_idx] = x[getDstOffset(y_idx, ndim, y_strides, x_strides)]; + } +} + +template +infiniopStatus_t expand_mt_gpu(ExpandMusaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->y_data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->y_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->y_data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const auto x_strides = reinterpret_cast(desc->strides_and_shape_d); + const auto y_strides = reinterpret_cast(desc->strides_and_shape_d + desc->ndim * sizeof(int64_t)); + const auto y_shape = reinterpret_cast(desc->strides_and_shape_d + 2 * desc->ndim * sizeof(int64_t)); + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->y_data_size; i += step) { + expand<<>>( + y_, x_, y_strides, x_strides, y_shape, i + desc->y_data_size, desc->ndim, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t musaExpand(ExpandMusaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return expand_mt_gpu(desc, y, x, stream); + } + if (desc->dtype == F32) { + return expand_mt_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/operator.cc b/src/ops/expand/operator.cc new file mode 100644 index 00000000..b0374645 --- /dev/null +++ b/src/ops/expand/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/expand/expand.h" + +#ifdef ENABLE_CPU +#include "cpu/expand_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/expand.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/expand_musa.h" +#endif + + +__C infiniopStatus_t infiniopCreateExpandDescriptor( + infiniopHandle_t handle, + infiniopExpandDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateExpandDescriptor(handle, (ExpandCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateExpandDescriptor((CudaHandle_t) handle, (ExpandCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateExpandDescriptor((MusaHandle_t) 
handle, (ExpandMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopExpand(infiniopExpandDescriptor_t desc, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuExpand((ExpandCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaExpand((ExpandCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaExpand((ExpandMusaDescriptor_t) desc, y, x, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyExpandDescriptor((ExpandCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyExpandDescriptor((ExpandCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyExpandDescriptor((ExpandMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/gemm/operator.cc b/src/ops/gemm/operator.cc new file mode 100644 index 00000000..7036b032 --- /dev/null +++ b/src/ops/gemm/operator.cc @@ -0,0 +1,96 @@ +#include "../utils.h" +#include "ops/expand/expand.h" +#include "ops/gemm/gemm.h" +#include "ops/matmul/matmul.h" +#include "tensor/tensor_descriptor.h" + +struct _GEMMDescriptor { + Device device; + infiniopMatmulDescriptor_t matmul_desc; + infiniopExpandDescriptor_t expand_desc; + uint64_t workspace_size; +}; + +typedef struct _GEMMDescriptor *_GEMMDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t handle, + infiniopGEMMDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t c_desc, + float alpha, + float beta, + char transA, + char transB) { + // transpose a and b if needed + a_desc = transA ? permute(a_desc, {1, 0}) : a_desc; + b_desc = transB ? 
permute(b_desc, {1, 0}) : b_desc; + + // expand desc + infiniopExpandDescriptor_t expand_desc = nullptr; + + // c is optional, set beta to 0 when c is not provided + if (!c_desc || c_desc->ndim == 0 || c_desc->shape == nullptr || c_desc->shape[0] == 0) { + beta = 0; + } else { + expand_desc = new ExpandDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateExpandDescriptor(handle, &expand_desc, y_desc, c_desc), STATUS_SUCCESS); + } + + // matmul desc + infiniopMatmulDescriptor_t matmul_desc = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc, y_desc, alpha, a_desc, b_desc, beta), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc, &workspace_size), STATUS_SUCCESS); + + *(_GEMMDescriptor_t *) desc_ptr = new _GEMMDescriptor{ + handle->device, + matmul_desc, + expand_desc, + workspace_size, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size) { + *size = ((_GEMMDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGEMM(infiniopGEMMDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *a, + void const *b, + void const *c, + void *stream) { + auto _desc = (_GEMMDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + if (_desc->expand_desc != nullptr) { + CHECK_STATUS(infiniopExpand(_desc->expand_desc, + y, c, stream), + STATUS_SUCCESS); + } + + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc, + workspace, + workspace_size, + y, a, b, stream), + STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyGEMMDescriptor(infiniopGEMMDescriptor_t desc) { + if (((_GEMMDescriptor_t) desc)->expand_desc) { + CHECK_STATUS(infiniopDestroyExpandDescriptor(((_GEMMDescriptor_t) desc)->expand_desc), STATUS_SUCCESS); + } + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_GEMMDescriptor_t) desc)->matmul_desc), STATUS_SUCCESS); + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc new file mode 100644 index 00000000..7650e1fd --- /dev/null +++ b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc @@ -0,0 +1,84 @@ +#include "global_avg_pool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t, + GlobalAvgPoolCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim < 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + 2, 1ULL, std::multiplies()); + uint64_t x_per_NC_data_size = std::accumulate(x->shape + 2, x->shape + ndim, 1ULL, std::multiplies()); + + *desc_ptr = new GlobalAvgPoolCpuDescriptor{ + DevCpu, + y->dt, + y_data_size, + x_per_NC_data_size, + }; + + return STATUS_SUCCESS; +} + 
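The descriptor created above reduces global average pooling to a simple indexing contract: y holds y_data_size = N * C values, and each one is the mean of one contiguous run of x_per_NC_data_size spatial elements of x. A minimal standalone reference sketch follows, independent of the library types and shown only to make that contract explicit; the double accumulator is an illustrative choice, not taken from this patch.

```C++
#include <cstdint>

// Reference-only sketch of the contract encoded by GlobalAvgPoolCpuDescriptor:
// y has n_times_c (= N * C) elements, each the mean of one contiguous block of
// `spatial` (= x_per_NC_data_size) values in a contiguous NC[D...] input buffer.
void global_avg_pool_reference(float *y, const float *x, uint64_t n_times_c, uint64_t spatial) {
    for (uint64_t i = 0; i < n_times_c; ++i) {
        double sum = 0.0;// accumulate in double here for illustration
        for (uint64_t j = 0; j < spatial; ++j) {
            sum += x[i * spatial + j];
        }
        y[i] = static_cast<float>(sum / spatial);
    }
}
```

The cpuGlobalAvgPool implementation that follows computes exactly this per (n, c) block, accumulating the F16 case in float via f16_to_f32 before converting back with f32_to_f16.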
+infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t global_avg_pool_cpu(GlobalAvgPoolCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + const auto x_size = desc->x_per_NC_data_size; + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->y_data_size; ++i) { + if constexpr (std::is_same::value) { + float sum = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, 0.0f, + [](float res, uint16_t value) { + return res + f16_to_f32(value); + }); + y_[i] = f32_to_f16(sum / x_size); + } else { + y_[i] = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, Tdata(0)) / x_size; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return global_avg_pool_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return global_avg_pool_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h new file mode 100644 index 00000000..f370a709 --- /dev/null +++ b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h @@ -0,0 +1,29 @@ +#ifndef __CPU_GLOBAL_AVG_POOL_H__ +#define __CPU_GLOBAL_AVG_POOL_H__ + +#include "operators.h" +#include + +struct GlobalAvgPoolCpuDescriptor { + Device device; + DT dtype; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; +}; + +typedef struct GlobalAvgPoolCpuDescriptor *GlobalAvgPoolCpuDescriptor_t; + +infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t, + GlobalAvgPoolCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc); + +#endif diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cc b/src/ops/global_avg_pool/cuda/global_avg_pool.cc new file mode 100644 index 00000000..25d7acbe --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cc @@ -0,0 +1,197 @@ +#include "global_avg_pool.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, + GlobalAvgPoolCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim <= 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + // use cuDNN lib call + if (x->ndim <= 4) { + int n = x->shape[0]; + int c = x->shape[1]; + int h = ndim == 3 ? 
1 : x->shape[2]; + int w = ndim == 3 ? x->shape[2] : x->shape[3]; + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptor for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), n, c, h, w)); + + // create and set tensor descriptor for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), n, c, 1, 1)); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPooling2dDescriptor(pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + h,// pooling window height + w,// pooling window width + 0,// vertical padding + 0,// horizontal padding + 1,// vertical Stride + 1 // horizontal stride + )); + float alpha = 1.0f, beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + + } else if (x->ndim <= 5) { + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads(ndim - 2); + std::vector strides(ndim - 2); + +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_shape[i] = static_cast(x->shape[i]); + x_strides[i] = static_cast(x->strides[i]); + y_shape[i] = static_cast(y->shape[i]); + y_strides[i] = static_cast(y->strides[i]); + if (i < ndim - 2) { + k_shape[i] = static_cast(x->shape[i + 2]); + pads[i] = 0; + strides[i] = 1; + } + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPoolingNdDescriptor(pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + ndim - 2, + k_shape.data(), + pads.data(), + strides.data())); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); + + float alpha = 1.0f, beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + + } else { + uint64_t y_data_size = std::accumulate(y->shape, y->shape + 2, 1ULL, std::multiplies()); + uint64_t x_per_NC_data_size = std::accumulate(x->shape + 2, x->shape + ndim, 
1ULL, std::multiplies()); + uint64_t data_size = y_data_size * x_per_NC_data_size; + + unsigned max_block_size = std::min(256, handle->prop.maxThreadsPerBlock); + uint64_t max_grid_size = static_cast(handle->prop.maxGridSize[0]); + uint64_t items_per_thread = data_size / (max_block_size * max_grid_size); + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + data_size, + y_data_size, + x_per_NC_data_size, + max_block_size, + max_grid_size, + items_per_thread, + handle->cudnn_handles_t, + nullptr, + nullptr, + nullptr, + 0, + 0, + }; + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCudaDescriptor_t desc, uint64_t *size) { + *size = desc->ndim <= 5 ? 0 : (desc->dtype != F16 ? 0 : std::min(desc->dtype.size * 2, 8) * desc->y_data_size); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCudaDescriptor_t desc) { + if (desc->ndim <= 5) { + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyPoolingDescriptor(desc->pool_desc)); + } + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cu b/src/ops/global_avg_pool/cuda/global_avg_pool.cu new file mode 100644 index 00000000..ca5965ab --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cu @@ -0,0 +1,415 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "global_avg_pool.cuh" +#include +#include +#include + +namespace infini { + struct float2_t { + float x, y; + + __device__ float2_t() : x(0), y(0) {} + __device__ float2_t(int val) : x(static_cast(val)), y(static_cast(val)) {} + __device__ float2_t(const float2 &val) : x(val.x), y(val.y) {} + __device__ float2_t(const float2_t &other) : x(other.x), y(other.y) {} + __device__ float2_t(float x, float y) : x(x), y(y) {} + + __device__ float2_t &operator=(const float2_t &other) { + if (this != &other) { + this->x = other.x; + this->y = other.y; + } + return *this; + } + + __device__ float2_t operator+(const float2_t &other) const { + return float2_t{x + other.x, y + other.y}; + } + + __device__ float operator+(const float &other) const { + return x + y + other; + } + + __device__ float2_t &operator+=(const float2_t &other) { + x += other.x; + y += other.y; + return *this; + } + + __device__ float operator[](size_t index) const { + return index == 0 ? 
x : y; + } + }; + + struct half2 { + half x, y; + + __device__ half2 &operator=(const half2 &other) { + if (this != &other) { + this->x = other.x; + this->y = other.y; + } + return *this; + } + + __device__ half2 &operator=(const infini::float2_t &other) { + this->x = __float2half(other.x); + this->y = __float2half(other.y); + return *this; + } + + __device__ half2 operator+(const half2 &other) const { + return half2{__hadd(x, other.x), __hadd(y, other.y)}; + } + + __device__ half operator+(const half &other) const { + return __hadd(__hadd(x, y), other); + } + + __device__ half operator[](size_t index) const { + return __hadd(x, y); + } + }; + + struct half4 { + __half x, y, z, w; + + __device__ half4 operator+(const half4 &other) const { + return half4{__hadd(x, other.x), __hadd(y, other.y), __hadd(z, other.z), __hadd(w, other.w)}; + } + }; + + __device__ __forceinline__ infini::float2_t divide(infini::float2_t val, float divisor) { + return {val.x / divisor, val.y / divisor}; + } +}// namespace infini + + +struct half2float_functor { + __device__ __forceinline__ float operator()(half val) const { + return __half2float(val); + } +}; + +struct float2half_functor { + __device__ __forceinline__ half operator()(float val) const { + return __float2half(val); + } +}; + +struct half22float_functor { + __device__ __forceinline__ float operator()(infini::half2 val) const { + return __half2float(val.x) + __half2float(val.y); + } +}; + +struct float22half2_functor { + __device__ __forceinline__ infini::half2 operator()(const infini::float2_t &val) const { + return {__float2half(val.x), __float2half(val.y)}; + } +}; + +uint64_t getBlockDim(uint64_t size) { + if (size < static_cast(MAX_THREADS_PER_BLOCK)) { + return size; + } + for (size_t i = MAX_THREADS_PER_BLOCK; i > 1; --i) { + if (size % i == 0) { + return i; + } + } + return 1; +} + +/** ---------------------------------------- */ +/** --------------- Sum ----------------- */ +/** ---------------------------------------- */ + +template +__global__ void sum( + Ldata *__restrict__ y, + const Tdata *__restrict__ x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t blocks_per_y, + unsigned remainder, + uint64_t offset, + unsigned pack_size) { + uint64_t block_offset = blockIdx.x / blocks_per_y * x_per_NC_data_size + blockIdx.x % blocks_per_y * blockDim.x * pack_size; + uint64_t idx = block_offset + threadIdx.x * pack_size + offset; + + if (idx < data_size) { + Tdata thread_data[1]; + + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage load_temp_storage; + BlockOp(load_temp_storage).Load(x + block_offset, thread_data); + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + Ldata block_sum; + if constexpr (std::is_same::value) { + block_sum = BlockReduce(reduce_temp_storage).Sum(__half2float(thread_data[0]), blockDim.x); + } else { + block_sum = BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), blockDim.x); + } + + // add up the remaining elements + if (blockIdx.x % blocks_per_y == blocks_per_y - 1) { + __shared__ typename BlockOp::TempStorage load_r_temp_storage; + BlockOp(load_r_temp_storage).Load(x + block_offset + blockDim.x, thread_data, remainder, 0); + if constexpr (std::is_same::value) { + block_sum += __half2float(BlockReduce(reduce_temp_storage).Sum(__half2float(thread_data[0]), blockDim.x)); + } else { + block_sum += BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), remainder); + } + } + + __syncthreads(); + + if (threadIdx.x 
== 0) { + atomicAdd(&y[idx / x_per_NC_data_size], block_sum); + } + } +} + +template +void _sum_nv_gpu(Ydata *y, Xdata const *x, uint64_t data_size, uint64_t x_per_NC_data_size, + unsigned int pack_size, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(256); + dim3 gridDims = dim3(std::min(data_size / blockDims.x, max_grid_size)); + uint64_t blocks_per_y = x_per_NC_data_size / blockDims.x; + unsigned int remainder = x_per_NC_data_size % blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + + sum<<>>(y, x, data_size, x_per_NC_data_size, blocks_per_y, remainder, 0, pack_size); +} + +template +void sum_nv_gpu(void *y, void const *x, uint64_t data_size, uint64_t x_per_NC_data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + _sum_nv_gpu(y_, x_, data_size, x_per_NC_data_size, pack_size, max_grid_size, stream); +} + +/** ---------------------------------------- */ +/** -------------- Reset ---------------- */ +/** ---------------------------------------- */ +template +__global__ void reset( + Tdata *__restrict__ dst, + uint64_t data_size, + uint64_t offset, + unsigned int pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + dst[idx] = Tdata(0); + } +} + +template +void _reset_nv_gpu(Tdata *x, uint64_t data_size, unsigned int pack_size, uint64_t offset, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + reset<<>>(x, offset + data_size, offset + i, pack_size); + } +} + +template +void reset_nv_gpu(void *x, uint64_t data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto packed_data_size = data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + _reset_nv_gpu(x_vec, packed_data_size, pack_size, 0, max_grid_size, stream); + + const auto remainder = data_size % pack_size; + const auto x_ = reinterpret_cast(x); + _reset_nv_gpu(x_, remainder, 1, data_size * pack_size, max_grid_size, stream); +} + + +/** ---------------------------------------- */ +/** ------------- Average --------------- */ +/** ---------------------------------------- */ +template +__global__ void average( + Ydata *y, + Xdata const *x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t offset, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if constexpr (std::is_same::value && std::is_same::value) { + y[idx] = __float2half(__half2float(x[idx]) / x_per_NC_data_size); + } else if constexpr (std::is_same::value) { + y[idx] = __float2half(x[idx] / x_per_NC_data_size); + } else if constexpr (std::is_same::value) { + y[idx] = __half2float(x[idx]) / x_per_NC_data_size; + } else { + y[idx] = x[idx] / x_per_NC_data_size; + } + } +} + +template +void _average_nv_gpu(Ydata *y, Xdata const *x, uint64_t data_size, uint64_t x_per_NC_data_size, + unsigned int pack_size, uint64_t offset, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = 
dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + average<<>>(y, x, offset + data_size, x_per_NC_data_size, offset + i, pack_size); + } +} + +template +void average_nv_gpu(void *y, void const *x, uint64_t data_size, uint64_t x_per_NC_data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto packed_data_size = data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + _average_nv_gpu(y_vec, x_vec, packed_data_size, x_per_NC_data_size, pack_size, 0, max_grid_size, stream); + + const auto remainder = data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + _average_nv_gpu(y_, x_, remainder, x_per_NC_data_size, 1, data_size * pack_size, max_grid_size, stream); +} + + +/** ---------------------------------------- */ +/** --------- Global Avg Pool ----------- */ +/** ---------------------------------------- */ + +template +__global__ void global_avg_pool_padding( + Tdata *__restrict__ y, + Tdata const *__restrict__ x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t offset, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + Tdata thread_data[1]; + + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage load_temp_storage; + BlockOp(load_temp_storage).Load(x + blockIdx.x * blockDim.x, thread_data); + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + Ldata block_sum = BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), blockDim.x); + + if (threadIdx.x == 0) { + y[blockIdx.x] = Tdata(block_sum / x_per_NC_data_size); + } + } +} + +template +void launch_global_avg_pool_padding(GlobalAvgPoolCudaDescriptor_t desc, Tdata *y, Tdata const *x, void *stream, unsigned pack_size) { + dim3 blockDims = dim3(std::min(static_cast(desc->max_block_size), desc->x_per_NC_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + global_avg_pool_padding<<>>( + y, x, desc->data_size, desc->x_per_NC_data_size, i, pack_size); + } +} + + +template +void global_avg_pool_folding_direct(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *stream, unsigned pack_size) { + reset_nv_gpu(y, desc->y_data_size, pack_size, desc->max_grid_size, stream); + sum_nv_gpu(y, x, desc->data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); + average_nv_gpu(y, y, desc->y_data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); +} + +template +void global_avg_pool_folding_workspace(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *workspace, void *stream, unsigned pack_size) { + reset_nv_gpu(workspace, desc->y_data_size, pack_size, desc->max_grid_size, stream); + sum_nv_gpu(workspace, x, desc->data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); + average_nv_gpu(y, workspace, desc->y_data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); +} + +// launch folding functions based on workspace size +template +void 
launch_global_avg_pool_folding(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *workspace, uint64_t workspace_size, void *stream, unsigned pack_size) { + if (workspace_size == 0) { + global_avg_pool_folding_direct(desc, y, x, stream, pack_size); + } else { + global_avg_pool_folding_workspace(desc, y, x, workspace, stream, pack_size); + } +} + +// global average pool for high dimensional data (ndim > 4) +template +void global_avg_pool_nv_gpu_hd(GlobalAvgPoolCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + if (desc->data_size == 0) { + return; + } + if (desc->x_per_NC_data_size <= desc->max_block_size) { + const auto y_ = reinterpret_cast(y); + const auto x_ = reinterpret_cast(x); + launch_global_avg_pool_padding(desc, y_, x_, stream, pack_size); + } else { + launch_global_avg_pool_folding(desc, y, x, workspace, workspace_size, stream, pack_size); + } +} + +template +infiniopStatus_t global_avg_pool_nv_gpu(GlobalAvgPoolCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + // use cuDNN lib + if (desc->ndim <= 5) { + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnPoolingForward(handle, desc->pool_desc, + &desc->alpha, desc->x_desc, x, &desc->beta, + desc->y_desc, y); })); + } else { + global_avg_pool_nv_gpu_hd(desc, workspace, workspace_size, y, x, stream, pack_size); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGlobalAvgPool(GlobalAvgPoolCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return global_avg_pool_nv_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + if (desc->dtype == F32) { + return global_avg_pool_nv_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh new file mode 100644 index 00000000..cd97be5b --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh @@ -0,0 +1,46 @@ +#ifndef __CUDA_GLOBAL_AVG_POOL_H__ +#define __CUDA_GLOBAL_AVG_POOL_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include +#include + +struct GlobalAvgPoolCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; + unsigned max_block_size; + uint64_t max_grid_size; + uint64_t items_per_thread; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnPoolingDescriptor_t const pool_desc; + const float alpha; + const float beta; +}; + +typedef struct GlobalAvgPoolCudaDescriptor *GlobalAvgPoolCudaDescriptor_t; + +infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t, + GlobalAvgPoolCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaGlobalAvgPool(GlobalAvgPoolCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t 
cudaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCudaDescriptor_t desc); + +#endif diff --git a/src/ops/global_avg_pool/operator.cc b/src/ops/global_avg_pool/operator.cc new file mode 100644 index 00000000..92484283 --- /dev/null +++ b/src/ops/global_avg_pool/operator.cc @@ -0,0 +1,95 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/global_avg_pool/global_avg_pool.h" + +#ifdef ENABLE_CPU +#include "cpu/global_avg_pool_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/global_avg_pool.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO: Cambricon +#endif + +__C infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopGlobalAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGlobalAvgPoolDescriptor(handle, (GlobalAvgPoolCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateGlobalAvgPoolDescriptor((CudaHandle_t) handle, (GlobalAvgPoolCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO: Cambricon support +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGlobalAvgPool((GlobalAvgPoolCpuDescriptor_t) desc, workspace, workspace_size, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGlobalAvgPool((GlobalAvgPoolCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/matmul/ascend/matmul_aclnn.cc b/src/ops/matmul/ascend/matmul_aclnn.cc new file mode 100644 index 00000000..1502469e --- /dev/null +++ b/src/ops/matmul/ascend/matmul_aclnn.cc @@ -0,0 +1,137 @@ +#include "matmul_aclnn.h" + +MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + info = nullptr; + cDesc = new aclnnTensorDescriptor(); + aDesc = new aclnnTensorDescriptor(); + bDesc = new aclnnTensorDescriptor(); + alpha = 1.0; + beta = 0; + mt = 1; + workspaceSize = 0; +} + +infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, + MatmulAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + 
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta, + int8_t mt) { + DT dtype = c_desc->dt; + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + *desc_ptr = new MatmulAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + (*desc_ptr)->dtype = dtype; + (*desc_ptr)->mt = mt; + (*desc_ptr)->alpha = alpha; + (*desc_ptr)->beta = beta; + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = new MatmulInfo(c_desc, a_desc, b_desc, status, false); + if (*status != STATUS_SUCCESS) { + return *status; + } + (*desc_ptr)->info = info; + + auto &cDesc = (*desc_ptr)->cDesc; + auto &aDesc = (*desc_ptr)->aDesc; + auto &bDesc = (*desc_ptr)->bDesc; + + // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched operation + CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); + + CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS); + + + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + + aclTensor *tc = cDesc->t; + aclTensor *ta = aDesc->t; + aclTensor *tb = bDesc->t; + + aclnnStatus ret; + + + int64_t transA = 0; + int64_t transB = 0; + // aclnnGemm support C = alpha * A @ B + beta * C + // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md + ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc, + (*desc_ptr)->mt, &workspaceSize, &executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnGemmGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + aclSetAclOpExecutorRepeatable(executor); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, + uint64_t *size) { + *size = desc->workspaceSize; + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + auto &cDesc = desc->cDesc; + auto &aDesc = desc->aDesc; + auto &bDesc = desc->bDesc; + + aclTensor *tc = cDesc->t; + aclTensor *ta = aDesc->t; + aclTensor *tb = bDesc->t; + + auto batch = desc->info->batch; + + auto &executor = desc->executor; + auto &workspaceSize = desc->workspaceSize; + + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + for (int i = 0; i < batch; i++) { + AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + aclnnStatus ret = aclnnGemm(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + return STATUS_SUCCESS; +} + + +infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) { + delete desc->cDesc; + delete desc->bDesc; + delete desc->aDesc; + delete desc->info; + aclDestroyAclOpExecutor(desc->executor); + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/ascend/matmul_aclnn.h b/src/ops/matmul/ascend/matmul_aclnn.h new file mode 100644 index 00000000..41ce92b0 --- /dev/null +++ b/src/ops/matmul/ascend/matmul_aclnn.h @@ -0,0 +1,55 @@ +#ifndef __ACLNN_MATMUL_H__ +#define __ACLNN_MATMUL_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "../blas.h" +#include "operators.h" +#include +#include +#include +#include + +struct MatmulAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor* executor; + MatmulInfo* info; + DT dtype; + aclnnTensorDescriptor_t cDesc, aDesc, bDesc; + // cubeMathType + // see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md + float alpha; + float beta; + int8_t mt; + uint64_t workspaceSize; + + MatmulAclnnDescriptor(Device _device); +}; + +typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, + MatmulAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta, + int8_t cubeMathType); + +infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/matmul/bang/matmul_cnnl.cc b/src/ops/matmul/bang/matmul_cnnl.cc index 05a2760a..6b7948c1 100644 --- a/src/ops/matmul/bang/matmul_cnnl.cc +++ 
b/src/ops/matmul/bang/matmul_cnnl.cc @@ -1,19 +1,20 @@ #include "matmul_cnnl.h" +#include "../../../devices/bang/bang_handle.h" #include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" #include "../../utils.h" #include "cnrt.h" - -MatmulBangDescriptor::MatmulBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - auto info = MatmulInfo(c, a, b, false); - - int32_t use_stride = true; - +infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, + MatmulBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false); + if (*status != STATUS_SUCCESS) { + return *status; + } cnnlTensorDescriptor_t aDesc, bDesc, cDesc; cnnlCreateTensorDescriptor(&aDesc); cnnlCreateTensorDescriptor(&bDesc); @@ -28,36 +29,75 @@ void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulDescCreate(&opDesc); cnnlMatMulAlgoCreate(&algo); - cnnlCreateMatMulHeuristicResult(&algoResult); - + cnnlCreateMatMulHeuristicResult(&algoResult); + int32_t use_stride = true; cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride, sizeof(int32_t)); + *desc_ptr = new MatmulBangDescriptor{ + handle->device, + handle->device_id, + info, + alpha, + beta, + c_desc->dt, + handle->cnnl_handles, + aDesc, + bDesc, + cDesc, + opDesc, + algo, + algoResult}; + return STATUS_SUCCESS; +} +infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} +infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) { + desc->cnnl_handles = nullptr; + cnnlDestroyTensorDescriptor(desc->aDesc); + cnnlDestroyTensorDescriptor(desc->bDesc); + cnnlDestroyTensorDescriptor(desc->cDesc); + cnnlMatMulDescDestroy(desc->opDesc); + cnnlMatMulAlgoDestroy(desc->algo); + cnnlDestroyMatMulHeuristicResult(desc->algoResult); + delete desc; + return STATUS_SUCCESS; +} - void *workspace; +void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + if (info.is_transed) { + std::swap(a, b); + } - use_cnnl((cnrtQueue_t) stream, + use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream, [&](cnnlHandle_t handle) { int count = 0; - cnnlGetBatchMatMulAlgoHeuristic(handle, opDesc, aDesc, - bDesc, cDesc, - NULL, 1, &algoResult, &count); + cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc, + desc->bDesc, desc->cDesc, + NULL, 1, &desc->algoResult, &count); size_t wsSize; - cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &wsSize); + cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize); cnrtMalloc(&workspace, wsSize); - cnnlBatchMatMulBCast_v2(handle, opDesc, algo, - &alpha, aDesc, info.a_ptr, - bDesc, info.b_ptr, - &beta, cDesc, info.c_ptr, + cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo, + &alpha, desc->aDesc, a, + desc->bDesc, b, + &beta, desc->cDesc, c, workspace, wsSize); }); - - cnrtFree(workspace); - - cnnlDestroyTensorDescriptor(aDesc); - cnnlDestroyTensorDescriptor(bDesc); - cnnlDestroyTensorDescriptor(cDesc); - 
cnnlMatMulDescDestroy(opDesc); - cnnlMatMulAlgoDestroy(algo); - cnnlDestroyMatMulHeuristicResult(algoResult); +} +infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + float alpha = desc->alpha; + float beta = desc->beta; + if (dtype_eq(desc->dtype, F16)) { + matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream); + cnrtQueueSync((cnrtQueue_t)stream); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/bang/matmul_cnnl.h b/src/ops/matmul/bang/matmul_cnnl.h index 66ef8f71..70830450 100644 --- a/src/ops/matmul/bang/matmul_cnnl.h +++ b/src/ops/matmul/bang/matmul_cnnl.h @@ -1,6 +1,6 @@ #ifndef __CNNL_MATMUL_H__ #define __CNNL_MATMUL_H__ - +#include "../../../devices/bang/bang_handle.h" #include "../blas.h" #include "cnnl.h" #include "cnnl_extra.h" @@ -8,8 +8,34 @@ struct MatmulBangDescriptor { Device device; - MatmulBangDescriptor(Device device); + int device_id; + MatmulInfo info; + float alpha; + float beta; + DT dtype; + std::shared_ptr> cnnl_handles; + cnnlTensorDescriptor_t aDesc; + cnnlTensorDescriptor_t bDesc; + cnnlTensorDescriptor_t cDesc; + cnnlMatMulDescriptor_t opDesc; + cnnlMatMulAlgo_t algo; + cnnlMatMulHeuristicResult_t algoResult; }; +typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t; + +infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, + MatmulBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream); + +infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc); inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) { int ndim = matrix.ndim; @@ -33,6 +59,5 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat } } -void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); #endif// __CNNL_MATMUL_H__ diff --git a/src/ops/matmul/blas.h b/src/ops/matmul/blas.h index 36fca6fd..7882dba2 100644 --- a/src/ops/matmul/blas.h +++ b/src/ops/matmul/blas.h @@ -17,31 +17,34 @@ typedef struct BlasMatrix { BlasMatrix() {} - BlasMatrix(TensorLayout *layout) { + BlasMatrix(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) { if (layout->ndim == 2) { this->ndim = 2; this->batch = 1; this->stride = 0; this->rows = layout->shape[0]; this->cols = layout->shape[1]; - this->row_stride = layout->strides[0] / layout->dt.size; - this->col_stride = layout->strides[1] / layout->dt.size; + this->row_stride = layout->strides[0]; + this->col_stride = layout->strides[1]; } else if (layout->ndim == 3) { this->ndim = 3; this->batch = layout->shape[0]; - this->stride = this->batch == 1 ? 0 : layout->strides[0] / layout->dt.size; + this->stride = this->batch == 1 ? 
0 : layout->strides[0]; this->rows = layout->shape[1]; this->cols = layout->shape[2]; - this->row_stride = layout->strides[1] / layout->dt.size; - this->col_stride = layout->strides[2] / layout->dt.size; + this->row_stride = layout->strides[1]; + this->col_stride = layout->strides[2]; } else { - PANIC(InvalidMatrixShape); + *status = STATUS_BAD_TENSOR_SHAPE; + return; } if (this->row_stride != 1 && this->col_stride != 1) { - ASSERT(false); - PANIC(MatrixIsNotContiguous); + *status = STATUS_BAD_TENSOR_STRIDES; + return; } + + *status = STATUS_SUCCESS; } bool match_batch(int batch) const { @@ -67,20 +70,23 @@ struct MatmulInfo { BlasMatrix b_matrix; BlasMatrix c_matrix; - void const *a_ptr; - void const *b_ptr; - void *c_ptr; - int m, n, k, batch; - MatmulInfo(Tensor c, Tensor a, Tensor b, bool col_major = true) { - a_matrix = BlasMatrix(a.layout); - b_matrix = BlasMatrix(b.layout); - c_matrix = BlasMatrix(c.layout); + bool is_transed = false; - a_ptr = a.data; - b_ptr = b.data; - c_ptr = c.data; + MatmulInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, infiniopStatus_t *status, bool col_major = true) { + a_matrix = BlasMatrix(a_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } + b_matrix = BlasMatrix(b_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } + c_matrix = BlasMatrix(c_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } ASSERT_EQ(c_matrix.rows, a_matrix.rows);// m ASSERT_EQ(c_matrix.cols, b_matrix.cols);// n @@ -88,7 +94,8 @@ struct MatmulInfo { batch = c_matrix.batch; if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) { - PANIC(InvalidBatchSize); + *status = STATUS_BAD_PARAM; + return; } if ((col_major && c_matrix.col_stride == 1) || (!col_major && c_matrix.row_stride == 1)) { @@ -96,7 +103,7 @@ struct MatmulInfo { b_matrix.transpose(); a_matrix.transpose(); std::swap(a_matrix, b_matrix); - std::swap(a_ptr, b_ptr); + is_transed = true; } m = c_matrix.rows; diff --git a/src/ops/matmul/cpu/matmul_cpu.cc b/src/ops/matmul/cpu/matmul_cpu.cc index 000e0df0..2dcc9d2e 100644 --- a/src/ops/matmul/cpu/matmul_cpu.cc +++ b/src/ops/matmul/cpu/matmul_cpu.cc @@ -1,24 +1,94 @@ #include "matmul_cpu.h" #include "../../../devices/cpu/common_cpu.h" #include "../../utils.h" -#include "../blas.h" #include -void matmul_cpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha) { - auto info = MatmulInfo(c, a, b); +infiniopStatus_t cpuCreateMatmulDescriptor(CpuHandle_t handle, + MatmulCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulCpuDescriptor{ + DevCpu, + dtype, + info, + alpha, + beta}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) { + auto info = desc->info; + + if (info.is_transed) { + 
std::swap(a, b); + } for (int i = 0; i < info.batch; ++i) { for (int m_ = 0; m_ < info.m; ++m_) { for (int n_ = 0; n_ < info.n; ++n_) { - auto c_ = reinterpret_cast(info.c_ptr) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; + auto c_ = reinterpret_cast(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; float sum = 0; for (int k_ = 0; k_ < info.k; ++k_) { - auto a_ = reinterpret_cast(info.a_ptr) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; - auto b_ = reinterpret_cast(info.b_ptr) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; - sum += f16_to_f32(*a_) * f16_to_f32(*b_); + auto a_ = reinterpret_cast(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; + auto b_ = reinterpret_cast(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; + if constexpr (std::is_same::value) { + sum += f16_to_f32(*a_) * f16_to_f32(*b_); + } else { + sum += *a_ * (*b_); + } + } + if constexpr (std::is_same::value) { + if (beta == 0) { + *c_ = f32_to_f16(alpha * sum); + } else { + *c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum); + } + } else { + *c_ = beta * (*c_) + alpha * sum; } - *c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum); } } } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b) { + if (desc->dtype == F16) { + return matmul_cpu(desc, c, desc->beta, a, b, desc->alpha); + } + if (desc->dtype == F32) { + return matmul_cpu(desc, c, desc->beta, a, b, desc->alpha); + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/cpu/matmul_cpu.h b/src/ops/matmul/cpu/matmul_cpu.h index c1ddbc8f..3a5970e8 100644 --- a/src/ops/matmul/cpu/matmul_cpu.h +++ b/src/ops/matmul/cpu/matmul_cpu.h @@ -1,11 +1,37 @@ #ifndef __CPU_MATMUL_H__ #define __CPU_MATMUL_H__ +#include "../../../devices/cpu/cpu_handle.h" +#include "../blas.h" #include "operators.h" + typedef struct MatmulCpuDescriptor { Device device; + DT dtype; + MatmulInfo info; + float alpha; + float beta; } MatmulCpuDescriptor; -void matmul_cpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha); +typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t; + +infiniopStatus_t cpuCreateMatmulDescriptor(CpuHandle_t handle, + MatmulCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b); + +infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc); #endif// __CPU_MATMUL_H__ diff --git a/src/ops/matmul/cuda/matmul_cuda.cc b/src/ops/matmul/cuda/matmul_cuda.cc new file mode 100644 index 00000000..8bac48d4 --- /dev/null +++ b/src/ops/matmul/cuda/matmul_cuda.cc @@ -0,0 +1,44 @@ +#include "matmul_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateMatmulDescriptor(CudaHandle_t handle, + MatmulCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float 
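+                                            // (added note) beta scales the existing contents of c,
+                                            // i.e. c = alpha * (a @ b) + beta * c, matching the CPU
+                                            // reference implementation above.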
beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + info, + alpha, + beta, + handle->cublas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetMatmulWorkspaceSize(MatmulCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyMatmulDescriptor(MatmulCudaDescriptor_t desc) { + desc->cublas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/cuda/matmul_cuda.cu b/src/ops/matmul/cuda/matmul_cuda.cu index c7e25f81..fcbc755d 100644 --- a/src/ops/matmul/cuda/matmul_cuda.cu +++ b/src/ops/matmul/cuda/matmul_cuda.cu @@ -1,25 +1,36 @@ -#include "../../../devices/cuda/handle_pool.h" +#include "../../../devices/cuda/cuda_handle.h" #include "../../utils.h" #include "../blas.h" #include "matmul_cuda.h" #include #include -MatmulCudaDescriptor::MatmulCudaDescriptor(Device device) { - this->device = device; - get_cublas_pool(); -} +template +infiniopStatus_t matmul_cuda(MatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; -void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - auto info = MatmulInfo(c, a, b); + if (info.is_transed) { + std::swap(a, b); + } - auto alpha_f16 = __float2half(alpha); - auto beta_f16 = __float2half(beta); + cudaDataType a_type, b_type, c_type; + cublasComputeType_t compute_type; + if constexpr (std::is_same::value) { + a_type = b_type = c_type = CUDA_R_16F; + compute_type = CUBLAS_COMPUTE_32F; + } else { + a_type = b_type = c_type = CUDA_R_32F; +#ifdef ENABLE_SUGON_DCU + compute_type = CUBLAS_COMPUTE_32F; +#else + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; +#endif + } auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = info.b_matrix.row_stride == 1 ? 
CUBLAS_OP_N : CUBLAS_OP_T; - use_cublas((cudaStream_t) stream, + use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t) stream, [&](cublasHandle_t handle) { cublasGemmStridedBatchedEx( handle, op_a, @@ -27,21 +38,38 @@ void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, vo info.m, info.n, info.k, - &alpha_f16, - info.a_ptr, - CUDA_R_16F, + &alpha, + a, + a_type, info.a_matrix.ld(), info.a_matrix.stride, - info.b_ptr, - CUDA_R_16F, + b, + b_type, info.b_matrix.ld(), info.b_matrix.stride, - &beta_f16, - info.c_ptr, - CUDA_R_16F, + &beta, + c, + c_type, info.c_matrix.ld(), info.c_matrix.stride, info.batch, - CUBLAS_COMPUTE_16F, + compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaMatmul(MatmulCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_cuda(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_cuda(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/cuda/matmul_cuda.h b/src/ops/matmul/cuda/matmul_cuda.h index 77760e27..3e82c1ed 100644 --- a/src/ops/matmul/cuda/matmul_cuda.h +++ b/src/ops/matmul/cuda/matmul_cuda.h @@ -1,13 +1,41 @@ -#ifndef __NV_GPU_MATMUL_H__ -#define __NV_GPU_MATMUL_H__ +#ifndef __CUDA_MATMUL_H__ +#define __CUDA_MATMUL_H__ +#include "../../../devices/cuda/cuda_handle.h" +#include "../blas.h" #include "operators.h" +#include typedef struct MatmulCudaDescriptor { Device device; - MatmulCudaDescriptor(Device device); + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> cublas_handles_t; } MatmulCudaDescriptor; -void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); +typedef struct MatmulCudaDescriptor *MatmulCudaDescriptor_t; -#endif// __NV_GPU_MATMUL_H__ +infiniopStatus_t cudaCreateMatmulDescriptor(CudaHandle_t handle, + MatmulCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t cudaGetMatmulWorkspaceSize(MatmulCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaMatmul(MatmulCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cudaDestroyMatmulDescriptor(MatmulCudaDescriptor_t desc); + +#endif// __CUDA_MATMUL_H__ diff --git a/src/ops/matmul/maca/matmul_maca.cc b/src/ops/matmul/maca/matmul_maca.cc new file mode 100644 index 00000000..2d6658f7 --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.cc @@ -0,0 +1,44 @@ +#include "matmul_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateMatmulDescriptor(MacaHandle_t handle, + MatmulMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulMacaDescriptor{ + DevMetaxGpu, + dtype, + handle->device_id, + 
info, + alpha, + beta, + handle->mcblas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetMatmulWorkspaceSize(MatmulMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyMatmulDescriptor(MatmulMacaDescriptor_t desc) { + desc->mcblas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/maca/matmul_maca.h b/src/ops/matmul/maca/matmul_maca.h new file mode 100644 index 00000000..2264cdc4 --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.h @@ -0,0 +1,41 @@ +#ifndef __MACA_MATMUL_H__ +#define __MACA_MATMUL_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "../blas.h" +#include "operators.h" +#include + +typedef struct MatmulMacaDescriptor { + Device device; + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> mcblas_handles_t; +} MatmulMacaDescriptor; + +typedef struct MatmulMacaDescriptor *MatmulMacaDescriptor_t; + +infiniopStatus_t macaCreateMatmulDescriptor(MacaHandle_t handle, + MatmulMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t macaGetMatmulWorkspaceSize(MatmulMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaMatmul(MatmulMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t macaDestroyMatmulDescriptor(MatmulMacaDescriptor_t desc); + +#endif// __MACA_MATMUL_H__ diff --git a/src/ops/matmul/maca/matmul_maca.maca b/src/ops/matmul/maca/matmul_maca.maca new file mode 100644 index 00000000..d944c85a --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.maca @@ -0,0 +1,77 @@ +#include "../../../devices/maca/maca_handle.h" +#include "../../utils.h" +#include "../blas.h" +#include "matmul_maca.h" +#include +#include + +template +infiniopStatus_t matmul_maca(MatmulMacaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + + if (info.is_transed) { + std::swap(a, b); + } + + Tdata alpha_, beta_; + hpccDataType a_type, b_type, c_type; + hcblasComputeType_t compute_type; + + if constexpr (std::is_same::value) { + alpha_ = __float2half(alpha); + beta_ = __float2half(beta); + a_type = b_type = c_type = HPCC_R_16F; + compute_type = HCBLAS_COMPUTE_16F; + } else { + alpha_ = alpha; + beta_ = beta; + a_type = b_type = c_type = HPCC_R_32F; + compute_type = HCBLAS_COMPUTE_32F_FAST_TF32; + } + + auto op_a = info.a_matrix.row_stride == 1 ? HCBLAS_OP_N : HCBLAS_OP_T; + auto op_b = info.b_matrix.row_stride == 1 ? 
HCBLAS_OP_N : HCBLAS_OP_T; + + use_mcblas(desc->mcblas_handles_t, desc->device_id, (hcStream_t) stream, + [&](hcblasHandle_t handle) { hcblasGemmStridedBatchedEx( + handle, + op_a, + op_b, + info.m, + info.n, + info.k, + &alpha_, + a, + a_type, + info.a_matrix.ld(), + info.a_matrix.stride, + b, + b_type, + info.b_matrix.ld(), + info.b_matrix.stride, + &beta_, + c, + c_type, + info.c_matrix.ld(), + info.c_matrix.stride, + info.batch, + compute_type, + HCBLAS_GEMM_DEFAULT_TENSOR_OP); }); + return STATUS_SUCCESS; +} + +infiniopStatus_t macaMatmul(MatmulMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_maca(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_maca(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/matmul/musa/matmul_musa.cc b/src/ops/matmul/musa/matmul_musa.cc new file mode 100644 index 00000000..3256dca6 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.cc @@ -0,0 +1,48 @@ +#include "matmul_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include +#include + +#include + +infiniopStatus_t musaCreateMatmulDescriptor(MusaHandle_t handle, + MatmulMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulMusaDescriptor{ + DevMthreadsGpu, + dtype, + handle->device_id, + info, + alpha, + beta, + handle->mublas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetMatmulWorkspaceSize(MatmulMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyMatmulDescriptor(MatmulMusaDescriptor_t desc) { + desc->mublas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/musa/matmul_musa.h b/src/ops/matmul/musa/matmul_musa.h new file mode 100644 index 00000000..b086a494 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.h @@ -0,0 +1,45 @@ +#ifndef __MUSA_MATMUL_H__ +#define __MUSA_MATMUL_H__ + +#include +#include +#include +#include +#include +#include "../blas.h" +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +typedef struct MatmulMusaDescriptor { + Device device; + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> mublas_handles_t; +} MatmulMusaDescriptor; + +typedef struct MatmulMusaDescriptor *MatmulMusaDescriptor_t; + +infiniopStatus_t musaCreateMatmulDescriptor(MusaHandle_t handle, + MatmulMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t musaGetMatmulWorkspaceSize(MatmulMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaMatmul(MatmulMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t musaDestroyMatmulDescriptor(MatmulMusaDescriptor_t desc); + +#endif // __MUSA_MATMUL_H__ diff 
--git a/src/ops/matmul/musa/matmul_musa.mu b/src/ops/matmul/musa/matmul_musa.mu new file mode 100644 index 00000000..b445a7b3 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.mu @@ -0,0 +1,77 @@ +#include "../../../devices/musa/musa_handle.h" +#include "../../utils.h" +#include "../blas.h" +#include "matmul_musa.h" +#include +#include + +template +infiniopStatus_t matmul_musa(MatmulMusaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + + if (info.is_transed) { + std::swap(a, b); + } + + Tdata alpha_, beta_; + musaDataType_t a_type, b_type, c_type; + mublasComputeType_t compute_type; + + if constexpr (std::is_same::value) { + alpha_ = __float2half(alpha); + beta_ = __float2half(beta); + a_type = b_type = c_type = MUSA_R_16F; + compute_type = MUBLAS_COMPUTE_16F; + } else { + alpha_ = alpha; + beta_ = beta; + a_type = b_type = c_type = MUSA_R_32F; + compute_type = MUBLAS_COMPUTE_32F_FAST_TF32; + } + + auto op_a = info.a_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T; + auto op_b = info.b_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T; + + use_mublas(desc->mublas_handles_t, desc->device_id, (MUstream) stream, + [&](mublasHandle_t handle) { mublasGemmStridedBatchedEx( + handle, + op_a, + op_b, + info.m, + info.n, + info.k, + &alpha_, + a, + a_type, + info.a_matrix.ld(), + info.a_matrix.stride, + b, + b_type, + info.b_matrix.ld(), + info.b_matrix.stride, + &beta_, + c, + c_type, + info.c_matrix.ld(), + info.c_matrix.stride, + info.batch, + compute_type, + MUBLAS_GEMM_DEFAULT);}); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaMatmul(MatmulMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_musa(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_musa(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/matmul/operator.cc b/src/ops/matmul/operator.cc index d323d009..5fa766eb 100644 --- a/src/ops/matmul/operator.cc +++ b/src/ops/matmul/operator.cc @@ -11,74 +11,172 @@ #ifdef ENABLE_CAMBRICON_MLU #include "bang/matmul_cnnl.h" #endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/matmul_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/matmul_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/matmul_musa.h" +#endif -struct MatmulDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, + infiniopMatmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateMatmulDescriptor((CpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateMatmulDescriptor((CudaHandle_t) handle, (MatmulCudaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateMatmulDescriptor((AscendHandle_t) handle, + (MatmulAclnnDescriptor_t *) desc_ptr, + c_desc, + alpha, + a_desc, + 
b_desc, + beta, + 1); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateMatmulDescriptor((MacaHandle_t) handle, (MatmulMacaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateMatmulDescriptor((MusaHandle_t) handle, (MatmulMusaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C MatmulDescriptor *createMatmulDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (MatmulDescriptor *) (new MatmulCpuDescriptor{device}); + return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: { - return (MatmulDescriptor *) (new MatmulCudaDescriptor(device)); + return cudaGetMatmulWorkspaceSize((MatmulCudaDescriptor_t) desc, size); } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (MatmulDescriptor *) (new MatmulBangDescriptor(device)); + return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetMatmulWorkspaceSize((MatmulMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetMatmulWorkspaceSize((MatmulMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyMatmulDescriptor(MatmulDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (MatmulCpuDescriptor *) (descriptor); - break; + return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - delete (MatmulCudaDescriptor *) (descriptor); - break; + return cudaMatmul((MatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (MatmulBangDescriptor *) (descriptor); - break; + return bangMatmul((MatmulBangDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return aclnnMatmul((MatmulAclnnDescriptor_t) desc, + workspace, + workspace_size, + c, + a, + b, + stream); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaMatmul((MatmulMacaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaMatmul((MatmulMusaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void matmul(MatmulDescriptor *descriptor, Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - matmul_cpu_f16(c, beta, a, b, alpha); - break; + return 
cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - matmul_nv_gpu_f16(c, beta, a, b, alpha, stream); - break; + case DevNvGpu: { + return cudaDestroyMatmulDescriptor((MatmulCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - matmul_cnnl_f16(c, beta, a, b, alpha, stream); - break; + case DevCambriconMlu: { + return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyMatmulDescriptor((MatmulMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyMatmulDescriptor((MatmulMusaDescriptor_t) desc); + } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/max_pool/operator.cc b/src/ops/max_pool/operator.cc new file mode 100644 index 00000000..2644f8bd --- /dev/null +++ b/src/ops/max_pool/operator.cc @@ -0,0 +1,54 @@ +#include "../pooling/pooling.h" +#include "../utils.h" +#include "ops/max_pool/max_pool.h" + +struct _MaxPoolDescriptor { + Device device; + infiniopPoolingDescriptor_t pooling_desc; + uint64_t workspace_size; +}; + +typedef struct _MaxPoolDescriptor *_MaxPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n) { + infiniopPoolingDescriptor_t pooling_desc; + CHECK_STATUS(infiniopCreatePoolingDescriptor(handle, &pooling_desc, y, x, kernel_shape, pads, strides, n, 0), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetPoolingWorkspaceSize(pooling_desc, &workspace_size), STATUS_SUCCESS); + + *(_MaxPoolDescriptor_t *) desc_ptr = new _MaxPoolDescriptor{ + handle->device, + pooling_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, uint64_t *size) { + *size = ((_MaxPoolDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_MaxPoolDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopPooling(_desc->pooling_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyPoolingDescriptor(((_MaxPoolDescriptor_t) desc)->pooling_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mlp/operator.cc b/src/ops/mlp/operator.cc new file mode 100644 index 00000000..48475bb2 --- /dev/null +++ b/src/ops/mlp/operator.cc @@ -0,0 +1,130 @@ +#include "../utils.h" +#include "ops/matmul/matmul.h" +#include "ops/mlp/mlp.h" +#include "ops/swiglu/swiglu.h" +#include "tensor/tensor_descriptor.h" + +struct _MLPDescriptor { + Device device; + infiniopMatmulDescriptor_t matmul_desc1; + infiniopMatmulDescriptor_t matmul_desc2; + infiniopSwiGLUDescriptor_t swiglu_desc; + uint64_t 
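+    // (added note) byte offset of the second half of each row of the matmul1 (x @ w12) output
+    // within the workspace; infiniopSwiGLU below reads its two (strided) inputs from
+    // workspace and workspace + this offset.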
w2_offset_by_bytes; + uint64_t workspace_size; + uint64_t matmul1_workspace_size; + uint64_t matmul2_workspace_size; + uint64_t matmul1_tensor_size; + uint64_t swiglu_tensor_size; +}; + +typedef struct _MLPDescriptor *_MLPDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle, + infiniopMLPDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w12_desc, + infiniopTensorDescriptor_t w3_desc, + float alpha, + char residual) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w12_desc->ndim != 2 || w3_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (x_desc->strides[1] != 1 || y_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + // matmul1 desc + infiniopTensorDescriptor_t desc1 = new TensorDescriptor; + uint64_t shape1[2] = {x_desc->shape[0], w12_desc->shape[1]};// [num_tokens, 2 * intermediate_size] + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc1, 2, shape1, nullptr, x_desc->dt), STATUS_SUCCESS); + infiniopMatmulDescriptor_t matmul_desc1 = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc1, desc1, 1.0, x_desc, w12_desc, 0.0), STATUS_SUCCESS); + uint64_t matmul1_tensor_size = get_byte_size(desc1); + uint64_t matmul1_workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc1, &matmul1_workspace_size), STATUS_SUCCESS); + + // swiglu desc + infiniopTensorDescriptor_t desc2 = new TensorDescriptor; + uint64_t w2_offset_by_bytes = w12_desc->shape[1] / 2 * w12_desc->dt.size; + uint64_t shape2[2] = {x_desc->shape[0], w12_desc->shape[1] / 2};// [num_tokens, itermediate_size] + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc2, 2, shape2, nullptr, x_desc->dt), STATUS_SUCCESS); + infiniopTensorDescriptor_t desc3 = new TensorDescriptor; + int64_t strides3[2] = {desc1->strides[0], desc1->strides[1]}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc3, 2, shape2, strides3, x_desc->dt), STATUS_SUCCESS); + infiniopSwiGLUDescriptor_t swiglu_desc = new SwiGLUDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateSwiGLUDescriptor(handle, &swiglu_desc, desc2, desc3, desc3), STATUS_SUCCESS); + uint64_t swiglu_tensor_size = get_byte_size(desc2); + + // matmul2 desc + infiniopMatmulDescriptor_t matmul_desc2 = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc2, y_desc, alpha, desc2, w3_desc, residual ? 
1.0 : 0.0), STATUS_SUCCESS); + uint64_t matmul2_workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc2, &matmul2_workspace_size), STATUS_SUCCESS); + + // calculate workspace size + uint64_t workspace_size = std::max(std::max(matmul1_workspace_size + matmul1_tensor_size, + matmul1_tensor_size + swiglu_tensor_size), + swiglu_tensor_size + matmul2_workspace_size); + + // create descriptor + *(_MLPDescriptor_t *) desc_ptr = new _MLPDescriptor{ + handle->device, + matmul_desc1, + matmul_desc2, + swiglu_desc, + w2_offset_by_bytes, + workspace_size, + matmul1_workspace_size, + matmul2_workspace_size, + matmul1_tensor_size, + swiglu_tensor_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size) { + // compute order: matmul1, swiglu, matmul2 + *size = ((_MLPDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w12, + void const *w3, + void *stream) { + auto _desc = (_MLPDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc1, + (char *) workspace + _desc->matmul1_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size, + workspace, x, w12, stream), + STATUS_SUCCESS); + CHECK_STATUS(infiniopSwiGLU(_desc->swiglu_desc, + (char *) workspace + _desc->matmul1_tensor_size, + (char *) workspace + _desc->w2_offset_by_bytes, + workspace, stream), + STATUS_SUCCESS); + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc2, (char *) workspace + _desc->matmul1_tensor_size + _desc->swiglu_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size - _desc->swiglu_tensor_size, + y, (char *) workspace + _desc->matmul1_tensor_size, w3, stream), + STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_MLPDescriptor_t) desc)->matmul_desc1), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_MLPDescriptor_t) desc)->matmul_desc2), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroySwiGLUDescriptor(((_MLPDescriptor_t) desc)->swiglu_desc), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} diff --git a/src/ops/pooling/cpu/pooling_cpu.cc b/src/ops/pooling/cpu/pooling_cpu.cc new file mode 100644 index 00000000..3c783c14 --- /dev/null +++ b/src/ops/pooling/cpu/pooling_cpu.cc @@ -0,0 +1,258 @@ +#include "pooling_cpu.h" +#include "../../utils.h" + +// get the total number of elements in arr +inline uint64_t getTotalSize(const uint64_t *arr, uint64_t ndim) { + return std::accumulate(arr, arr + ndim, 1ULL, std::multiplies()); +} + +// check if padding is needed +inline bool requirePadding(uint64_t const *pads, uint64_t ndim) { + return std::any_of(pads, pads + ndim - 2, + [](uint64_t pad) { return pad > 0; }); +} + +infiniopStatus_t cpuCreatePoolingDescriptor(infiniopHandle_t, + PoolingCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != n + 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || x->shape[1] != y->shape[1]) { + return 
STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (pooling_type > 1) { + return STATUS_BAD_PARAM; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const auto y_size = getTotalSize(y->shape, ndim); + const auto padded_x_size = requirePadding(pads, ndim) ? getPaddedSize(ndim, x->shape, pads) : 0; + uint64_t *x_shape = new uint64_t[ndim]; + uint64_t *y_shape = new uint64_t[ndim]; + uint64_t *kernel_ = new uint64_t[n]; + uint64_t *pads_ = new uint64_t[n]; + int64_t *strides_ = new int64_t[n]; + memcpy(x_shape, x->shape, ndim * sizeof(uint64_t)); + memcpy(y_shape, y->shape, ndim * sizeof(uint64_t)); + for (size_t i = 0; i < n; ++i) { + kernel_[i] = kernel_shape[i]; + pads_[i] = pads[i]; + strides_[i] = strides[i]; + } + + *desc_ptr = new PoolingCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_size, + padded_x_size, + x_shape, + kernel_, + y_shape, + pads_, + strides_, + pooling_type, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetPoolingWorkspaceSize(PoolingCpuDescriptor_t desc, uint64_t *size) { + *size = desc->padded_x_size * desc->dt.size; + if (desc->dt == F16) { + *size += desc->y_size * sizeof(float); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyPoolingDescriptor(PoolingCpuDescriptor_t desc) { + delete[] desc->x_shape; + delete[] desc->y_shape; + delete[] desc->k_shape; + delete[] desc->pads; + delete[] desc->strides; + delete desc; + return STATUS_SUCCESS; +} + +// initialize the padded input with the data from the original input +template +void fillPaddedInput(PoolingCpuDescriptor_t desc, uint64_t const *padded_x_shape, + Tdata *padded_x, Tdata const *x, + uint64_t const *pads, uint64_t x_index, + uint64_t padded_x_index, uint64_t ndim) { + const auto x_shape = desc->x_shape[ndim]; + const auto padded_x_shape_ = padded_x_shape[ndim]; + const auto x_base_index = x_index * x_shape; + const auto padded_x_base_index = padded_x_index * padded_x_shape_ + + (x_shape == padded_x_shape_ ? 0 : pads[ndim - 2]); + + for (size_t i = 0; i < x_shape; ++i) { + // base case (last dimension) + if (ndim == desc->ndim - 1) { + padded_x[padded_x_base_index + i] = x[x_base_index + i]; + } + // recursive case + else { + fillPaddedInput(desc, padded_x_shape, padded_x, x, pads, x_base_index + i, + padded_x_base_index + i, ndim + 1); + } + } +} + +// perform the a singleton pooling operation depending on the data type and pooling type +template +inline void pool(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, + uint64_t const *x_shape, uint64_t curr_x_index, uint64_t y_index) { + switch (desc->pooling_mode) { + // 0. Max pooling + case 0: + if constexpr (std::is_same::value) { + y[y_index] = std::fmax(f16_to_f32(x[curr_x_index]), y[y_index]); + } else { + y[y_index] = std::max(x[curr_x_index], y[y_index]); + } + break; + // 1. 
Average pooling + default: + if constexpr (std::is_same::value) { + y[y_index] += f16_to_f32(x[curr_x_index]); + } else { + y[y_index] += x[curr_x_index]; + } + } +} + +// Recursive convolution function +template +void _applyPooling(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, + uint64_t const *x_shape, uint64_t x_index, uint64_t y_index, + uint64_t ndim) { + const auto dim_size = x_shape[ndim]; + const auto kernel_size = desc->k_shape[ndim - 2]; + const auto dilation = 1; + const auto stride = desc->strides[ndim - 2]; + const auto steps = + (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1; + x_index *= dim_size; + y_index *= desc->y_shape[ndim]; + + // perform all the pooling along this axis + for (size_t i = 0; i < steps; ++i, ++y_index) { + // perform a single pooling + for (size_t k = 0; k < kernel_size; ++k) { + // calculate the current indices + const auto curr_x_index = x_index + i * stride + k * dilation; + + // base case (last dimension) + if (ndim == desc->ndim - 1) { + pool(desc, y, x, x_shape, curr_x_index, y_index); + } + // recursive case + else { + _applyPooling(desc, y, x, x_shape, curr_x_index, y_index, ndim + 1); + } + } + } +} + +template +void applyPooling(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, uint64_t const *x_shape) { +#pragma omp parallel for collapse(2) + // batch + for (size_t i = 0; i < x_shape[0]; ++i) { + + // channel + for (size_t j = 0; j < x_shape[1]; ++j) { + uint64_t x_index = i * x_shape[1] + j; + uint64_t y_index = i * desc->y_shape[1] + j; + _applyPooling(desc, y, x, x_shape, x_index, y_index, 2); + } + } + + // if is average pooling, take the average + if (desc->pooling_mode == 1) { + Ydata num_kernel_elements = getTotalSize(desc->k_shape, desc->ndim - 2); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y[i] /= num_kernel_elements; + } + } +} + +template +void _pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + Ydata *y, Xdata const *x) { + if (desc->padded_x_size > 0) { + auto padded_x = reinterpret_cast(workspace); + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_x, padded_x + desc->padded_x_size, 0); + getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); + fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); + applyPooling(desc, y, padded_x, padded_shape); + } else { + applyPooling(desc, y, x, desc->x_shape); + } +} + +// Pooling function +template +infiniopStatus_t pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x) { + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + std::fill(y_, y_ + desc->y_size, 0); + _pooling_cpu(desc, workspace, workspace_size, y_, x_); + return STATUS_SUCCESS; +} + +// sepcial case for fp16 (uint16_t) +template<> +infiniopStatus_t pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x) { + auto y_ = reinterpret_cast(workspace); + auto x_ = reinterpret_cast(x); + std::fill(y_, y_ + desc->y_size, 0); + + _pooling_cpu(desc, y_ + desc->y_size, workspace_size, y_, x_); + + // copy data from y_ to y + auto y_16 = reinterpret_cast(y); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y_16[i] = f32_to_f16(y_[i]); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuPooling(PoolingCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream) { + if 
(desc->dt == F16) { + return pooling_cpu(desc, workspace, workspace_size, y, x); + } + if (desc->dt == F32) { + return pooling_cpu(desc, workspace, workspace_size, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/pooling/cpu/pooling_cpu.h b/src/ops/pooling/cpu/pooling_cpu.h new file mode 100644 index 00000000..5f70f82c --- /dev/null +++ b/src/ops/pooling/cpu/pooling_cpu.h @@ -0,0 +1,48 @@ +#ifndef __CPU_POOLING_H__ +#define __CPU_POOLING_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include + +struct PoolingCpuDescriptor { + Device device; + DataLayout dt; + uint64_t ndim; + uint64_t y_size; + uint64_t padded_x_size; + uint64_t const *x_shape; + uint64_t const *k_shape; + uint64_t const *y_shape; + uint64_t const *pads; + int64_t const *strides; + int pooling_mode; +}; + +typedef struct PoolingCpuDescriptor *PoolingCpuDescriptor_t; + +infiniopStatus_t cpuCreatePoolingDescriptor(infiniopHandle_t handle, + PoolingCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +infiniopStatus_t cpuGetPoolingWorkspaceSize(PoolingCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuPooling(PoolingCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream); + +infiniopStatus_t cpuDestroyPoolingDescriptor(PoolingCpuDescriptor_t desc); + +#endif diff --git a/src/ops/pooling/cuda/pooling.cc b/src/ops/pooling/cuda/pooling.cc new file mode 100644 index 00000000..0cf45d64 --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cc @@ -0,0 +1,167 @@ +#include "pooling.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include + +infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, + PoolingCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != n + 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || x->shape[1] != y->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (pooling_type > 1) { + return STATUS_BAD_PARAM; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + float alpha = 1.0f, beta = 0.0f; + + if (ndim <= 4) { + + int xn = x->shape[0]; + int xc = x->shape[1]; + int xh = ndim == 3 ? 1 : x->shape[2]; + int xw = ndim == 3 ? x->shape[2] : x->shape[3]; + int yh = ndim == 3 ? 1 : y->shape[2]; + int yw = ndim == 3 ? y->shape[2] : y->shape[3]; + const auto kernel_ = reinterpret_cast(kernel_shape); + const auto pads_ = reinterpret_cast(pads); + const auto strides_ = reinterpret_cast(strides); + int kh = ndim == 3 ? 1 : kernel_[0]; + int kw = ndim == 3 ? kernel_[0] : kernel_[1]; + int ph = ndim == 3 ? 0 : pads_[0]; + int pw = ndim == 3 ? pads_[0] : pads_[1]; + int sh = ndim == 3 ? 1 : strides_[0]; + int sw = ndim == 3 ? 
strides_[0] : strides_[1]; + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), xn, xc, xh, xw)); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPooling2dDescriptor(pool_desc, + getPoolingMode(pooling_type), + CUDNN_NOT_PROPAGATE_NAN, + kh,// pooling window height + kw,// pooling window width + ph,// vertical padding + pw,// horizontal padding + sh,// vertical Stride + sw // horizontal stride + )); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPooling2dForwardOutputDim(pool_desc, x_desc, &xn, &xc, &yh, &yw)); + checkCudnnError(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), xn, xc, yh, yw)); + + *desc_ptr = new PoolingCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + } else { + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads_int(ndim - 2); + std::vector strides_int(ndim - 2); + +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_shape[i] = static_cast(x->shape[i]); + x_strides[i] = static_cast(x->strides[i]); + y_shape[i] = static_cast(y->shape[i]); + y_strides[i] = static_cast(y->strides[i]); + if (i < ndim - 2) { + k_shape[i] = static_cast(kernel_shape[i]); + pads_int[i] = static_cast(pads[i]); + strides_int[i] = static_cast(strides[i]); + } + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPoolingNdDescriptor(pool_desc, + getPoolingMode(pooling_type), + CUDNN_NOT_PROPAGATE_NAN, + ndim - 2, + k_shape.data(), + pads_int.data(), + strides_int.data())); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); + + *desc_ptr = new PoolingCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + return STATUS_SUCCESS; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetPoolingWorkspaceSize(PoolingCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t 
cudaDestroyPoolingDescriptor(PoolingCudaDescriptor_t desc) { + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyPoolingDescriptor(desc->pool_desc)); + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/pooling/cuda/pooling.cu b/src/ops/pooling/cuda/pooling.cu new file mode 100644 index 00000000..bac683c5 --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cu @@ -0,0 +1,20 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "pooling.cuh" + +infiniopStatus_t pooling_nv_gpu(PoolingCudaDescriptor_t desc, void *y, void const *x, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnPoolingForward(handle, desc->pool_desc, + &desc->alpha, desc->x_desc, x, &desc->beta, + desc->y_desc, y); })); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaPooling(PoolingCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream) { + if (desc->dtype == F16 || desc->dtype == F32) { + return pooling_nv_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/pooling/cuda/pooling.cuh b/src/ops/pooling/cuda/pooling.cuh new file mode 100644 index 00000000..dd080e1e --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cuh @@ -0,0 +1,54 @@ +#ifndef __CUDA_POOLING_H__ +#define __CUDA_POOLING_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +struct PoolingCudaDescriptor { + Device device; + DT dtype; + int device_id; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnPoolingDescriptor_t const pool_desc; + const float alpha; + const float beta; +}; + +typedef struct PoolingCudaDescriptor *PoolingCudaDescriptor_t; + +infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, + PoolingCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +infiniopStatus_t cudaGetPoolingWorkspaceSize(PoolingCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaPooling(PoolingCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream); + +infiniopStatus_t cudaDestroyPoolingDescriptor(PoolingCudaDescriptor_t desc); + +inline cudnnPoolingMode_t getPoolingMode(int pooling_type) { + switch (pooling_type) { + case 0: + return CUDNN_POOLING_MAX; + case 1: + return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + default: + return CUDNN_POOLING_MAX; + } +} + +#endif// __CUDA_POOLING_H__ diff --git a/src/ops/pooling/operator.cc b/src/ops/pooling/operator.cc new file mode 100644 index 00000000..4772be52 --- /dev/null +++ b/src/ops/pooling/operator.cc @@ -0,0 +1,101 @@ +#include "../utils.h" +#include "operators.h" +#include "pooling.h" + +#ifdef ENABLE_CPU +#include "cpu/pooling_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/pooling.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO +#endif + +__C infiniopStatus_t infiniopCreatePoolingDescriptor( + infiniopHandle_t handle, + infiniopPoolingDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + 
infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreatePoolingDescriptor(handle, (PoolingCpuDescriptor_t *) desc_ptr, y, x, kernel_shape, pads, strides, n, pooling_type); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreatePoolingDescriptor((CudaHandle_t) handle, (PoolingCudaDescriptor_t *) desc_ptr, y, x, kernel_shape, pads, strides, n, pooling_type); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetPoolingWorkspaceSize(infiniopPoolingDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetPoolingWorkspaceSize((PoolingCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetPoolingWorkspaceSize((PoolingCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopPooling(infiniopPoolingDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuPooling((PoolingCpuDescriptor_t) desc, workspace, workspace_size, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaPooling((PoolingCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyPoolingDescriptor(infiniopPoolingDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyPoolingDescriptor((PoolingCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyPoolingDescriptor((PoolingCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/pooling/pooling.h b/src/ops/pooling/pooling.h new file mode 100644 index 00000000..b57856f0 --- /dev/null +++ b/src/ops/pooling/pooling.h @@ -0,0 +1,27 @@ +#ifndef POOLING_H +#define POOLING_H + +#include "export.h" +#include "operators.h" + +typedef struct PoolingDescriptor { + Device device; +} PoolingDescriptor; +typedef PoolingDescriptor *infiniopPoolingDescriptor_t; + +__C infiniopStatus_t infiniopCreatePoolingDescriptor(infiniopHandle_t handle, + infiniopPoolingDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +__C infiniopStatus_t infiniopGetPoolingWorkspaceSize(infiniopPoolingDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopPooling(infiniopPoolingDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C infiniopStatus_t infiniopDestroyPoolingDescriptor(infiniopPoolingDescriptor_t desc); +#endif diff --git a/src/ops/random_sample/ascend/random_sample.cc b/src/ops/random_sample/ascend/random_sample.cc new file mode 100644 index 00000000..b16159dc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.cc @@ -0,0 +1,153 @@ +#include "random_sample.h" + +RandomSampleAscendDescriptor::RandomSampleAscendDescriptor(Device _device) { + device = _device; + device_id = 0; + pDesc = new 
aclnnTensorDescriptor(); + topkIdxDesc = new aclnnTensorDescriptor(); + topkValDesc = new aclnnTensorDescriptor(); + resDesc = new aclnnTensorDescriptor(); +} + +infiniopStatus_t ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + if (result->ndim != 1 && result->shape[0] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr) = new RandomSampleAscendDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); + CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); + // Ascend aclnnTopk doesn't support U64 type + (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size) { + auto &pDesc = desc->pDesc; + *size = numElements(pDesc->shape.data(), pDesc->ndim) * aclDataTypeSize(pDesc->dataType) + + numElements(pDesc->shape.data(), pDesc->ndim) * sizeof(I64); + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (topk <= 0 || topp < 0 || topp > 1.0) { + return STATUS_BAD_PARAM; + } + + if (random_val < 0 || random_val > 1.0) { + return STATUS_BAD_PARAM; + } + + auto &pDesc = desc->pDesc; + auto &topkIdxDesc = desc->topkIdxDesc; + auto &topkValDesc = desc->topkValDesc; + auto ndim = static_cast(pDesc->ndim); + auto voc = pDesc->shape[0]; + auto topk_ = topk <= voc ? topk : voc; + bool doSample = topk_ > 1 && temperature != 0 && topp != 0; + + auto topkShape = std::vector(pDesc->shape); + topkShape[ndim - 1] = doSample ? topk_ : 1; + + auto topkStrides = std::vector(pDesc->strides); + // Infer contiguous strides + topkStrides[ndim - 1] = 1; + for (int64_t i = ndim - 2; i >= 0; --i) { + topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1]; + } + + CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), STATUS_SUCCESS); + + // Infer data ptr + auto workspaceTmp = workspace; + auto topkValAddr = workspaceTmp; + workspaceTmp = (void *) ((uint8_t *) workspace + + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType)); + auto topkIdxAddr = workspaceTmp; + auto pAddr = (void *) probs; + + // Create aclTensor + CHECK_STATUS(pDesc->createTensor(pAddr), STATUS_SUCCESS); + CHECK_STATUS(topkValDesc->createTensor(topkValAddr), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), STATUS_SUCCESS); + if (!doSample) { + CHECK_STATUS(desc->resDesc->createTensor(result), STATUS_SUCCESS); + } + + // Do Topk calculate + uint64_t topkWorkspaceSize = 0; + aclOpExecutor *topkExecutor = nullptr; + auto ret = aclnnTopkGetWorkspaceSize(pDesc->t, + topkShape[ndim - 1], + ndim - 1, + true, + true, + topkValDesc->t, + doSample ? 
topkIdxDesc->t + : desc->resDesc->t, + &topkWorkspaceSize, + &topkExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + void *topkWorkspace; + CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), STATUS_SUCCESS); + ret = aclnnTopk(topkWorkspace, + topkWorkspaceSize, + topkExecutor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + CHECK_STATUS(freeWorkspace(topkWorkspace), STATUS_SUCCESS); + + if (doSample) { + // Do softmax and topp random sample + CHECK_STATUS(random_sample_do( + pAddr, + result, + topkValAddr, + topkIdxAddr, + topk, + static_cast(pDesc->shape[0]), + topp, + temperature, + random_val, + pDesc->dataType, + stream), + STATUS_SUCCESS); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc) { + delete desc->pDesc; + delete desc->topkIdxDesc; + delete desc->topkValDesc; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/ascend/random_sample.h b/src/ops/random_sample/ascend/random_sample.h new file mode 100644 index 00000000..1ecc16fc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.h @@ -0,0 +1,52 @@ +#ifndef __ASCEND_RANDOM_SAMPLE_H__ +#define __ASCEND_RANDOM_SAMPLE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include + + +struct RandomSampleAscendDescriptor { + Device device; + int device_id; + aclnnTensorDescriptor_t pDesc; + aclnnTensorDescriptor_t topkValDesc; + aclnnTensorDescriptor_t topkIdxDesc; + aclnnTensorDescriptor_t resDesc; + RandomSampleAscendDescriptor(Device _device); +}; + +typedef struct RandomSampleAscendDescriptor *RandomSampleAscendDescriptor_t; + +infiniopStatus_t ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc); + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream); + +#endif diff --git a/src/ops/random_sample/ascend/random_sample_kernel.cpp b/src/ops/random_sample/ascend/random_sample_kernel.cpp new file mode 100644 index 00000000..18b482bc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample_kernel.cpp @@ -0,0 +1,232 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" + +using namespace AscendC; + +template +class KernelRandomSample { +public: + __aicore__ inline KernelRandomSample() {} + __aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + + topk = topk_; + voc = voc_; + topp = topp_; + temperature = temper_; + random = random_; + blockSize = 256 * 2; + + // 
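The stride handling above fills topkStrides with a reverse scan so that the temporary top-k value and index tensors are dense. The same inference, pulled out as a stand-alone illustrative helper (not part of the patch):

```C++
#include <cstdint>
#include <vector>

// Illustrative helper (not in the patch): row-major strides for a dense tensor,
// equivalent to the reverse scan used above for the aclnnTopk value/index tensors.
std::vector<int64_t> contiguousStrides(const std::vector<int64_t> &shape) {
    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = static_cast<int64_t>(shape.size()) - 2; i >= 0; --i) {
        strides[i] = strides[i + 1] * shape[i + 1];
    }
    return strides;
}
```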
CumSumInfo + if (sizeof(T) == sizeof(float)) { + topkAligned = (topk + 7) / 8 * 8; + vocAligned = (voc + 7) / 8 * 8; + } else { + topkAligned = (topk + 15) / 16 * 16; + vocAligned = (voc + 15) / 16 * 16; + } + topkIdxAligned = (topk + 3) / 4 * 4; + + // Set Gm + pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc); + topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk); + topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk); + resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1); + + // Global input and output + pipe.InitBuffer(pQue, 1, vocAligned * sizeof(T)); + pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T)); + pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t)); + pipe.InitBuffer(resQue, 1, 32);// 32 bytes for aligned + + pipe.InitBuffer(softMaxBuf1, blockSize); + pipe.InitBuffer(softMaxBuf2, blockSize); + pipe.InitBuffer(softMaxBuf3, blockSize); + pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T)); + + pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + // Softmax + __aicore__ inline void SoftMax(LocalTensor &valIn, + LocalTensor &topkValIn, + LocalTensor &softMaxOut) { + int32_t repeatTimes = vocAligned * sizeof(T) / blockSize; + int32_t remainder = vocAligned * sizeof(T) % blockSize / sizeof(T); + int32_t tileLength = blockSize / sizeof(T); + float negMax = -static_cast(topkValIn(0)); + float invTemperature = 1.0f / temperature; + float sum = 0.f; + float sum_s = 0.f; + LocalTensor tmpBuffer = softMaxBuf1.Get(); + LocalTensor tmpBuffer2 = softMaxBuf2.Get(); + LocalTensor tmpBuffer3 = softMaxBuf3.Get(); + for (int32_t i = 0; i < repeatTimes; i++) { + Adds(tmpBuffer, valIn[i * tileLength], static_cast(negMax), tileLength); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), tileLength); + Exp(tmpBuffer3, tmpBuffer2, tileLength); + sum_s = 0.f; + for (int j = 0; j < tileLength; ++j) { + sum_s += static_cast(tmpBuffer3(j)); + } + sum += sum_s; + } + if (remainder != 0) { + Adds(tmpBuffer, valIn[repeatTimes * tileLength], static_cast(negMax), remainder); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), remainder); + Exp(tmpBuffer3, tmpBuffer2, remainder); + sum_s = 0.f; + for (int i = 0; i < remainder; ++i) { + sum_s += static_cast(tmpBuffer3(i)); + } + sum += sum_s; + } + float invSum = 1.0f / sum; + Adds(tmpBuffer, topkValIn, static_cast(negMax), topk); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), topk); + Exp(tmpBuffer3, tmpBuffer2, topk); + Muls(softMaxOut, tmpBuffer3, static_cast(invSum), topk); + } + + // Cumsum + __aicore__ inline void InclusiveSum(LocalTensor &topkValIn, + LocalTensor &topkValOut) { + static constexpr CumSumConfig cumSumConfig{true, false, false}; + LocalTensor lastRowLocal; + CumSum(topkValOut, lastRowLocal, topkValIn, + {1, static_cast(topkAligned)}); + } + + // Random sample + __aicore__ inline void RandomSample(LocalTensor &valIn, + LocalTensor &Index, + LocalTensor &result) { + int end = 0; + for (end = 0; end < topk; end++) { + if (static_cast(valIn(end)) >= topp) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + auto randomVal = random * static_cast(valIn(end - 1)); + for (int i = 0; i < end; i++) { + if (randomVal < static_cast(valIn(i))) { + result(0) = Index(i); + return; + } + } + result(0) = Index(end - 1); + } + + __aicore__ inline void CopyIn() { + LocalTensor pLocal = pQue.AllocTensor(); + 
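For readers less familiar with the tiled AscendC style, the arithmetic carried out by the SoftMax and InclusiveSum stages above can be summarized by a plain host-side reference: the softmax denominator is accumulated over the whole vocabulary, only the top-k numerators are kept, and an inclusive cumulative sum is formed for the later top-p draw. A sketch of that computation in ordinary C++ (float only, illustrative names, not part of the kernel):

```C++
#include <cmath>
#include <cstddef>
#include <vector>

// Hedged reference: temperature softmax with a full-vocabulary denominator,
// evaluated only for the top-k logits, followed by the inclusive cumulative sum.
// `logits` is the whole vocabulary; `topk_vals` holds the k largest logits in
// descending order, so topk_vals[0] is the global maximum subtracted for stability.
std::vector<float> topkSoftmaxCumsum(const std::vector<float> &logits,
                                     const std::vector<float> &topk_vals,
                                     float temperature) {
    const float max_logit = topk_vals[0];
    float denom = 0.f;
    for (float v : logits) {
        denom += std::exp((v - max_logit) / temperature);
    }
    std::vector<float> cum(topk_vals.size());
    float running = 0.f;
    for (std::size_t i = 0; i < topk_vals.size(); ++i) {
        running += std::exp((topk_vals[i] - max_logit) / temperature) / denom;
        cum[i] = running;
    }
    return cum;
}
```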
LocalTensor topkValLocal = topkQue.AllocTensor(); + LocalTensor topkIdxLocal = topkIdxQue.AllocTensor(); + + DataCopy(pLocal, pGm, vocAligned); + DataCopy(topkValLocal, topkGm, topkAligned); + DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned); + + pQue.EnQue(pLocal); + topkQue.EnQue(topkValLocal); + topkIdxQue.EnQue(topkIdxLocal); + } + + __aicore__ inline void Compute() { + // Get input data + LocalTensor pLocal = pQue.DeQue(); + LocalTensor topkValLocal = topkQue.DeQue(); + + // SoftMax + LocalTensor softMaxOutLocal = softMaxOutBuf.Get(); + SoftMax(pLocal, topkValLocal, softMaxOutLocal); + + // InclusiveSum + LocalTensor inclusiveOutLocal = inclusiveSumOutBuf.Get(); + InclusiveSum(softMaxOutLocal, inclusiveOutLocal); + + // randomSample + LocalTensor topkIdxLocal = topkIdxQue.DeQue(); + LocalTensor resultLocal = resQue.AllocTensor(); + RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal); + + pQue.FreeTensor(pLocal); + topkQue.FreeTensor(topkValLocal); + topkIdxQue.FreeTensor(topkIdxLocal); + resQue.EnQue(resultLocal); + } + __aicore__ inline void CopyOut() { + LocalTensor resLocal = resQue.DeQue(); + DataCopy(resGm, resLocal, 32 / sizeof(int64_t)); + resQue.FreeTensor(resLocal); + } + +private: + GlobalTensor pGm; + GlobalTensor topkGm; + GlobalTensor topkIdxGm; + GlobalTensor resGm; + + TPipe pipe; + + TQue pQue; + TQue topkQue; + TQue topkIdxQue; + TQue resQue; + + TBuf softMaxBuf1; + TBuf softMaxBuf2; + TBuf softMaxBuf3; + TBuf softMaxOutBuf; + + TBuf inclusiveSumOutBuf; + + // Kernel params + int32_t topk; + int32_t voc; + float topp; + float temperature; + float random; + + int32_t topkAligned; + int32_t topkIdxAligned; + int32_t vocAligned; + int32_t blockSize; +}; + +extern "C" __global__ __aicore__ void +random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + KernelRandomSample op; + op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_); + op.Process(); +} + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream) { + + switch (dtype) { + case 0: + return STATUS_SUCCESS; + case 1: + random_sample_kernel_f16<<<1, nullptr, stream>>>( + p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.cc b/src/ops/random_sample/bang/random_sample_bang.cc new file mode 100644 index 00000000..ed1945da --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.cc @@ -0,0 +1,39 @@ +#include "random_sample_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, + RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(probs->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleBangDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t 
bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)) + sizeof(desc->dtype); + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyRandomSampleDescriptor(RandomSampleBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.h b/src/ops/random_sample/bang/random_sample_bang.h new file mode 100644 index 00000000..de830fbf --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.h @@ -0,0 +1,39 @@ +#ifndef __BANG_RANDOM_SAMPLE_H__ +#define __BANG_RANDOM_SAMPLE_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct RandomSampleBangDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleBangDescriptor *RandomSampleBangDescriptor_t; + +infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, + RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t bangDestroyRandomSampleDescriptor(RandomSampleBangDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/bang/random_sample_bang.mlu b/src/ops/random_sample/bang/random_sample_bang.mlu new file mode 100644 index 00000000..eb6f636f --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.mlu @@ -0,0 +1,512 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "random_sample_bang.h" +#include "../../../devices/bang/common_bang.h" +#include + +const int SRC_MAX_SIZE = 1024 * 32; +__nram__ char nram_buffer[NRAM_MAX_SIZE]; +template +__mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + int wSize = 128 / sizeof(T); + int segNum = maxNum / wSize; + + T temInv = 1.0 / static_cast(temperature); + + int remainT = voc % taskDim; + int stepEasy = (voc - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? 
taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + taskDim * topk) * sizeof(T); + uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum],必须要求maxNum >= max{step, topk} + uint64_t *indGlobal = srcInd + maxNum;//[taskDim * topk] + + __sync_all(); + + T *src = (T *)nram_buffer;//[maxNum],必须要求maxNum >= max{step, topk} + T *destSum = src + maxNum;//[maxNum] + T *destSumFinal = destSum + maxNum;//[wSize] + T *srcGlobal = destSumFinal + wSize;//[taskDim * topk] + __bang_write_value(src, maxNum, -INFINITY); + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + + + + if(step){ + for(int i = 0; i < step; i++){ + srcInd[i] = indStart + i; + } + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + if(step >= topk){ + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < step; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + } + } + else{ + for(int i = step; i < topk; i++){ + src[i] = -INFINITY; + srcInd[i] = -1; + } + } + __memcpy(globalTopk + taskId * topk, src, topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram + taskId * topk, srcInd, topk * sizeof(uint64_t), NRAM2GDRAM); + __sync_all(); + } + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); + __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < taskDim * topk; j++){ + if(srcGlobal[i] < srcGlobal[j]){ + T tmpg = srcGlobal[i]; + srcGlobal[i] = srcGlobal[j]; + srcGlobal[j] = tmpg; + + uint64_t indexTmpg = indGlobal[i]; + indGlobal[i] = indGlobal[j]; + indGlobal[j] = indexTmpg; + } + } + } + __memcpy(globalTopk, srcGlobal, taskDim * topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram, indGlobal, taskDim * topk * sizeof(uint64_t), NRAM2GDRAM); + } + __sync_all(); + T globalM = globalTopk[0]; + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + if(step){ + __bang_write_value(src, maxNum, globalM); + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, maxNum); + __bang_mul_scalar(src, src, temInv, maxNum); + __bang_active_exp_less_0(src, src, maxNum); + __bang_add(destSum, destSum, src, maxNum); + } + if(maxNum >= wSize){ + for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 + for(int i = 0; i < strip ; i++){ + __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); + } + } + + __bang_reduce_sum(destSumFinal, destSum, wSize); + } + else{ + for(int i = 0; i < maxNum; i++){ + destSumFinal[0] += destSum[i]; + } + } + if(step){ + destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 + } + globalSum[0] = 0.0; + + __sync_all(); + __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 + + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 + + + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); + __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); + + __bang_write_zero(destSum, 2 * topk); + destSum[0] = srcGlobal[0]; + for(int i = 1; i < topk; i++){ + destSum[i] = 
destSum[i - 1] + srcGlobal[i]; + } + + int end = 0; + for(end = 0; end < topk; end++){ + if(destSum[end] >= static_cast(topp)){ + break; + } + } + if(end < topk - 1){ + end += 1; + } + else{ + end = topk; + } + + random_val *= destSum[end - 1]; + for(int i = 0; i < end; i++){ + if(random_val < destSum[i]){ + indices[0] = indGdram[i]; + break; + } + } + __memcpy(globalTopk, srcGlobal, topk * sizeof(T), NRAM2GDRAM); + } +} + +template +__mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + + int wSize = 128 / sizeof(T); + int segNum = maxNum / wSize; + + T temInv = 1.0 / static_cast(temperature); + int taskSize = taskDim * maxNum; + int remain = voc % taskSize; + int repeat = (voc - remain) / taskSize; + + int remainT = remain % taskDim; + int stepEasy = (remain - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + 2 * topk + taskDim * topk) * sizeof(T); + uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum] + uint64_t *topkInd = srcInd + maxNum;//[2 * topk] + uint64_t *indGlobal = topkInd + 2 * topk; + __bang_write_zero(topkInd, 2 * topk); + + T *src = (T *)nram_buffer;//[maxNum] + T *srcTopk = src + maxNum;//[2 * topk] + T *destSum = srcTopk + 2 * topk;//[maxNum] + T *destSumFinal = destSum + maxNum;//[wSize] + T *srcGlobal = destSumFinal + wSize;//[taskDim * topk] + for(int i = 0; i < 2 * topk; i++){ + srcTopk[i] = -INFINITY;//不能使用__bang_write_value + } + for(int j = 0; j < maxNum; j++){ + srcInd[j] = taskId * maxNum + j; + } + for(int r = 0; r < repeat; r++){ + if(r > 0){ + __bang_add_scalar(srcInd, srcInd, taskSize, maxNum);//每次都在上一次基础上增加taskSize + } + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < maxNum; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + + } + for(int i = 0; i < topk; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < 2 * topk; j++){ + if(srcTopk[i] < srcTopk[j]){ + T tmpk = srcTopk[i]; + srcTopk[i] = srcTopk[j]; + srcTopk[j] = tmpk; + + uint64_t indexTmpk = topkInd[i]; + topkInd[i] = topkInd[j]; + topkInd[j] = indexTmpk; + } + } + } + + } + if(step){ + for(int j = 0; j < step; j++){ + srcInd[j] = repeat * taskSize + indStart + j; + } + __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); + if(step >= topk){ + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < step; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + + } + for(int i = 0; i < topk; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + } + else{ + for(int i = 0; i < step; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + } + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < 2 * topk; j++){ + if(srcTopk[i] < srcTopk[j]){ + T tmpk = srcTopk[i]; + srcTopk[i] = srcTopk[j]; + srcTopk[j] = tmpk; + 
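The selection in random_sampleX and random_sampleD is a two-level reduction: each task keeps a local top-k of its slice with an in-place selection sort in NRAM, and task 0 then merges the taskDim * topk survivors into the global top-k in GDRAM. A compact host-side sketch of the same pattern, with std::partial_sort standing in for the kernel's selection sort and plain loops standing in for MLU tasks:

```C++
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// Hedged sketch of the two-level reduction above, assuming k >= 1.
// A candidate is a (logit, vocabulary index) pair.
using Cand = std::pair<float, uint64_t>;

std::vector<Cand> globalTopK(const std::vector<float> &logits, int task_dim, std::size_t k) {
    std::vector<Cand> merged;
    const std::size_t chunk = (logits.size() + task_dim - 1) / task_dim;
    for (int t = 0; t < task_dim; ++t) {
        // "per-task" phase: local top-k of this task's slice
        const std::size_t begin = t * chunk;
        const std::size_t end = std::min(logits.size(), begin + chunk);
        std::vector<Cand> local;
        for (std::size_t i = begin; i < end; ++i) {
            local.push_back({logits[i], i});
        }
        const std::size_t kept = std::min(k, local.size());
        std::partial_sort(local.begin(), local.begin() + kept, local.end(), std::greater<Cand>());
        merged.insert(merged.end(), local.begin(), local.begin() + kept);
    }
    // "task 0" phase: merge all local winners into the global top-k
    const std::size_t kept = std::min(k, merged.size());
    std::partial_sort(merged.begin(), merged.begin() + kept, merged.end(), std::greater<Cand>());
    merged.resize(kept);
    return merged;
}
```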
+ uint64_t indexTmpk = topkInd[i]; + topkInd[i] = topkInd[j]; + topkInd[j] = indexTmpk; + } + } + } + } + + __memcpy(globalTopk + taskId * topk, srcTopk, topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram + taskId * topk, topkInd, topk * sizeof(uint64_t), NRAM2GDRAM); + __sync_all(); + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); + __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < taskDim * topk; j++){ + if(srcGlobal[i] < srcGlobal[j]){ + T tmpg = srcGlobal[i]; + srcGlobal[i] = srcGlobal[j]; + srcGlobal[j] = tmpg; + + uint64_t indexTmpg = indGlobal[i]; + indGlobal[i] = indGlobal[j]; + indGlobal[j] = indexTmpg; + } + } + } + __memcpy(globalTopk, srcGlobal, taskDim * topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram, indGlobal, taskDim * topk * sizeof(uint64_t), NRAM2GDRAM); + } + __sync_all(); + //下面开始做类似于softmax变换 + T globalM = globalTopk[0]; + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + for(int r = 0; r < repeat; r++){ + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, maxNum); + __bang_mul_scalar(src, src, temInv, maxNum); + __bang_active_exp_less_0(src, src, maxNum); + __bang_add(destSum, destSum, src, maxNum); + } + if(step){ + __bang_write_zero(src, maxNum); + __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, step); + __bang_mul_scalar(src, src, temInv, step); + __bang_active_exp_less_0(src, src, step); + __bang_add(destSum, destSum, src, maxNum); + } + if(maxNum >= wSize){ + for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 + for(int i = 0; i < strip ; i++){ + __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); + } + } + for(int i = 0; i < wSize; i++){ + + destSumFinal[0] += destSum[i];//__bang_reduce_sum失效,只能手动reduce + } + } + + else{ + for(int i = 0; i < maxNum; i++){ + + destSumFinal[0] += destSum[i]; + } + + } + + globalSum[0] = 0.0; + + __sync_all(); + __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 + + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 + + + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); + __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); + + __bang_write_zero(srcTopk, 2 * topk); + srcTopk[0] = srcGlobal[0]; + for(int i = 1; i < topk; i++){ + srcTopk[i] = srcTopk[i - 1] + srcGlobal[i]; + } + + int end = 0; + for(end = 0; end < topk; end++){ + if(srcTopk[end] >= static_cast(topp)){ + break; + } + } + if(end < topk - 1){ + end += 1; + } + else{ + end = topk; + } + + random_val *= srcTopk[end - 1]; + for(int i = 0; i < end; i++){ + if(random_val < srcTopk[i]){ + indices[0] = indGdram[i]; + break; + } + } + __memcpy(globalTopk, srcGlobal, topk * sizeof(T), NRAM2GDRAM); + } +} +template +__mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t *indGdram, int voc){ + const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T); + + uint64_t taskSize = taskDim * maxNum; + uint64_t remain = voc % taskSize; + uint64_t repeat = (voc - remain) / taskSize; + + uint64_t remainT = remain % taskDim; + uint64_t 
stepEasy = (remain - remainT) / taskDim; + uint64_t stepHard = stepEasy + 1; + uint64_t step = (taskId < remainT ? stepHard : stepEasy); + uint64_t indStart = repeat * taskSize + (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + T *src = (T *)nram_buffer; + T *srcMax = src + maxNum; + uint64_t index = 0; + + T newMax = -INFINITY; + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); + if(newMax < srcMax[0]){ + newMax = srcMax[0]; + index = r * taskSize + taskId * maxNum + *((int64_t*)&srcMax[1]); + } + + } + if(step){ + __bang_write_value(src, maxNum, -INFINITY); + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); + if(newMax < srcMax[0]){ + newMax = srcMax[0]; + index = indStart + *((int64_t*)&srcMax[1]); + } + + } + + indGdram[taskId] = index; + __sync_all(); + if(taskId == 0){ + uint64_t globalInd = indGdram[0]; + T globalM = source[globalInd]; + for(uint64_t id = 0; id < taskDim; id++){ + if(globalM < source[indGdram[id]]){ + globalM = source[indGdram[id]]; + globalInd = indGdram[id]; + } + } + indices[0] = globalInd; + } +} +template +void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, void *indices, float random_val, float topp, int topk, float temperature, int voc) { + auto logits_ = reinterpret_cast(source); + auto index_ = reinterpret_cast(indices); + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + + int taskNum = k_dim.x * k_dim.y * k_dim.z; + if(topp > 0 && topk > 1){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + char *origin = reinterpret_cast(workspace); + char *indTmp = origin + taskNum * topk * sizeof(uint64_t); + uint64_t *indGdram = (uint64_t *)origin; + T *globalTopk = (T *)indTmp; + T *globalSum = globalTopk + taskNum * topk; + + if(voc >= taskNum * maxNum){ + random_sampleD<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); + } + else{ + random_sampleX<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); + } + } + else{ + uint64_t *indGdram = reinterpret_cast(workspace); + random_sample<<>>(logits_, index_, indGdram, voc); + } + cnrtQueueSync(queue); + + +} + +void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + auto queue = reinterpret_cast(stream); + int voc = desc->voc; + + random_sampleUnion(queue, workspace, probs, result, random_val, topp, topk, temperature, voc); +} +infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_bang_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc new file mode 100644 index 00000000..28de5b93 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample.cc @@ -0,0 +1,185 @@ 
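Before the CPU implementation that follows, it helps to spell out the sampling rule every backend in this patch applies once the cumulative top-k probabilities are available: keep the shortest prefix of the descending top-k whose cumulative mass reaches topp (or all k tokens if none does), rescale the uniform random value by that prefix's mass, and return the first token whose cumulative probability exceeds the rescaled value. A hedged reference version is below; the final fallback mirrors the Ascend kernel, while the other backends leave that edge case implicit.

```C++
#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged reference: top-p cut-off and weighted draw over the cumulative top-k
// probabilities. `cum` is the inclusive cumulative distribution of the top-k
// tokens in descending probability order (k >= 1), `idx` their vocabulary
// indices, `random_val` a uniform draw in [0, 1).
uint64_t sampleTopP(const std::vector<float> &cum, const std::vector<uint64_t> &idx,
                    float topp, float random_val) {
    std::size_t end = 0;
    while (end < cum.size() && cum[end] < topp) {
        ++end;// first index whose cumulative mass reaches topp
    }
    end = (end < cum.size() - 1) ? end + 1 : cum.size();// include that token, or keep all k
    const float threshold = random_val * cum[end - 1];// rescale the draw to the kept mass
    for (std::size_t i = 0; i < end; ++i) {
        if (threshold < cum[i]) {
            return idx[i];
        }
    }
    return idx[end - 1];// fallback as in the Ascend kernel
}
```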
+#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include "random_sample_cpu.h" +#include + + +infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, + RandomSampleCpuDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + int ndim = probs->ndim; + if (ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(probs->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleCpuDescriptor{ + DevCpu, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRandomSampleDescriptor(RandomSampleCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + + +void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc, + void *workspace, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature) { + int voc = desc->voc; + char *origin = reinterpret_cast(workspace); + //排序得到前k个最大值,按照从大到小顺序存储在logits_前k个位置里面 + char *logitsTmp = origin + voc * sizeof(uint64_t); + uint64_t *indexTmp = (uint64_t *) origin; + uint16_t *logits_ = (uint16_t *) logitsTmp; + + + auto source = reinterpret_cast(probs); + + std::copy(source, source + voc, logits_); + auto index_ = reinterpret_cast(result); + + // 如果k大于voc,调整k为voc + if (topk > voc) { + topk = voc; + } + + for (int i = 0; i < voc; i++) { + indexTmp[i] = i; + } + for (int i = 0; i < topk; i++) { + for (int j = i + 1; j < voc; j++) { + if (f16_to_f32(logits_[i]) < f16_to_f32(logits_[j])) { + float M = f16_to_f32(logits_[i]); + logits_[i] = logits_[j]; + logits_[j] = f32_to_f16(M); + + + int index = indexTmp[i]; + indexTmp[i] = indexTmp[j]; + indexTmp[j] = index; + } + } + } + + //做类似于softmax的temperature变换 + float reduceM = f16_to_f32(logits_[0]); + float reduceS = 0.0f; + for (int i = 0; i < voc; i++) { + reduceS += std::exp((f16_to_f32(logits_[i]) - reduceM) / temperature); + } + for (int i = 0; i < voc; i++) { + logits_[i] = f32_to_f16(std::exp((f16_to_f32(logits_[i]) - reduceM) / temperature) / reduceS); + } + //在前k个元素里面利用topp选取不超过topp的元素作为数据集 + float tmp = 0.0f; + int end = 0; + for (end = 0; end < topk; end++) { + tmp += f16_to_f32(logits_[end]); + if (tmp >= topp) { + break; + } + } + //printf("%d\n", end); + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + //利用随机数随机输出满足同时满足topk,topp的某个元素在原始向量的index + + float sum_s = 0.0f; + for (int i = 0; i < end; i++) { + sum_s += f16_to_f32(logits_[i]); + } + random_val *= sum_s; + + sum_s = 0.0f; + for (int i = 0; i < end; i++) { + sum_s += f16_to_f32(logits_[i]); + if (random_val < sum_s) { + index_[0] = indexTmp[i]; + break; + } + } +} +void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc, + void *workspace, + void *result, + void const *probs) { + int voc = desc->voc; + auto index_ = reinterpret_cast(result); + auto source = reinterpret_cast(probs); + + char *origin = reinterpret_cast(workspace); + uint16_t *logits_ = (uint16_t *) origin; + + std::copy(source, source + voc, logits_); + + float M = f16_to_f32(logits_[0]); + int index = 0; + for (int j = 1; j < voc; j++) { + if (M < 
f16_to_f32(logits_[j])) { + M = f16_to_f32(logits_[j]); + index = j; + } + } + + index_[0] = index; +} + +infiniopStatus_t cpuRandomSample(RandomSampleCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + if (topp > 0 && topk > 1) { + random_sample_cpu_f16(desc, + workspace, + result, + probs, + random_val, + topp, + topk, + temperature); + } else { + random_sample_cpu_f16(desc, + workspace, + result, + probs); + } + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cpu/random_sample_cpu.h b/src/ops/random_sample/cpu/random_sample_cpu.h new file mode 100644 index 00000000..b4b501be --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.h @@ -0,0 +1,34 @@ +#ifndef __CPU_RANDOM_SAMPLE_H__ +#define __CPU_RANDOM_SAMPLE_H__ + +#include "operators.h" +struct RandomSampleCpuDescriptor { + Device device; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleCpuDescriptor *RandomSampleCpuDescriptor_t; + +infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, + RandomSampleCpuDescriptor_t *, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRandomSample(RandomSampleCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t cpuDestroyRandomSampleDescriptor(RandomSampleCpuDescriptor_t desc); + +#endif diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu new file mode 100644 index 00000000..12bc03b2 --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -0,0 +1,180 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "random_sample.cuh" +#include +#include + +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= 
static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, cudaStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + cudaStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, + stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, cudaStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + int block_dim = MAX_THREADS_PER_BLOCK; + int num_blocks = ROUND_UP_DIV(voc, block_dim); + index<<>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (cudaStream_t) stream); + void *workspace_extra; + cudaMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (cudaStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, + key_out); + } + cudaFree(workspace_extra); +} + +infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh new file mode 100644 index 00000000..d3fff76d --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -0,0 +1,38 @@ +#ifndef __CUDA_RANDOM_SAMPLE_H__ +#define __CUDA_RANDOM_SAMPLE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" + +struct RandomSampleCudaDescriptor { + Device device; + int device_id; + DT dtype; 
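A note on the CUB usage above: cub::DeviceRadixSort::SortPairsDescending and cub::DeviceScan::InclusiveSum follow a two-phase convention, where the first call passes a null temporary buffer only to obtain size_radix_sort and size_scan, and the second call does the work with real storage. A self-contained sketch of that convention for the descending key-value sort (illustrative names, error checks omitted):

```C++
#include <cstdint>
#include <cub/cub.cuh>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Hedged sketch of the two-phase CUB convention used above: phase 1 queries the
// temporary-storage size with a null buffer, phase 2 performs the descending
// key-value sort (probabilities are the keys, vocabulary indices the values).
void sortProbsDescending(const half *probs, half *sorted_probs,
                         const uint64_t *idx_in, uint64_t *idx_out,
                         int voc, cudaStream_t stream) {
    size_t temp_bytes = 0;
    cub::DeviceRadixSort::SortPairsDescending(nullptr, temp_bytes,
                                              probs, sorted_probs, idx_in, idx_out,
                                              voc, 0, sizeof(half) * 8, stream);
    void *temp = nullptr;
    cudaMalloc(&temp, temp_bytes);
    cub::DeviceRadixSort::SortPairsDescending(temp, temp_bytes,
                                              probs, sorted_probs, idx_in, idx_out,
                                              voc, 0, sizeof(half) * 8, stream);
    cudaFree(temp);
}
```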
+ int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t; + +infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, + RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t cudaDestroyRandomSampleDescriptor(RandomSampleCudaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc new file mode 100644 index 00000000..022a113b --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -0,0 +1,37 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "random_sample.cuh" + +infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, + RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleCudaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyRandomSampleDescriptor(RandomSampleCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/maca/random_sample_maca.cc b/src/ops/random_sample/maca/random_sample_maca.cc new file mode 100644 index 00000000..1cb0fe74 --- /dev/null +++ b/src/ops/random_sample/maca/random_sample_maca.cc @@ -0,0 +1,37 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "random_sample_maca.h" + +infiniopStatus_t macaCreateRandomSampleDescriptor(MacaHandle_t handle, + RandomSampleMacaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleMacaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRandomSampleWorkspaceSize(RandomSampleMacaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyRandomSampleDescriptor(RandomSampleMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/maca/random_sample_maca.h b/src/ops/random_sample/maca/random_sample_maca.h new file mode 100644 index 00000000..3cf1ab59 --- /dev/null +++ 
b/src/ops/random_sample/maca/random_sample_maca.h @@ -0,0 +1,38 @@ +#ifndef __MACA_RANDOM_SAMPLE_H__ +#define __MACA_RANDOM_SAMPLE_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RandomSampleMacaDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleMacaDescriptor *RandomSampleMacaDescriptor_t; + +infiniopStatus_t macaCreateRandomSampleDescriptor(MacaHandle_t handle, + RandomSampleMacaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t macaGetRandomSampleWorkspaceSize(RandomSampleMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRandomSample(RandomSampleMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t macaDestroyRandomSampleDescriptor(RandomSampleMacaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/maca/random_sample_maca.maca b/src/ops/random_sample/maca/random_sample_maca.maca new file mode 100644 index 00000000..310343fb --- /dev/null +++ b/src/ops/random_sample/maca/random_sample_maca.maca @@ -0,0 +1,180 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "random_sample_maca.h" +#include +#include + +template +__global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, hcStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + hcStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, + stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, hcStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + 
nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleMacaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + index<<<(voc + 1023) / 1024, 1024, 0, (hcStream_t) stream>>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (hcStream_t) stream); + void *workspace_extra; + hcMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (hcStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + int BLOCK_DIM = 1024; + int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (hcStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (hcStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (hcStream_t) stream>>>((uint64_t *) result, + key_out); + } + hcFree(workspace_extra); +} + +infiniopStatus_t macaRandomSample(RandomSampleMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/musa/random_sample_musa.cc b/src/ops/random_sample/musa/random_sample_musa.cc new file mode 100644 index 00000000..70ff941c --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.cc @@ -0,0 +1,37 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "random_sample_musa.h" + +infiniopStatus_t musaCreateRandomSampleDescriptor(MusaHandle_t handle, + RandomSampleMusaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleMusaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRandomSampleWorkspaceSize(RandomSampleMusaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyRandomSampleDescriptor(RandomSampleMusaDescriptor_t desc) { + delete desc; + return 
STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/musa/random_sample_musa.h b/src/ops/random_sample/musa/random_sample_musa.h new file mode 100644 index 00000000..d8839ff1 --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.h @@ -0,0 +1,38 @@ +#ifndef __MUSA_RANDOM_SAMPLE_H__ +#define __MUSA_RANDOM_SAMPLE_H__ + +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" + +struct RandomSampleMusaDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleMusaDescriptor *RandomSampleMusaDescriptor_t; + +infiniopStatus_t musaCreateRandomSampleDescriptor(MusaHandle_t handle, + RandomSampleMusaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t musaGetRandomSampleWorkspaceSize(RandomSampleMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRandomSample(RandomSampleMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t musaDestroyRandomSampleDescriptor(RandomSampleMusaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/musa/random_sample_musa.mu b/src/ops/random_sample/musa/random_sample_musa.mu new file mode 100644 index 00000000..55dbdd0a --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.mu @@ -0,0 +1,184 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "random_sample_musa.h" +#include +#include + +template +__global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, musaStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + musaStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, 
+ stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, musaStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleMusaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + index<<<(voc + 1023) / 1024, 1024, 0, (musaStream_t) stream>>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (musaStream_t) stream); + void *workspace_extra; + musaMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (musaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + int BLOCK_DIM = 1024; + int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (musaStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (musaStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (musaStream_t) stream>>>((uint64_t *) result, + key_out); + } + musaFree(workspace_extra); +} + +infiniopStatus_t musaRandomSample(RandomSampleMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/operator.cc b/src/ops/random_sample/operator.cc new file mode 100644 index 00000000..40a8ec03 --- /dev/null +++ b/src/ops/random_sample/operator.cc @@ -0,0 +1,172 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/random_sample/random_sample.h" + +#ifdef ENABLE_CPU +#include "cpu/random_sample_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "cuda/random_sample.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "bang/random_sample_bang.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/random_sample.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/random_sample_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/random_sample_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) { + switch (handle->device) { 
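+        // Dispatch on the device stored in the handle; each backend below is compiled in
+        // only when its ENABLE_* option is enabled in xmake.lua.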
+#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRandomSampleDescriptor((BangHandle_t) handle, + (RandomSampleBangDescriptor_t *) desc_ptr, result, + probs); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle, + (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRandomSampleDescriptor((MacaHandle_t) handle, + (RandomSampleMacaDescriptor_t *) desc_ptr, result, + probs); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs); +#endif + } + return STATUS_BAD_DEVICE; +}; + +__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size); + // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } 
+#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc); +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.cc b/src/ops/rearrange/ascend/rearrange_aclnn.cc new file mode 100644 index 00000000..f1db82cd --- /dev/null +++ b/src/ops/rearrange/ascend/rearrange_aclnn.cc @@ -0,0 +1,113 @@ +#include "rearrange_aclnn.h" +#include "../../utils.h" + +RearrangeAclnnDescriptor::RearrangeAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + dstDesc = new aclnnTensorDescriptor(); + srcDesc = new aclnnTensorDescriptor(); + workspaceSize = 0; + workspaceAddr = nullptr; +} + +infiniopStatus_t aclnnCreateRearrangeDescriptor(AscendHandle_t handle, + RearrangeAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + *desc_ptr = new RearrangeAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + auto &dstDesc = (*desc_ptr)->dstDesc; + auto &srcDesc = (*desc_ptr)->srcDesc; + + CHECK_STATUS(dstDesc->fromInfiniOpTensorDescriptor(dst), STATUS_SUCCESS); + CHECK_STATUS(srcDesc->fromInfiniOpTensorDescriptor(src), STATUS_SUCCESS); + + // CHECK_STATUS(dstDesc->createTensor(), STATUS_SUCCESS); + // CHECK_STATUS(srcDesc->createTensor(), STATUS_SUCCESS); + + // aclTensor *td = dstDesc->t; + // aclTensor *ts = srcDesc->t; + + // auto &workspaceSize = (*desc_ptr)->workspaceSize; + // auto &executor = (*desc_ptr)->executor; + + // auto ret = aclnnInplaceCopyGetWorkspaceSize(td, + // ts, + // &workspaceSize, + // &executor); + // aclSetAclOpExecutorRepeatable(executor); + // CHECK_RET(ret == ACL_SUCCESS, + // LOG_PRINT("aclnnInplaceCopyGetWorkspaceSize failed. 
ERROR: %d\n", ret); + // return STATUS_EXECUTION_FAILED); + + // (*desc_ptr)->workspaceAddr = mallocWorkspace(workspaceSize); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + /// TODO: something is wrong with aclSetTensorAddr, do all the preparation here for now + desc->dstDesc->t = aclCreateTensor(desc->dstDesc->shape.data(), + desc->dstDesc->ndim, + desc->dstDesc->dataType, + desc->dstDesc->strides.data(), + desc->dstDesc->offset, + desc->dstDesc->format, + desc->dstDesc->storageShape.data(), + desc->dstDesc->storageNdim, + dst); + desc->srcDesc->t = aclCreateTensor(desc->srcDesc->shape.data(), + desc->srcDesc->ndim, + desc->srcDesc->dataType, + desc->srcDesc->strides.data(), + desc->srcDesc->offset, + desc->srcDesc->format, + desc->srcDesc->storageShape.data(), + desc->srcDesc->storageNdim, + (void *) src); + + aclTensor *td = desc->dstDesc->t; + aclTensor *ts = desc->srcDesc->t; + aclOpExecutor *executor; + uint64_t workspaceSize; + aclnnInplaceCopyGetWorkspaceSize(td, + ts, + &workspaceSize, + &executor); + CHECK_STATUS(mallocWorkspace(&(desc->workspaceAddr), workspaceSize), STATUS_SUCCESS); + + + // AclSetTensorAddr(executor, 0, td, dst); + // AclSetTensorAddr(executor, 1, ts, (void *) src); + auto ret = aclnnInplaceCopy(desc->workspaceAddr, + desc->workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnInplaceCopy failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + desc->dstDesc->destroyTensor(); + desc->srcDesc->destroyTensor(); + CHECK_STATUS(freeWorkspace(desc->workspaceAddr), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyRearrangeDescriptor(RearrangeAclnnDescriptor_t desc) { + delete desc->srcDesc; + delete desc->dstDesc; + /// TODO: this aclDestroyAclOpExecutor seems to trigger a double free error + // aclDestroyAclOpExecutor(desc->executor); + // freeWorkspace(desc->workspaceAddr); + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.h b/src/ops/rearrange/ascend/rearrange_aclnn.h new file mode 100644 index 00000000..4b60e4e7 --- /dev/null +++ b/src/ops/rearrange/ascend/rearrange_aclnn.h @@ -0,0 +1,36 @@ +#ifndef __ACLNN_REARRANGE_H__ +#define __ACLNN_REARRANGE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "operators.h" +#include +#include +#include + +struct RearrangeAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclnnTensorDescriptor_t dstDesc, srcDesc; + uint64_t workspaceSize; + void *workspaceAddr; + + RearrangeAclnnDescriptor(Device device); +}; + +typedef struct RearrangeAclnnDescriptor *RearrangeAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateRearrangeDescriptor(AscendHandle_t handle, + RearrangeAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t aclnnDestroyRearrangeDescriptor(RearrangeAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/rearrange/bang/rearrange_bang.cc b/src/ops/rearrange/bang/rearrange_bang.cc new file mode 100644 index 00000000..e846f2d1 --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.cc @@ -0,0 +1,89 @@ +#include "rearrange_bang.h" +#include 
"../../../devices/bang/common_bang.h" +#include "../../utils.h" +#include + +infiniopStatus_t bangCreateRearrangeDescriptor(BangHandle_t handle, + RearrangeBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (decltype(ndim) i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + unsigned int r = 0; + std::vector shape_; + std::vector dst_strides, src_strides; + switch (ndim) { + case 1: + shape_.push_back(dst->shape[0]); + dst_strides.push_back(0); + src_strides.push_back(0); + r = 1; + break; + case 2: + r = dst->shape[0]; + break; + case 3: + r = dst->shape[0] * dst->shape[1]; + break; + default: { + for (size_t i = ndim - 3; i >= 1; --i) { + if (static_cast(dst->shape[i]) * static_cast(dst->strides[i]) != static_cast(dst->strides[i - 1]) || + static_cast(src->shape[i]) * static_cast(src->strides[i]) != static_cast(src->strides[i - 1])) { + return STATUS_BAD_TENSOR_STRIDES; + } + } + r = std::accumulate(dst->shape, dst->shape + ndim - 1, 1, std::multiplies()); + break; + } + } + + for (decltype(ndim) i = 0; i < ndim; ++i) { + shape_.push_back(dst->shape[i]); + dst_strides.push_back(dst->strides[i]); + src_strides.push_back(src->strides[i]); + } + + char *tmpDevice; + CNRT_CHECK(cnrtMalloc((void **) &tmpDevice, ndim * sizeof(uint64_t) + 2 * ndim * sizeof(int64_t))); + char *mlu_stride = tmpDevice + ndim * sizeof(uint64_t); + uint64_t *mlu_shape = (uint64_t *) tmpDevice; + + int64_t *mlu_strides_dst = (int64_t *) mlu_stride; + int64_t *mlu_strides_src = mlu_strides_dst + ndim; + + CNRT_CHECK(cnrtMemcpy(mlu_shape, shape_.data(), ndim * sizeof(uint64_t), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_strides_dst, dst_strides.data(), ndim * sizeof(int64_t), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_strides_src, src_strides.data(), ndim * sizeof(int64_t), cnrtMemcpyHostToDev)); + *desc_ptr = new RearrangeBangDescriptor{ + handle->device, + handle->device_id, + dst->dt, + r, + ndim, + mlu_shape, + mlu_strides_dst, + mlu_strides_src}; + return STATUS_SUCCESS; +} +infiniopStatus_t bangDestroyRearrangeDescriptor(RearrangeBangDescriptor_t desc) { + cnrtFree(desc->mlu_shape); + + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/bang/rearrange_bang.h b/src/ops/rearrange/bang/rearrange_bang.h new file mode 100644 index 00000000..dc64f76a --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.h @@ -0,0 +1,34 @@ +#ifndef __BANG_REARRANGE_H__ +#define __BANG_REARRANGE_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "operators.h" + +struct RearrangeBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t r; + uint64_t ndim; + uint64_t *mlu_shape; + int64_t + *mlu_strides_dst, + *mlu_strides_src; +}; + +typedef struct RearrangeBangDescriptor *RearrangeBangDescriptor_t; + +infiniopStatus_t bangCreateRearrangeDescriptor(BangHandle_t handle, + RearrangeBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t bangRearrange(RearrangeBangDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t bangDestroyRearrangeDescriptor(RearrangeBangDescriptor_t 
desc); + + +#endif// __BANG_REARRANGE_H__ diff --git a/src/ops/rearrange/bang/rearrange_bang.mlu b/src/ops/rearrange/bang/rearrange_bang.mlu new file mode 100644 index 00000000..5c14a516 --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.mlu @@ -0,0 +1,104 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "rearrange_bang.h" +#include "../../../devices/bang/common_bang.h" +#include + +const int SRC_MAX_SIZE = 1024 * 1024 * 128; + +__mlu_global__ void rearrange( + char *dst, + char const *src, + uint64_t *mlu_shape, + int64_t *mlu_strides_dst, + int64_t *mlu_strides_src, + int r, + int ndim, int byteSize){ + const int maxNum = SRC_MAX_SIZE/byteSize; + + int remainT = r % taskDim; + int stepEasy = (r - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + int dimsize = mlu_shape[ndim - 1]; + if(dimsize < maxNum){ + for(int i = indStart; i < indStart + step; i++){ + int tidS = 0; + int tidD = 0; + int indi = i; + for(int j = ndim - 2; j >= 0; --j){ + tidS += (indi % mlu_shape[j]) * mlu_strides_src[j]; + tidD += (indi % mlu_shape[j]) * mlu_strides_dst[j]; + indi /= mlu_shape[j]; + } + __memcpy(dst + tidD * byteSize, src + tidS * byteSize, dimsize * byteSize, GDRAM2GDRAM); + } + + } + else{ + int remain = dimsize % maxNum; + int repeat = (dimsize - remain) / maxNum; + for(int i = indStart; i < indStart + step; i++){ + int tidS = 0; + int tidD = 0; + int indi = i; + for(int j = ndim - 2; j >= 0; --j){ + tidS += (indi % mlu_shape[j]) * mlu_strides_src[j]; + tidD += (indi % mlu_shape[j]) * mlu_strides_dst[j]; + indi /= mlu_shape[j]; + } + for(int index = 0; index < repeat; index++){ + __memcpy(dst + (tidD + index * maxNum) * byteSize, src + (tidS + index * maxNum) * byteSize, maxNum * byteSize, GDRAM2GDRAM); + } + if(remain){ + __memcpy(dst + (tidD + repeat * maxNum) * byteSize, src + (tidS + repeat * maxNum) * byteSize, remain * byteSize, GDRAM2GDRAM); + } + } + + } +} + +void rearrangeUnion(cnrtQueue_t queue, void *destination, void const *source, + uint64_t *mlu_shape, + int64_t *mlu_strides_dst, + int64_t *mlu_strides_src, + int r, + int ndim, int byteSize) { + auto dst = reinterpret_cast< char *>(destination); + auto src = reinterpret_cast(source); + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + + rearrange<<>>(dst, src, mlu_shape, mlu_strides_dst, mlu_strides_src, r, ndim, byteSize); + + cnrtQueueSync(queue); +} + +void rearrange_bang(RearrangeBangDescriptor_t desc, void *dst, + void const *src, + void *stream) { + auto queue = reinterpret_cast(stream); + int r = desc->r; + int ndim = desc->ndim; + int byteSize = desc->dtype.size; + rearrangeUnion(queue, dst, src, desc->mlu_shape, desc->mlu_strides_dst, desc->mlu_strides_src, r, ndim, byteSize); +} +infiniopStatus_t bangRearrange(RearrangeBangDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_bang(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cc b/src/ops/rearrange/cpu/rearrange_cpu.cc new file mode 100644 index 00000000..a5540727 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cc @@ -0,0 +1,100 @@ +#include "rearrange_cpu.h" +#include "../../utils.h" +#include +#include 
+#include + +infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t, + RearrangeCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + if (!dtype_eq(dst->dt, src->dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + std::vector + shape(dst->shape, dst->shape + ndim); + std::vector + strides_dst(dst->strides, dst->strides + ndim), + strides_src(src->strides, src->strides + ndim); + + unsigned int r = 0; + switch (ndim) { + case 1: + ndim = 2; + strides_dst.insert(strides_dst.begin(), shape[0]); + strides_src.insert(strides_src.begin(), shape[0]); + shape.insert(shape.begin(), 1); + case 2: + r = shape[0]; + break; + case 3: + r = shape[0] * shape[1]; + break; + default: + for (int i = ndim - 3; i >= 1; --i) { + if (shape[i] * strides_dst[i] != strides_dst[i - 1] || shape[i] * strides_src[i] != strides_src[i - 1]) { + return STATUS_BAD_TENSOR_STRIDES; + } + } + r = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies{}); + break; + } + *desc_ptr = new RearrangeCpuDescriptor{ + DevCpu, + dst->dt, + r, + shape, + strides_dst, + strides_src, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRearrangeDescriptor(RearrangeCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +inline int indices(uint64_t i, uint64_t ndim, std::vector strides, std::vector shape) { + uint64_t ans = 0; + for (int j = ndim - 2; j >= 0; --j) { + ans += (i % shape[j]) * strides[j]; + i /= shape[j]; + } + return ans; +} + +void reform_cpu(RearrangeCpuDescriptor_t desc, void *dst, void const *src) { + auto dst_ptr = reinterpret_cast(dst); + auto src_ptr = reinterpret_cast(src); + auto ndim = desc->shape.size(); + int bytes_size = desc->shape[ndim - 1] * desc->dt.size; +#pragma omp parallel for + for (uint64_t i = 0; i < desc->r; ++i) { + auto dst_offset = indices(i, ndim, desc->strides_dst, desc->shape); + auto src_offset = indices(i, ndim, desc->strides_src, desc->shape); + std::memcpy(dst_ptr + dst_offset * desc->dt.size, src_ptr + src_offset * desc->dt.size, bytes_size); + } +} + +infiniopStatus_t cpuRearrange(RearrangeCpuDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + reform_cpu(desc, dst, src); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cpu/rearrange_cpu.h b/src/ops/rearrange/cpu/rearrange_cpu.h new file mode 100644 index 00000000..99cc62e6 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_REARRANGE_H__ +#define __CPU_REARRANGE_H__ + +#include "operators.h" +#include +struct RearrangeCpuDescriptor { + Device device; + DataLayout dt; + uint64_t r; + std::vector shape; + std::vector strides_dst; + std::vector strides_src; +}; + +typedef struct RearrangeCpuDescriptor *RearrangeCpuDescriptor_t; + +infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t handle, + RearrangeCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t cpuRearrange(RearrangeCpuDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cpuDestroyRearrangeDescriptor(RearrangeCpuDescriptor_t desc); + +void reform_cpu(RearrangeCpuDescriptor_t desc, void *y, void const 
*x); + +#endif diff --git a/src/ops/rearrange/cuda/rearrange.cc b/src/ops/rearrange/cuda/rearrange.cc new file mode 100644 index 00000000..da23489b --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cc @@ -0,0 +1,70 @@ +#include "rearrange.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include + +infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, + RearrangeCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cuda/rearrange.cu b/src/ops/rearrange/cuda/rearrange.cu new file mode 100644 index 00000000..8f90924c --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cu @@ -0,0 +1,77 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "rearrange.cuh" +#include "../../utils.h" + +template +static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x, + warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} + +void rearrange_nv_gpu(RearrangeCudaDescriptor_t desc, void *y, void const *x, void *stream) { + auto cuda_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + cudaMemcpyAsync(y, x, unit, cudaMemcpyDeviceToDevice, cuda_stream); + return; + } + + auto warps = MAX_THREADS_PER_BLOCK / WARP_SIZE; + auto grid = dim3(ROUND_UP_DIV(c, warps), r); + auto block = dim3(WARP_SIZE, ROUND_UP_DIV(c, grid.x)); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, 
src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t cudaRearrange(RearrangeCudaDescriptor_t desc, + void *dst, void const *src, void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_nv_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cuda/rearrange.cuh b/src/ops/rearrange/cuda/rearrange.cuh new file mode 100644 index 00000000..f31f74b3 --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cuh @@ -0,0 +1,29 @@ +#ifndef __CUDA_REARRANGE_H__ +#define __CUDA_REARRANGE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" + +struct RearrangeCudaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeCudaDescriptor *RearrangeCudaDescriptor_t; + +infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, + RearrangeCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t cudaRearrange(RearrangeCudaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc); + +void rearrange_nv_gpu(RearrangeCudaDescriptor_t, void *y, void const *x, void *stream); +#endif// __CUDA_REARRANGE_H__ diff --git a/src/ops/rearrange/maca/rearrange_maca.cc b/src/ops/rearrange/maca/rearrange_maca.cc new file mode 100644 index 00000000..ac33fe06 --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.cc @@ -0,0 +1,70 @@ +#include "rearrange_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include + +infiniopStatus_t macaCreateRearrangeDescriptor(MacaHandle_t handle, + RearrangeMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t macaDestroyRearrangeDescriptor(RearrangeMacaDescriptor_t desc) { + delete desc; + return 
STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/maca/rearrange_maca.h b/src/ops/rearrange/maca/rearrange_maca.h new file mode 100644 index 00000000..701f55bb --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.h @@ -0,0 +1,29 @@ +#ifndef __MACA_REARRANGE_H__ +#define __MACA_REARRANGE_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RearrangeMacaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeMacaDescriptor *RearrangeMacaDescriptor_t; + +infiniopStatus_t macaCreateRearrangeDescriptor(MacaHandle_t handle, + RearrangeMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t macaRearrange(RearrangeMacaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t macaDestroyRearrangeDescriptor(RearrangeMacaDescriptor_t desc); + +void rearrange_mc_gpu(RearrangeMacaDescriptor_t, void *y, void const *x, void *stream); +#endif// __MACA_REARRANGE_H__ diff --git a/src/ops/rearrange/maca/rearrange_maca.maca b/src/ops/rearrange/maca/rearrange_maca.maca new file mode 100644 index 00000000..b5152c15 --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.maca @@ -0,0 +1,76 @@ +#include "../../../devices/maca/common_maca.h" +#include "rearrange_maca.h" + +template +static __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x; + auto warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} + +void rearrange_mc_gpu(RearrangeMacaDescriptor_t desc, void *y, void const *x, void *stream) { + auto maca_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + hcMemcpyAsync(y, x, unit, hcMemcpyDeviceToDevice, maca_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t macaRearrange(RearrangeMacaDescriptor_t desc, + void *dst, void const *src, void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_mc_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/musa/rearrange_musa.cc b/src/ops/rearrange/musa/rearrange_musa.cc new file mode 100644 index 00000000..5fa2e768 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.cc @@ -0,0 +1,70 @@ 
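+// MUSA (Moore Threads GPU) backend: this file only creates and destroys the rearrange
+// descriptor; the copy kernel itself lives in rearrange_musa.mu.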
+#include "rearrange_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include + +infiniopStatus_t musaCreateRearrangeDescriptor(MusaHandle_t handle, + RearrangeMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t musaDestroyRearrangeDescriptor(RearrangeMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/musa/rearrange_musa.h b/src/ops/rearrange/musa/rearrange_musa.h new file mode 100644 index 00000000..df6ade12 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.h @@ -0,0 +1,30 @@ +#ifndef __MUSA_REARRANGE_H__ +#define __MUSA_REARRANGE_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct RearrangeMusaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeMusaDescriptor *RearrangeMusaDescriptor_t; + +infiniopStatus_t musaCreateRearrangeDescriptor(MusaHandle_t handle, + RearrangeMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t musaRearrange(RearrangeMusaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t musaDestroyRearrangeDescriptor(RearrangeMusaDescriptor_t desc); + +void rearrange_mt_gpu(RearrangeMusaDescriptor *, void *y, void const *x, void *stream); +#endif // __MUSA_REARRANGE_H__ + diff --git a/src/ops/rearrange/musa/rearrange_musa.mu b/src/ops/rearrange/musa/rearrange_musa.mu new file mode 100644 index 00000000..887923b3 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.mu @@ -0,0 +1,81 @@ +#include "../../../devices/musa/common_musa.h" +#include "rearrange_musa.h" + +template +static __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x, + warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} 
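+// Each thread copies one element of the templated memory type: blockIdx.y selects the row,
+// blockIdx.x * blockDim.y + threadIdx.y selects the column, and threadIdx.x indexes the
+// warp lane, so a full warp moves one contiguous unit.
+// rearrange_mt_gpu below chooses the element width from unit / WARP_SIZE and falls back to
+// a single musaMemcpyAsync when the descriptor degenerates to r == 1 && c == 1.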
+ + +void rearrange_mt_gpu(RearrangeMusaDescriptor_t desc, void *y, void const *x, void *stream) { + auto musa_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + musaMemcpyAsync(y, x, unit, musaMemcpyDeviceToDevice, musa_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t musaRearrange(RearrangeMusaDescriptor_t desc, + void *dst, void const *src, void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_mt_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/operator.cc b/src/ops/rearrange/operator.cc new file mode 100644 index 00000000..4a922dc7 --- /dev/null +++ b/src/ops/rearrange/operator.cc @@ -0,0 +1,143 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/rearrange/rearrange.h" + +#ifdef ENABLE_CPU +#include "cpu/rearrange_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/rearrange.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "bang/rearrange_bang.h" +//#include "bang/rearrange_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rearrange_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rearrange_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/rearrange_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateRearrangeDescriptor( + infiniopHandle_t handle, + infiniopRearrangeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle, + (RearrangeAclnnDescriptor_t *) desc_ptr, + dst, + src); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRearrangeDescriptor((MusaHandle_t)handle, 
(RearrangeMusaDescriptor_t *) desc_ptr, dst, src); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnRearrange((RearrangeAclnnDescriptor_t) desc, + dst, + src, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reform/bang/reform_bang.h b/src/ops/reform/bang/reform_bang.h deleted file mode 100644 index 2c65d52c..00000000 --- a/src/ops/reform/bang/reform_bang.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __BANG_REFORM_H__ -#define __BANG_REFORM_H__ - -#include "../../utils.h" -#include "cnrt.h" -#include "operators.h" - -struct ReformBangDescriptor { - Device device; -}; - -void reform_bang(Tensor y, Tensor x, void *stream); - -#endif// __BANG_REFORM_H__ diff --git a/src/ops/reform/bang/reform_bang.mlu b/src/ops/reform/bang/reform_bang.mlu deleted file mode 100644 index 130a6847..00000000 --- a/src/ops/reform/bang/reform_bang.mlu +++ /dev/null @@ -1,247 +0,0 @@ -#include -#include -#include "reform_bang.h" -#include "../../../devices/bang/common_bang.h" - -template -__mlu_device__ void reformKernel(T *source, T *destination, int *strideSrc, int *strideDest, int *shape, int n, int dimsize, int nDim){ - - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = nDim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = nDim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } - -} - -template -__mlu_global__ void reformUnion1(T *source, T *destination, int *strideSrc, int *strideDest, int *shape, int n, int dimsize, int ndim){ - - reformKernel(source, destination, strideSrc, strideDest, shape, n, dimsize, ndim); - -} - -void reform(cnrtQueue_t queue, void *y, void *x, int *y_stride, int *x_stride, int *shape, int n, int dimsize, int ndim){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformUnion1<<>>(x_, y_, x_stride, y_stride, shape, n, dimsize, ndim); - // cnrtQueueSync(queue); - -} -template -__mlu_global__ void reformDim_2(T *source, T *destination, int strideS_f, int strideD_f, int n, int dimsize){ - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % n) * strideS_f; - indd += (indi % n) * strideD_f; - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % n) * strideS_f; - indd += (indi % n) * strideD_f; - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } -} -void reformUnionDim_2(cnrtQueue_t queue, void *y, void *x , int strideS_f, int strideD_f, int n, int dimsize){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformDim_2<<>>(x_, y_, strideS_f, strideD_f, n, dimsize); - // cnrtQueueSync(queue); - -} -template -__mlu_global__ void reformDim_3(T *source, T *destination, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int dimsize){ - int startDim = n / middle; - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } -} -void reformUnionDim_3(cnrtQueue_t queue, void *y, void *x, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int dimsize){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformDim_3<<>>(x_, y_, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, dimsize); - // cnrtQueueSync(queue); - -} -void reform_bang(Tensor y, Tensor x, void *stream) { - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - int ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - - int x_stride[ndim], y_stride[ndim], shape[ndim]; - int n = 1; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(x.layout->strides[i])/y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i])/y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - n *= shape[i]; - } - int dimsize = shape[ndim - 1]; - n /= dimsize; - auto queue = reinterpret_cast(stream); - if(ndim == 2){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - reformUnionDim_2(queue, y.data, x.data, strideS_f, strideD_f, n, dimsize); - } - else if(ndim == 3){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - reformUnionDim_3(queue, y.data, x.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, dimsize); - } - else{ - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - - reform(queue, y.data, x.data, mlu_strideY, mlu_strideX, mlu_shape, n, dimsize, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); - } - -} diff --git a/src/ops/reform/cpu/reform_cpu.cc b/src/ops/reform/cpu/reform_cpu.cc deleted file mode 100644 index 7296e414..00000000 --- a/src/ops/reform/cpu/reform_cpu.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "reform_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" -#include -#include - -inline int indices(int i, int ndim, int64_t *strides, uint64_t *shape) { - int ans = 0; - for (int j = ndim - 2; j >= 0; --j) { - ans += (i % shape[j]) * strides[j]; - i /= shape[j]; - } - return ans; -} - -void copy_contiguous(uint8_t *dst_ptr, uint8_t const *src_ptr, int n, Tensor y, Tensor x) { -#pragma omp parallel for - for (int i = 0; i < n; ++i) { - auto dst_offset = indices(i, y.layout->ndim, y.layout->strides, y.layout->shape); - auto src_offset 
= indices(i, y.layout->ndim, x.layout->strides, x.layout->shape); - std::memcpy(dst_ptr + dst_offset, src_ptr + src_offset, y.layout->shape[y.layout->ndim - 1] * y.layout->dt.size); - } -} - -union DataLayout_ { - DataLayout i; - unsigned short u; -}; - -void reform_cpu(Tensor y, Tensor x) { - DataLayout_ dl_y, dl_x; - dl_y.i = y.layout->dt; - dl_x.i = x.layout->dt; - ASSERT_EQ(dl_y.u, dl_x.u); - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - auto ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - unsigned int r = 0; - if (ndim == 2) { - r = y.layout->shape[0]; - } else if (ndim == 3) { - r = y.layout->shape[0] * y.layout->shape[1]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - ASSERT_EQ(y.layout->shape[i] * y.layout->strides[i], y.layout->strides[i - 1]); - ASSERT_EQ(x.layout->shape[i] * x.layout->strides[i], x.layout->strides[i - 1]); - } - r = std::accumulate(y.layout->shape, y.layout->shape + ndim - 1, 1, std::multiplies()); - } - auto dst_ptr = reinterpret_cast(y.data); - auto src_ptr = reinterpret_cast(x.data); - - copy_contiguous(dst_ptr, src_ptr, r, y, x); -} diff --git a/src/ops/reform/cpu/reform_cpu.h b/src/ops/reform/cpu/reform_cpu.h deleted file mode 100644 index e0194cd5..00000000 --- a/src/ops/reform/cpu/reform_cpu.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __CPU_REFORM_H__ -#define __CPU_REFORM_H__ - -#include "operators.h" - -struct ReformCpuDescriptor { - Device device; -}; - -void reform_cpu(Tensor y, Tensor x); - -#endif// __CPU_REFORM_H__ diff --git a/src/ops/reform/cuda/reform.cu b/src/ops/reform/cuda/reform.cu deleted file mode 100644 index 1a82c8c0..00000000 --- a/src/ops/reform/cuda/reform.cu +++ /dev/null @@ -1,107 +0,0 @@ -#include "../../utils.h" -#include "reform.cuh" -#include - -template -static __global__ void reform( - void *__restrict__ dst, - unsigned int const rsa, - unsigned int const csa, - void const *__restrict__ src, - unsigned int const rsb, - unsigned int const csb, - unsigned int const ncols) { - - auto row = blockIdx.y, - col = blockIdx.x * blockDim.y + threadIdx.y; - if (col >= ncols) return; - - auto thread = threadIdx.x, - warp_size = blockDim.x; - auto i = (row * rsa + col * csa) * warp_size + thread; - auto j = (row * rsb + col * csb) * warp_size + thread; - - reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; -} - -union DataLayout_ { - DataLayout i; - unsigned short u; -}; - -void reform_nv_gpu(Tensor y, Tensor x, void *stream) { - DataLayout_ dl_y, dl_x; - dl_y.i = y.layout->dt; - dl_x.i = x.layout->dt; - ASSERT_EQ(dl_y.u, dl_x.u); - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - auto ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - unsigned int r = 0, c = 0, b = 0; - unsigned int rsa = 0, csa = 0, rsb = 0, csb = 0; - if (ndim == 2) { - c = y.layout->shape[0]; - b = y.layout->shape[1]; - csa = y.layout->strides[0] / y.layout->dt.size; - csb = x.layout->strides[0] / x.layout->dt.size; - } else if (ndim == 3) { - r = y.layout->shape[0]; - c = y.layout->shape[1]; - b = y.layout->shape[2]; - csa = y.layout->strides[1] / y.layout->dt.size; - csb = x.layout->strides[1] / x.layout->dt.size; - rsa = y.layout->strides[0] / 
y.layout->dt.size; - rsb = x.layout->strides[0] / x.layout->dt.size; - } else { - for (int i = ndim - 3; i >= 1; --i) { - ASSERT_EQ(y.layout->shape[i] * y.layout->strides[i], y.layout->strides[i - 1]); - ASSERT_EQ(x.layout->shape[i] * x.layout->strides[i], x.layout->strides[i - 1]); - } - r = std::accumulate(y.layout->shape, y.layout->shape + ndim - 2, 1, std::multiplies()); - c = y.layout->shape[ndim - 2]; - b = y.layout->shape[ndim - 1]; - csa = y.layout->strides[ndim - 2] / y.layout->dt.size; - csb = x.layout->strides[ndim - 2] / x.layout->dt.size; - rsa = y.layout->strides[ndim - 3] / y.layout->dt.size; - rsb = x.layout->strides[ndim - 3] / x.layout->dt.size; - } - auto contiguous_bytes = b * y.layout->dt.size; - ASSERT_EQ(contiguous_bytes % WARP_SIZE, 0); - auto bytes_per_thread = contiguous_bytes / WARP_SIZE; - ASSERT(bytes_per_thread > 0 && bytes_per_thread <= 32 && (bytes_per_thread & (bytes_per_thread - 1)) == 0); - - auto dst_ptr = static_cast(reinterpret_cast(y.data)); - rsa /= b; - csa /= b; - auto src_ptr = static_cast(reinterpret_cast(x.data)); - rsb /= b; - csb /= b; - auto cuda_stream = reinterpret_cast(stream); - dim3 grid_dims = dim3((c + MAX_WARP_PER_BLOCK - 1) / MAX_WARP_PER_BLOCK, r); - dim3 block_dims = dim3(WARP_SIZE, (c + grid_dims.x - 1) / grid_dims.x); - switch (bytes_per_thread) { - case 1: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 2: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 4: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 8: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 16: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 32: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - } -} diff --git a/src/ops/reform/cuda/reform.cuh b/src/ops/reform/cuda/reform.cuh deleted file mode 100644 index c1f6ebf6..00000000 --- a/src/ops/reform/cuda/reform.cuh +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __NV_GPU_REFORM_H__ -#define __NV_GPU_REFORM_H__ - -#include "../../../devices/cuda/common_cuda.h" -#include "operators.h" - -struct ReformCudaDescriptor { - Device device; -}; - -void reform_nv_gpu(Tensor y, Tensor x, void *stream); - -#endif// __NV_GPU_REFORM_H__ diff --git a/src/ops/reform/operator.cc b/src/ops/reform/operator.cc deleted file mode 100644 index bce59b04..00000000 --- a/src/ops/reform/operator.cc +++ /dev/null @@ -1,83 +0,0 @@ -#include "../utils.h" -#include "ops/reform/reform.h" - -#ifdef ENABLE_CPU -#include "cpu/reform_cpu.h" -#endif -#ifdef ENABLE_NV_GPU -#include "cuda/reform.cuh" -#endif -#ifdef ENABLE_CAMBRICON_MLU -#include "bang/reform_bang.h" -#endif - -struct ReformDescriptor { - Device device; -}; - -__C ReformDescriptor *createReformDescriptor(Device device, void *config) { - switch (device) { -#ifdef ENABLE_CPU - case DevCpu: - return (ReformDescriptor *) (new ReformCpuDescriptor{device}); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return (ReformDescriptor *) (new ReformCudaDescriptor{device}); - } -#endif -#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - return (ReformDescriptor *) (new ReformBangDescriptor{device}); - } -#endif - default: - PANIC(UnsupportedDevice); - } - return nullptr; -} - -__C void destroyReformDescriptor(ReformDescriptor *descriptor) { - switch (descriptor->device) { -#ifdef ENABLE_CPU - case DevCpu: - delete (ReformCpuDescriptor *) (descriptor); - break; -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (ReformCudaDescriptor *) (descriptor); - break; -#endif 
-#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - delete (ReformBangDescriptor *) (descriptor); - break; - } -#endif - default: - PANIC(UnsupportedDevice); - } -} - -__C void reform(ReformDescriptor *descriptor, Tensor y, Tensor x, void *stream) { - switch (descriptor->device) { -#ifdef ENABLE_CPU - case DevCpu: - reform_cpu(y, x); - break; -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: - reform_nv_gpu(y, x, stream); - break; -#endif -#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - reform_bang(y, x, stream); - break; -#endif - default: - PANIC(UnsupportedDevice); - } -}; diff --git a/src/ops/relu/cpu/relu_cpu.cc b/src/ops/relu/cpu/relu_cpu.cc new file mode 100644 index 00000000..2ac7d324 --- /dev/null +++ b/src/ops/relu/cpu/relu_cpu.cc @@ -0,0 +1,72 @@ +#include "relu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t, + ReluCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluCpuDescriptor{ + DevCpu, + y->dt, + data_size, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t relu_cpu(ReluCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->data_size; ++i) { + if constexpr (std::is_same::value) { + float x_f32 = f16_to_f32(x_[i]); + y_[i] = f32_to_f16(x_f32 < 0 ? 0 : x_f32); + } else { + Tdata x_val = x_[i]; + y_[i] = x_val < 0 ? 
0 : x_val; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc, + void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return relu_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return relu_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/cpu/relu_cpu.h b/src/ops/relu/cpu/relu_cpu.h new file mode 100644 index 00000000..e4e51532 --- /dev/null +++ b/src/ops/relu/cpu/relu_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CPU_RELU_H__ +#define __CPU_RELU_H__ + +#include "operators.h" +#include + +struct ReluCpuDescriptor { + Device device; + DT dtype; + uint64_t data_size; +}; + +typedef struct ReluCpuDescriptor *ReluCpuDescriptor_t; + +infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t, + ReluCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc); + +#endif diff --git a/src/ops/relu/cuda/relu.cc b/src/ops/relu/cuda/relu.cc new file mode 100644 index 00000000..3dfadd8a --- /dev/null +++ b/src/ops/relu/cuda/relu.cc @@ -0,0 +1,45 @@ +#include "relu.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateReluDescriptor(CudaHandle_t handle, + ReluCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyReluDescriptor(ReluCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/relu/cuda/relu.cu b/src/ops/relu/cuda/relu.cu new file mode 100644 index 00000000..7c9884e6 --- /dev/null +++ b/src/ops/relu/cuda/relu.cu @@ -0,0 +1,111 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "relu.cuh" + +/** + * @brief A templated vector struct that supports applying relu on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. sizeof(T) should + * be >= sizeof(TComp) + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + // Constructor that initializes the data array with type TComp + __device__ __forceinline__ constexpr vecN(const TComp &val) { + const auto data_ = reinterpret_cast(data); + const auto size = N * pack_size; +#pragma unroll + for (size_t i = 0; i < size; ++i) { + data_[i] = 0; + } + } + + // Assignment operator with relu assignment logic + __device__ __forceinline__ vecN &operator=(const vecN &other) { + if constexpr (std::is_same::value) { +#pragma unroll + for (int i = 0; i < N; ++i) { + data[i] = other.data[i] < TComp(0) ? TComp(0) : other.data[i]; + } + } else { + auto *data_this = reinterpret_cast *>(data); + auto *data_other = reinterpret_cast *>(other.data); +#pragma unroll + for (int i = 0; i < N; ++i) { + data_this[i] = data_other[i]; + } + } + return *this; + } + + // Always returns false since the actual relu logic is in the assignment process + __device__ __forceinline__ bool operator<(const vecN &other) const { + return false; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void relu( + Tdata *y, + const Tdata *x, + uint64_t data_size, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + y[idx] = x[idx] < Tdata(0) ? Tdata(0) : x[idx]; + } +} + +template +void relu_nv_gpu(ReluCudaDescriptor_t desc, Tdata *y, Tdata const *x, uint64_t data_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + relu<<>>(y, x, offset + data_size, offset + i); + } +} + +template +infiniopStatus_t relu_nv_gpu(ReluCudaDescriptor_t desc, void *y, void const *x, void *stream, uint64_t pack_size) { + const auto data_size = desc->data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + relu_nv_gpu(desc, y_vec, x_vec, data_size, 0, stream); + + const auto remainder = desc->data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + relu_nv_gpu(desc, y_, x_, remainder, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaRelu(ReluCudaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return relu_nv_gpu, half>(desc, y, x, stream, 4); + } + if (desc->dtype == F32) { + return relu_nv_gpu, float>(desc, y, x, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/cuda/relu.cuh b/src/ops/relu/cuda/relu.cuh new file mode 100644 index 00000000..82020eb6 --- /dev/null +++ b/src/ops/relu/cuda/relu.cuh @@ -0,0 +1,32 @@ +#ifndef __CUDA_RELU_H__ +#define __CUDA_RELU_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ReluCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; +}; + +typedef struct ReluCudaDescriptor *ReluCudaDescriptor_t; + +infiniopStatus_t cudaCreateReluDescriptor(CudaHandle_t, + ReluCudaDescriptor_t *, 
+ infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaRelu(ReluCudaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyReluDescriptor(ReluCudaDescriptor_t desc); + +#endif diff --git a/src/ops/relu/musa/relu_musa.cc b/src/ops/relu/musa/relu_musa.cc new file mode 100644 index 00000000..6baaef18 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.cc @@ -0,0 +1,45 @@ +#include "relu_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateReluDescriptor(MusaHandle_t handle, + ReluMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluMusaDescriptor{ + DevMthreadsGpu, + y->dt, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyReluDescriptor(ReluMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/relu/musa/relu_musa.h b/src/ops/relu/musa/relu_musa.h new file mode 100644 index 00000000..84276369 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.h @@ -0,0 +1,32 @@ +#ifndef __MUSA_RELU_H__ +#define __MUSA_RELU_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ReluMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; +}; + +typedef struct ReluMusaDescriptor *ReluMusaDescriptor_t; + +infiniopStatus_t musaCreateReluDescriptor(MusaHandle_t, + ReluMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaRelu(ReluMusaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyReluDescriptor(ReluMusaDescriptor_t desc); + +#endif diff --git a/src/ops/relu/musa/relu_musa.mu b/src/ops/relu/musa/relu_musa.mu new file mode 100644 index 00000000..3d91b4e2 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.mu @@ -0,0 +1,111 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "relu_musa.h" + +/** + * @brief A templated vector struct that supports applying relu on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. sizeof(T) should + * be >= sizeof(TComp) + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + // Constructor that initializes the data array with type TComp + __device__ __forceinline__ constexpr vecN(const TComp &val) { + const auto data_ = reinterpret_cast(data); + const auto size = N * pack_size; +#pragma unroll + for (size_t i = 0; i < size; ++i) { + data_[i] = 0; + } + } + + // Assignment operator with relu assignment logic + __device__ __forceinline__ vecN &operator=(const vecN &other) { + if constexpr (std::is_same::value) { +#pragma unroll + for (int i = 0; i < N; ++i) { + data[i] = other.data[i] < TComp(0) ? TComp(0) : other.data[i]; + } + } else { + auto *data_this = reinterpret_cast *>(data); + auto *data_other = reinterpret_cast *>(other.data); +#pragma unroll + for (int i = 0; i < N; ++i) { + data_this[i] = data_other[i]; + } + } + return *this; + } + + // Always returns false since the actual relu logic is in the assignment process + __device__ __forceinline__ bool operator<(const vecN &other) const { + return false; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void relu( + Tdata *y, + const Tdata *x, + uint64_t data_size, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + y[idx] = x[idx] < Tdata(0) ? Tdata(0) : x[idx]; + } +} + +template +void relu_mt_gpu(ReluMusaDescriptor_t desc, Tdata *y, Tdata const *x, uint64_t data_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + relu<<>>(y, x, offset + data_size, offset + i); + } +} + +template +infiniopStatus_t relu_mt_gpu(ReluMusaDescriptor_t desc, void *y, void const *x, void *stream, uint64_t pack_size) { + const auto data_size = desc->data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + relu_mt_gpu(desc, y_vec, x_vec, data_size, 0, stream); + + const auto remainder = desc->data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + relu_mt_gpu(desc, y_, x_, remainder, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaRelu(ReluMusaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return relu_mt_gpu, half>(desc, y, x, stream, 4); + } + if (desc->dtype == F32) { + return relu_mt_gpu, float>(desc, y, x, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/operator.cc b/src/ops/relu/operator.cc new file mode 100644 index 00000000..7a3a2e2f --- /dev/null +++ b/src/ops/relu/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/relu/relu.h" + +#ifdef ENABLE_CPU +#include "cpu/relu_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/relu.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/relu_musa.h" +#endif + + +__C infiniopStatus_t infiniopCreateReluDescriptor( + infiniopHandle_t handle, + infiniopReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + 
switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReluDescriptor(handle, (ReluCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateReluDescriptor((CudaHandle_t) handle, (ReluCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateReluDescriptor((MusaHandle_t) handle, (ReluMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRelu(infiniopReluDescriptor_t desc, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRelu((ReluCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaRelu((ReluCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRelu((ReluMusaDescriptor_t) desc, y, x, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReluDescriptor((ReluCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyReluDescriptor((ReluCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyReluDescriptor((ReluMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc new file mode 100644 index 00000000..d264be39 --- /dev/null +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc @@ -0,0 +1,215 @@ +#include "rms_norm_aclnn.h" + +RMSNormAclnnDescriptor::RMSNormAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + castExecutor = nullptr; + workspaceSize = 0; + castWorkspaceSize = 0; + yDesc = new aclnnTensorDescriptor(); + xDesc = new aclnnTensorDescriptor(); + wDesc = new aclnnTensorDescriptor(); + rstdDesc = new aclnnTensorDescriptor(); + castDesc = nullptr; + epsilon = 1e-5; +} + + +infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, + RMSNormAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + float eps) { + *desc_ptr = new RMSNormAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + (*desc_ptr)->epsilon = static_cast(eps); + + auto &yDesc = (*desc_ptr)->yDesc; + auto &xDesc = (*desc_ptr)->xDesc; + auto &wDesc = (*desc_ptr)->wDesc; + auto &castDesc = (*desc_ptr)->castDesc; + auto &rstdDesc = (*desc_ptr)->rstdDesc; + + CHECK_STATUS(yDesc->fromInfiniOpTensorDescriptor(y), STATUS_SUCCESS); + CHECK_STATUS(xDesc->fromInfiniOpTensorDescriptor(x), STATUS_SUCCESS); + CHECK_STATUS(wDesc->fromInfiniOpTensorDescriptor(w), STATUS_SUCCESS); + + // Set rstdDesc + // See: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnRmsNorm.md + // rstdTensor cannot set nullptr in aclnn + int64_t wsize = 1; + for (uint64_t i = 0; i < wDesc->ndim; ++i) { + wsize *= (wDesc->shape)[i]; + } + int64_t xsize = 1; + uint64_t rstd_dim = xDesc->ndim - 1; + for (int64_t i = xDesc->ndim - 1; i >= 0; --i) { + xsize *= (xDesc->shape)[i]; + rstd_dim = 
static_cast(i); + if (xsize == wsize) { + break; + } + } + + auto rstd_shape = std::vector(xDesc->ndim, 1); + auto rstd_strides = std::vector(xDesc->ndim, 1); + + for (uint64_t i = 0; i < rstd_dim; ++i) { + rstd_shape[i] = (xDesc->shape)[i]; + } + for (int64_t i = xDesc->ndim - 2; i >= 0; --i) { + rstd_strides[i] = rstd_strides[i + 1] * rstd_shape[i + 1]; + } + CHECK_STATUS(rstdDesc->setDescriptor(toAclDataType(F32), rstd_shape, rstd_strides), STATUS_SUCCESS); + + if (wDesc->dataType != xDesc->dataType) { + castDesc = new aclnnTensorDescriptor(); + CHECK_STATUS(castDesc->fromInfiniOpTensorDescriptor(w), STATUS_SUCCESS); + castDesc->dataType = xDesc->dataType; + CHECK_STATUS(castDesc->createTensor(), STATUS_SUCCESS); + } + + CHECK_STATUS(yDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(xDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(wDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(rstdDesc->createTensor(), STATUS_SUCCESS); + + // Get Tensor + aclTensor *ty = yDesc->t; + aclTensor *tx = xDesc->t; + aclTensor *tw = wDesc->t; + aclTensor *trstd = rstdDesc->t; + + // Get workspaceSize and set executor + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + auto ret = aclnnRmsNormGetWorkspaceSize(tx, + castDesc == nullptr ? tw + : castDesc->t, + (*desc_ptr)->epsilon, + ty, + trstd, + &workspaceSize, + &executor); + aclSetAclOpExecutorRepeatable(executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnRmsNormGetWorkspaceSize failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + // Get Cast workspaceSize and set castExecutor + if (castDesc != nullptr) { + auto &castExecutor = (*desc_ptr)->castExecutor; + auto &castWorkspaceSize = (*desc_ptr)->castWorkspaceSize; + aclTensor *tcast = castDesc->t; + ret = aclnnCastGetWorkspaceSize(tw, + castDesc->dataType, + tcast, + &castWorkspaceSize, + &castExecutor); + aclSetAclOpExecutorRepeatable(castExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnCastGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetRMSNormWorkspaceSize(RMSNormAclnnDescriptor_t desc, + uint64_t *size) { + auto &rstdDesc = desc->rstdDesc; + auto &castDesc = desc->castDesc; + + *size = desc->workspaceSize + + numElements(rstdDesc->shape.data(), rstdDesc->ndim) * aclDataTypeSize(rstdDesc->dataType); + + if (castDesc != nullptr) { + *size += desc->castWorkspaceSize; + *size += numElements(castDesc->shape.data(), castDesc->ndim) * aclDataTypeSize(castDesc->dataType); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnRMSNorm(RMSNormAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w, + void *stream) { + auto &yDesc = desc->yDesc; + auto &xDesc = desc->xDesc; + auto &wDesc = desc->wDesc; + auto &rstdDesc = desc->rstdDesc; + auto &castDesc = desc->castDesc; + + // Get Tensor + aclTensor *ty = yDesc->t; + aclTensor *tx = xDesc->t; + aclTensor *tw = wDesc->t; + aclTensor *trstd = rstdDesc->t; + + auto &executor = desc->executor; + auto &castExecutor = desc->castExecutor; + auto &workspaceSize = desc->workspaceSize; + auto &castWorkspaceSize = desc->castWorkspaceSize; + + auto rstd = (void *) ((uint8_t *) workspace + workspaceSize); + + // Set device + aclrtSetDevice(desc->device_id); + aclnnStatus ret; + + void *castPtr = nullptr; + + // Cast w + if (castDesc != nullptr) { + aclTensor *tcast = castDesc->t; + castPtr = (void *) ((float *) rstd + numElements(rstdDesc->shape.data(), rstdDesc->ndim)); + + AclSetTensorAddr(castExecutor, 0, tw, (void *) w); + AclSetTensorAddr(castExecutor, 1, tcast, castPtr); + ret = aclnnCast(nullptr, castWorkspaceSize, castExecutor, stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnCast failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + // Do RmsNorm calc + AclSetTensorAddr(executor, 0, tx, (void *) x); + if (castDesc != nullptr) { + AclSetTensorAddr(executor, 1, castDesc->t, castPtr); + } else { + AclSetTensorAddr(executor, 1, tw, (void *) w); + } + AclSetTensorAddr(executor, 2, ty, y); + AclSetTensorAddr(executor, 3, trstd, rstd); + + ret = aclnnRmsNorm(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnRmsNorm failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyRMSNormDescriptor(RMSNormAclnnDescriptor_t desc) { + delete desc->yDesc; + delete desc->wDesc; + delete desc->xDesc; + delete desc->rstdDesc; + aclDestroyAclOpExecutor(desc->executor); + if (desc->castDesc != nullptr) { + delete desc->castDesc; + aclDestroyAclOpExecutor(desc->castExecutor); + } + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.h b/src/ops/rms_norm/ascend/rms_norm_aclnn.h new file mode 100644 index 00000000..2999fefd --- /dev/null +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.h @@ -0,0 +1,49 @@ +#ifndef __ACLNN_RMS_NORM_H__ +#define __ACLNN_RMS_NORM_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include +#include + +struct RMSNormAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclOpExecutor *castExecutor; + aclnnTensorDescriptor_t yDesc, xDesc, wDesc, rstdDesc, castDesc; + uint64_t workspaceSize; + uint64_t castWorkspaceSize; + double epsilon; + + RMSNormAclnnDescriptor(Device device); +}; + +typedef RMSNormAclnnDescriptor *RMSNormAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, + RMSNormAclnnDescriptor_t *desc, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + float eps); + +infiniopStatus_t aclnnGetRMSNormWorkspaceSize(RMSNormAclnnDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t aclnnRMSNorm(RMSNormAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w, + void *stream); + +infiniopStatus_t aclnnDestroyRMSNormDescriptor(RMSNormAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/rms_norm/bang/rms_norm_bang.cc b/src/ops/rms_norm/bang/rms_norm_bang.cc new file mode 100644 index 00000000..fbf7f689 --- /dev/null +++ b/src/ops/rms_norm/bang/rms_norm_bang.cc @@ -0,0 +1,44 @@ +#include "rms_norm_bang.h" +#include "../../utils.h" +infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormBangDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyRMSNormDescriptor(RMSNormBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/bang/rms_norm_bang.h b/src/ops/rms_norm/bang/rms_norm_bang.h index 26187c97..bfd94158 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.h +++ b/src/ops/rms_norm/bang/rms_norm_bang.h @@ -1,10 +1,39 @@ #ifndef __BANG_RMS_NORM_H__ #define __BANG_RMS_NORM_H__ +#include 
"../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" #include "operators.h" -void rms_norm_bang_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +struct RMSNormBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormBangDescriptor *RMSNormBangDescriptor_t; + +infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, + RMSNormBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t bangDestroyRMSNormDescriptor(RMSNormBangDescriptor_t desc); #endif// __BANG_RMS_NORM_H__ diff --git a/src/ops/rms_norm/bang/rms_norm_bang.mlu b/src/ops/rms_norm/bang/rms_norm_bang.mlu index 6b4dcfc3..755e1e3c 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.mlu +++ b/src/ops/rms_norm/bang/rms_norm_bang.mlu @@ -1,143 +1,148 @@ #include "bang.h" -#include "bang_device_functions.h" #include "cnrt.h" #include "rms_norm_bang.h" #include "../../../devices/bang/common_bang.h" -const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 +const int SRC_MAX_SIZE = 1024 * 64;//尽量取大一些 __nram__ char nram_buffer[NRAM_MAX_SIZE]; -const int wSize = 64; -template -__mlu_device__ void rmsNormKernel(T *destination, T const *source, T const *weight, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, float eps, int ndim) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); +template +__mlu_global__ void rms_norm(T *destination, T const *source, float const *weight, int stride_y, int stride_x, float eps, int othersize, int dimsize, int dimS){ + const int maxNum = SRC_MAX_SIZE/sizeof(float); + int wSize = 128 / sizeof(T); + + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); + if(dimsize >= maxNum){ - + + char *nram_buffer1 = nram_buffer + (2 * maxNum + 3 * wSize) * sizeof(T); T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] + T *wet = src + maxNum;//[maxNum] + T *destSumFinal = wet + maxNum;//[wSize] T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - + T *srcTmp = destSum + wSize;//[wSize] + __bang_write_zero(srcTmp, wSize); + float *wetTmp = (float *)nram_buffer1; + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int segNum = maxNum / wSize;//准备数值求和 - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ int inds = 0; int indd = 0; int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __bang_write_zero(destSumFinal, wSize); + __bang_write_zero(destSum, wSize); for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, maxNum * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize); + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - if(remain){ - tidS = inds + repeat * maxNum; __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __bang_write_zero(destSum, wSize); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, remain * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize); + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); + destSumFinal[0] += eps; + destSumFinal[0] = pow(destSumFinal[0], 0.5); T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - - __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(wetTmp, weight + s * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, maxNum); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, 
globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); + __memcpy(wetTmp, weight + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, maxNum); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } } } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - + else{ + char *nram_buffer1 = nram_buffer + (2 * dimsize + 2 * wSize + dimS) * sizeof(T); + T *src = (T *)nram_buffer;//[dimsize] + T *wet = src + dimsize;//[dimsize] + T *destSumFinal = wet + dimsize;//[wSize] + T *destSum = destSumFinal + wSize;//[dimS] + T *srcTmp = destSum + dimS; + __bang_write_zero(srcTmp, wSize); + float *wetTmp = (float *)nram_buffer1; + + + int segNum = dimS / wSize; + for(int i = indStart; i < indStart + step; i++){ + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, wSize); int inds = 0; int indd = 0; - int indi = i ; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } + int indi = i; + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); + if(dimS >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSumFinal, srcTmp, wSize); } - __bang_reduce_sum(destSumFinal, destSum, wSize); destSumFinal[0] /= dimsize; destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); + destSumFinal[0] = pow(destSumFinal[0], 0.5); + T globalSumInv = 1.0 / destSumFinal[0]; + __memcpy(wetTmp, weight, dimsize * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, dimsize); + __bang_mul(src, src, wet, dimsize);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, dimsize); __memcpy(destination + indd, src, 
dimsize * sizeof(T), NRAM2GDRAM); } @@ -145,336 +150,136 @@ __mlu_device__ void rmsNormKernel(T *destination, T const *source, T const *weig } template -__mlu_global__ void rmsNormUnion1(T *mlu_destination, T const *mlu_src, T const *mlu_weight, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, float eps, int ndim) { - - rmsNormKernel(mlu_destination, mlu_src, mlu_weight, strideSrc, strideDest, shape, othersize, dimsize, dimS, eps, ndim); -} - -template -void rmsNorm(cnrtQueue_t queue, void *y, void const *x, void const *w, int *strideSrc, int *strideDest, int *shape, int n, int d, float eps, int ndim) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); - - int dimS; - float mi = log2(d); - if (floor(mi) == mi) { - dimS = d; - } else { - dimS = pow(2, floor(mi) + 1); - } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; +__mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, int stride_y, int stride_x, float eps, int othersize, int dimsize, int dimS){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + int wSize = 128 / sizeof(T); - rmsNormUnion1<<>>(y_, x_, w_, strideSrc, strideDest, shape, n, d, dimS, eps, ndim); - // cnrtQueueSync(queue); -} + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); -void rmsNorm_fp16(cnrtQueue_t queue, void *y, void const *x, void const *w, int *strideSrc, int *strideDest, int *shape, int n, int d, float eps, int ndim) { - rmsNorm(queue, y, x, w, strideSrc, strideDest, shape, n, d, eps, ndim); -} -template -__mlu_global__ void rmsNormDim_2(T *destination, T const *source, T const *weight, int strideS_f, int strideD_f, int othersize, int dimsize, int dimS, float eps) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); if(dimsize >= maxNum){ - + T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] + T *wet = src + maxNum;//[maxNum] + T *destSumFinal = wet + maxNum;//[wSize] T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - + T *srcTmp = destSum + wSize;//[wSize] + __bang_write_zero(srcTmp, wSize); + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int segNum = maxNum / wSize;//准备数值求和 - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ int inds = 0; int indd = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __bang_write_zero(destSumFinal, wSize); + __bang_write_zero(destSum, wSize); for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, maxNum * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - if(remain){ - tidS = inds + repeat * maxNum; __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __bang_write_zero(destSum, wSize); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, remain * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); + destSumFinal[0] += eps; + destSumFinal[0] = pow(destSumFinal[0], 0.5); T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + 
s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } } } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i ; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); - } - } - __bang_reduce_sum(destSumFinal, destSum, wSize); - destSumFinal[0] /= dimsize; - destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); - __bang_mul_scalar(src, src, globalSumInv, dimsize); - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - } -} - - - -template -void rmsNormUnionDim_2(cnrtQueue_t queue, void *y, void const *x, void const *w, int strideS_f, int strideD_f, int n, int d, float eps) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); + else{ - int dimS; - float mi = log2(d); - if (floor(mi) == mi) { - dimS = d; - } else { - dimS = pow(2, floor(mi) + 1); - } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; + T *src = (T *)nram_buffer;//[dimsize] + T *wet = src + dimsize;//[dimsize] + T *destSumFinal = wet + dimsize;//[wSize] + T *destSum = destSumFinal + wSize;//[dimS] + T *srcTmp = destSum + dimS;//[wSize] - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - rmsNormDim_2<<>>(y_, x_, w_, strideS_f, strideD_f, n, d, dimS, eps); - // cnrtQueueSync(queue); -} -template -__mlu_global__ void rmsNormDim_3(T *destination, T const *source, T const *weight, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, float eps) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - int startDim = othersize / middle; - if(dimsize >= maxNum){ - - T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] - T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / 
maxNum; - int tidS; - int tidD; + int segNum = dimS / wSize; - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, wSize); int inds = 0; int indd = 0; int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __bang_write_zero(destSumFinal, wSize); - for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); - } - } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); - } - - if(remain){ - tidS = inds + repeat * maxNum; - __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; + __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); + __bang_mul(destSum, src, src, dimsize);//src = src * src + if(dimS >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); + __bang_reduce_sum(destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; - destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); - T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - - __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, wet, maxNum);//src = src * wet - __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, wet, maxNum);//src = src * wet - __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int 
stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i ; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); - } + else{ + __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSumFinal, srcTmp, wSize); + } - __bang_reduce_sum(destSumFinal, destSum, wSize); destSumFinal[0] /= dimsize; destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); + destSumFinal[0] = pow(destSumFinal[0], 0.5); + T globalSumInv = 1.0 / destSumFinal[0]; + __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); + __bang_mul(src, src, wet, dimsize);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, dimsize); __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } @@ -482,14 +287,15 @@ __mlu_global__ void rmsNormDim_3(T *destination, T const *source, T const *weigh } +template +void rms_normUnion(cnrtQueue_t queue, T *y, T const *x, Tw const *w, int stride_y, int stride_x, float epsilon, int n, int d){ + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; -template -void rmsNormUnionDim_3(cnrtQueue_t queue, void *y, void const *x, void const *w, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int d, float eps) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); - + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; int dimS; float mi = log2(d); if (floor(mi) == mi) { @@ -497,74 +303,45 @@ void rmsNormUnionDim_3(cnrtQueue_t queue, void *y, void const *x, void const *w, } else { dimS = pow(2, floor(mi) + 1); } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; + rms_norm<<>>(y, x, w, stride_y, stride_x, epsilon, n, d, dimS); + cnrtQueueSync(queue); - rmsNormDim_3<<>>(y_, x_, w_, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, dimS, eps); - // cnrtQueueSync(queue); } - -void rms_norm_bang_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - int num = 1; - int ndim = y.layout->ndim; - int x_stride[ndim], y_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(x.layout->strides[i]) / y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - num *= shape[i]; - } +void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, void const *w, + void *stream){ auto queue = reinterpret_cast(stream); - if(ndim == 2){ - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = 
y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); + int n = static_cast(desc->n); + int d = static_cast(desc->d); + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon;//float - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - rmsNormUnionDim_2(queue, y.data, x.data, w.data, strideS_f, strideD_f, n, d, epsilon); - } - else if(ndim == 3){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - int d = shape[ndim - 1]; - int n = num / d; - rmsNormUnionDim_3(queue, y.data, x.data, w.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, epsilon); + // Get strides in terms of elements + int stride_y = static_cast(desc->stride_y); + int stride_x = static_cast(desc->stride_x); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); } else{ - int d = shape[ndim - 1]; - int n = num / d; - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - rmsNorm_fp16(queue, y.data, x.data, w.data, mlu_strideX, mlu_strideY, mlu_shape, n, d, epsilon, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); + auto w_ = reinterpret_cast(w); + rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); + } + +} +infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream){ + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)){ + rms_norm_bang_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; } - -} + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.cc b/src/ops/rms_norm/bang/rms_norm_cnnl.cc deleted file mode 100644 index 9e80918d..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "rms_norm_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RMSNormBangDescriptor::RMSNormBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - cnnlTensorDescriptor_t yDesc, xDesc, wDesc; - cnnlCreateTensorDescriptor(&yDesc); - cnnlCreateTensorDescriptor(&xDesc); - cnnlCreateTensorDescriptor(&wDesc); - setCnnlTensor(yDesc, y.layout); - setCnnlTensor(xDesc, x.layout); - setCnnlTensor(wDesc, w.layout); - - cnnlFuseNormDescriptor_t opDesc; - cnnlCreateFuseNormDescriptor(&opDesc); 
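For reference while reviewing the tiled NRAM reduction in the BANG kernel above, here is a minimal scalar sketch (not part of the patch) of the per-row quantity it accumulates: sum of squares over the row, divided by the row length, plus epsilon, then the inverse square root used to scale input times weight. Plain float is assumed for clarity, whereas the kernel runs on half data with F16 or F32 weights.

```C
#include <math.h>

/* Scalar reference for one row of length d:
 * out[j] = in[j] * w[j] / sqrt(mean(in^2) + eps),
 * i.e. the value the tiled __bang_mul / tree-add / __bang_reduce_sum path computes. */
static void rms_norm_row_ref(float *out, const float *in, const float *w,
                             int d, float eps) {
    float sum_sq = 0.0f;
    for (int j = 0; j < d; ++j) {
        sum_sq += in[j] * in[j];          /* squared sum, as src = src * src then reduce */
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float) d + eps);
    for (int j = 0; j < d; ++j) {
        out[j] = in[j] * w[j] * inv_rms;  /* scale by weight and by 1/RMS */
    }
}
```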
- cnnlSetFuseNormDescriptor(opDesc, epsilon, 1.0, true, - false, false, false, false, - CNNL_DTYPE_HALF, CNNL_TRANSFORMER_RMSNORM); - - void *workspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t wsSize; - cnnlGetFuseNormWorkspaceSize(handle, opDesc, xDesc, &wsSize); - cnrtMalloc(&workspace, wsSize); - cnnlFuseNorm(handle, opDesc, xDesc, x.data, - wDesc, w.data, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace, wsSize, yDesc, y.data, nullptr, nullptr); - }); - - cnrtFree(workspace); - cnnlDestroyFuseNormDescriptor(opDesc); - cnnlDestroyTensorDescriptor(xDesc); - cnnlDestroyTensorDescriptor(yDesc); - cnnlDestroyTensorDescriptor(wDesc); -} diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.h b/src/ops/rms_norm/bang/rms_norm_cnnl.h deleted file mode 100644 index ab0972ce..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CNNL_RMS_NORM_H__ -#define __CNNL_RMS_NORM_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RMSNormBangDescriptor { - Device device; - RMSNormBangDescriptor(Device device); -}; - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); - -#endif// __CNNL_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cc b/src/ops/rms_norm/cpu/rms_norm_cpu.cc index 38e4581f..3152b5b9 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.cc +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cc @@ -3,25 +3,66 @@ #include "../../utils.h" #include -void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); +infiniopStatus_t cpuCreateRMSNormDescriptor(infiniopHandle_t, RMSNormCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, infiniopTensorDescriptor_t w_desc, float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; - auto n = y.layout->shape[0], - d = y.layout->shape[1]; + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; - auto stride_y = y.layout->strides[0]; - auto stride_x = x.layout->strides[0]; + *desc_ptr = new RMSNormCpuDescriptor{ + DevCpu, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetRMSNormWorkspaceSize(RMSNormCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRMSNormDescriptor(RMSNormCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +void rms_norm_cpu_f16(RMSNormCpuDescriptor_t desc, void *y, void const *x, void const *w) { + auto n = desc->n, d = desc->d; + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + auto epsilon = desc->epsilon; + + auto y_ptr = reinterpret_cast(y); + auto x_ptr = reinterpret_cast(x); + void const *w_ptr = w; + void const *w_ = nullptr; + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + w_ = reinterpret_cast(w_ptr); + } else { + w_ = reinterpret_cast(w_ptr); + } for (size_t i = 0; i < n; ++i) { - auto y_ = reinterpret_cast(reinterpret_cast(y.data) + i * 
stride_y); - auto x_ = reinterpret_cast(reinterpret_cast(x.data) + i * stride_x); - auto w_ = reinterpret_cast(w.data); + auto y_ = reinterpret_cast(y_ptr + i * stride_y); + auto x_ = reinterpret_cast(x_ptr + i * stride_x); auto sum_sq = 0.0f; for (size_t j = 0; j < d; ++j) { @@ -32,8 +73,27 @@ void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon) { auto k = std::pow(sum_sq / d + epsilon, -.5); for (size_t j = 0; j < d; ++j) { auto x__ = f16_to_f32(x_[j]); - auto w__ = f16_to_f32(w_[j]); + float w__ = 0.0f; + if (dtype_eq(w_datatype, F16)) { + w__ = f16_to_f32(static_cast(w_)[j]); + } else { + w__ = static_cast(w_)[j]; + } + y_[j] = f32_to_f16(k * x__ * w__); } } } + +infiniopStatus_t cpuRMSNorm(RMSNormCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + rms_norm_cpu_f16(desc, y, x, w); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.h b/src/ops/rms_norm/cpu/rms_norm_cpu.h index 9f598c55..ddf1de66 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.h +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.h @@ -5,8 +5,30 @@ struct RMSNormCpuDescriptor { Device device; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; }; -void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon); +typedef struct RMSNormCpuDescriptor *RMSNormCpuDescriptor_t; + +infiniopStatus_t cpuCreateRMSNormDescriptor(infiniopHandle_t handle, RMSNormCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, float epsilon); + +infiniopStatus_t cpuGetRMSNormWorkspaceSize(RMSNormCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRMSNorm(RMSNormCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cpuDestroyRMSNormDescriptor(RMSNormCpuDescriptor_t desc); #endif// __CPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cuda/rms_norm.cc b/src/ops/rms_norm/cuda/rms_norm.cc new file mode 100644 index 00000000..92d34a99 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm.cc @@ -0,0 +1,46 @@ +#include "rms_norm.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormCudaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyRMSNormDescriptor(RMSNormCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/cuda/rms_norm.cu 
b/src/ops/rms_norm/cuda/rms_norm.cu index 88608baf..aa36f2f0 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cu +++ b/src/ops/rms_norm/cuda/rms_norm.cu @@ -5,13 +5,13 @@ #include // assert BLOCK_SIZE >= blockDim.x -template -static __global__ void rms_norm_padding( +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_padding( Tdata *__restrict__ o_, unsigned int const stride_y, Tdata const *__restrict__ x_, unsigned int const stride_x, - Tdata const *__restrict__ w_, + Wdata const *__restrict__ w_, float const epsilon) { auto y = o_ + blockIdx.x * stride_y + threadIdx.x; auto x = x_[blockIdx.x * stride_x + threadIdx.x]; @@ -19,24 +19,27 @@ static __global__ void rms_norm_padding( using BlockOp = cub::BlockReduce; __shared__ typename BlockOp::TempStorage temp_storage; +#ifdef ENABLE_SUGON_DCU + auto acc = BlockOp(temp_storage).Sum(x * x); +#else auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); - +#endif __shared__ Tdata rms; if (threadIdx.x == 0) { rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); } __syncthreads(); - *y = rms * x * w; + *y = rms * x * (Tdata) w; } -template -static __global__ void rms_norm_folding( +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_folding( Tdata *__restrict__ y, unsigned int const stride_y, Tdata const *__restrict__ x, unsigned int const stride_x, - Tdata const *__restrict__ w, + Wdata const *__restrict__ w, float const epsilon, unsigned int const items_size) { y += blockIdx.x * stride_y; @@ -59,7 +62,11 @@ static __global__ void rms_norm_folding( { using BlockOp = cub::BlockReduce; __shared__ typename BlockOp::TempStorage temp_storage; +#ifdef ENABLE_SUGON_DCU + acc = BlockOp(temp_storage).Sum(squared); +#else acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); +#endif } __shared__ Tdata rms; @@ -76,13 +83,13 @@ static __global__ void rms_norm_folding( } } -template +template static __global__ void rms_norm_standard( Tdata *__restrict__ y_, unsigned int const stride_y, Tdata const *__restrict__ x_, unsigned int const stride_x, - Tdata const *__restrict__ w, + Wdata const *__restrict__ w, float const epsilon, unsigned int const d) { auto y = y_ + blockIdx.x * stride_y; @@ -112,41 +119,62 @@ static __global__ void rms_norm_standard( __syncthreads(); for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { - y[i] = rms * x[i] * w[i]; + y[i] = rms * x[i] * (Tdata) w[i]; } } - -void rms_norm_nv_gpu_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - auto y_ = reinterpret_cast(y.data); - auto x_ = reinterpret_cast(x.data); - auto w_ = reinterpret_cast(w.data); +void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; // Get strides in terms of elements - auto stride_y = y.layout->strides[0] / sizeof(half); - auto stride_x = x.layout->strides[0] / sizeof(half); + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; auto cuda_stream = reinterpret_cast(stream); unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); - if (items_per_thread == 1) { - rms_norm_padding - <<>>(y_, stride_y, x_, stride_x, 
w_, epsilon); - } else if (items_per_thread <= 16) { - rms_norm_folding - <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } } else { - rms_norm_standard - <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + rms_norm_nv_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; } + + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/rms_norm/cuda/rms_norm.cuh b/src/ops/rms_norm/cuda/rms_norm.cuh index 0d187c7c..683011f2 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cuh +++ b/src/ops/rms_norm/cuda/rms_norm.cuh @@ -1,12 +1,40 @@ #ifndef __NV_GPU_RMS_NORM_H__ #define __NV_GPU_RMS_NORM_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" struct RMSNormCudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; + DT w_datatype; + float epsilon; }; -void rms_norm_nv_gpu_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +typedef struct RMSNormCudaDescriptor *RMSNormCudaDescriptor_t; + +infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, + RMSNormCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cudaDestroyRMSNormDescriptor(RMSNormCudaDescriptor_t desc); + +void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); #endif// __NV_GPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/maca/rms_norm_maca.cc b/src/ops/rms_norm/maca/rms_norm_maca.cc new file mode 100644 index 00000000..054be969 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.cc @@ -0,0 +1,46 @@ +#include "rms_norm_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateRMSNormDescriptor(MacaHandle_t handle, RMSNormMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + 
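Illustrative only: the launch-path selection that rms_norm_nv_gpu_f16 above (and the MACA/MUSA ports later in the patch) applies per row length d. The enum and function names here are hypothetical; ROUND_UP_DIV and MAX_THREADS_PER_BLOCK correspond to the macros used in the CUDA code.

```C
/* Sketch of the kernel-path choice in rms_norm_nv_gpu_f16 (names illustrative). */
enum RmsNormKernelPath { PATH_PADDING, PATH_FOLDING, PATH_STANDARD };

static enum RmsNormKernelPath select_rms_norm_path(unsigned int d,
                                                   unsigned int max_threads_per_block) {
    /* ceil(d / max_threads_per_block), mirroring ROUND_UP_DIV */
    unsigned int items_per_thread =
        (d + max_threads_per_block - 1) / max_threads_per_block;
    if (items_per_thread == 1) {
        return PATH_PADDING;   /* one element per thread, block-wide cub reduce */
    } else if (items_per_thread <= 16) {
        return PATH_FOLDING;   /* cub::BlockLoad, per-thread squares, block reduce */
    }
    return PATH_STANDARD;      /* block-stride loop plus shared-memory tree reduction */
}
```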
return STATUS_BAD_TENSOR_SHAPE; + } + + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormMacaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRMSNormWorkspaceSize(RMSNormMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyRMSNormDescriptor(RMSNormMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/maca/rms_norm_maca.h b/src/ops/rms_norm/maca/rms_norm_maca.h new file mode 100644 index 00000000..f244ce97 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.h @@ -0,0 +1,40 @@ +#ifndef __MACA_RMS_NORM_H__ +#define __MACA_RMS_NORM_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RMSNormMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormMacaDescriptor *RMSNormMacaDescriptor_t; + +infiniopStatus_t macaCreateRMSNormDescriptor(MacaHandle_t handle, + RMSNormMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t macaGetRMSNormWorkspaceSize(RMSNormMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRMSNorm(RMSNormMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t macaDestroyRMSNormDescriptor(RMSNormMacaDescriptor_t desc); + +void rms_norm_mc_gpu_f16(RMSNormMacaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); + +#endif// __MACA_RMS_NORM_H__ diff --git a/src/ops/rms_norm/maca/rms_norm_maca.maca b/src/ops/rms_norm/maca/rms_norm_maca.maca new file mode 100644 index 00000000..3becfab6 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.maca @@ -0,0 +1,173 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "rms_norm_maca.h" +#include +#include + +// assert BLOCK_SIZE >= blockDim.x +template +static __global__ void rms_norm_padding( + Tdata *__restrict__ o_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w_, + float const epsilon) { + auto y = o_ + blockIdx.x * stride_y + threadIdx.x; + auto x = x_[blockIdx.x * stride_x + threadIdx.x]; + auto w = w_[threadIdx.x]; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); + } + __syncthreads(); + + *y = rms * x * (Tdata) w; +} + +template +static __global__ void rms_norm_folding( + Tdata *__restrict__ y, + unsigned int const stride_y, + Tdata const *__restrict__ x, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const items_size) { + y += blockIdx.x * stride_y; + x += blockIdx.x * stride_x; + + float thread_data[ITEMS_PER_THREAD]; + { + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage temp_storage; + BlockOp(temp_storage).Load(x, thread_data, items_size, 0.f); + } + + float 
squared[ITEMS_PER_THREAD]; +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + squared[i] = thread_data[i] * thread_data[i]; + } + + float acc; + { + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(items_size) + epsilon)); + } + __syncthreads(); + +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + if (auto j = i + threadIdx.x * ITEMS_PER_THREAD; j < items_size) { + y[j] = Tdata(float(rms) * float(thread_data[i]) * float(w[j])); + } + } +} + +template +static __global__ void rms_norm_standard( + Tdata *__restrict__ y_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const d) { + auto y = y_ + blockIdx.x * stride_y; + auto x = x_ + blockIdx.x * stride_x; + + __shared__ float partial_sum[BLOCK_SIZE]; + + float sum = 0.0f; + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + sum += float(x[i]) * float(x[i]); + } + + partial_sum[threadIdx.x] = sum; + __syncthreads(); + for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + float row_sum = partial_sum[0]; + rms = Tdata(rsqrtf(row_sum / float(d) + epsilon)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + y[i] = rms * x[i] * (Tdata) w[i]; + } +} + +void rms_norm_mc_gpu_f16(RMSNormMacaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; + + // Get strides in terms of elements + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + + auto maca_stream = reinterpret_cast(stream); + unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } else { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t macaRMSNorm(RMSNormMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + rms_norm_mc_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/musa/rms_norm_musa.cc b/src/ops/rms_norm/musa/rms_norm_musa.cc new file mode 100644 index 00000000..99c22c6e --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.cc @@ -0,0 +1,46 @@ +#include "rms_norm_musa.h" +#include 
"../../utils.h" +#include "../../../devices/musa/common_musa.h" + +infiniopStatus_t musaCreateRMSNormDescriptor(MusaHandle_t handle, RMSNormMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormMusaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRMSNormWorkspaceSize(RMSNormMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyRMSNormDescriptor(RMSNormMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/musa/rms_norm_musa.h b/src/ops/rms_norm/musa/rms_norm_musa.h new file mode 100644 index 00000000..ee8dfb72 --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.h @@ -0,0 +1,40 @@ +#ifndef __MUSA_RMS_NORM_H__ +#define __MUSA_RMS_NORM_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct RMSNormMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormMusaDescriptor *RMSNormMusaDescriptor_t; + +infiniopStatus_t musaCreateRMSNormDescriptor(MusaHandle_t handle, + RMSNormMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t musaGetRMSNormWorkspaceSize(RMSNormMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRMSNorm(RMSNormMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t musaDestroyRMSNormDescriptor(RMSNormMusaDescriptor_t desc); + +void rms_norm_mt_gpu_f16(RMSNormMusaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); + +#endif// __MT_GPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/musa/rms_norm_musa.mu b/src/ops/rms_norm/musa/rms_norm_musa.mu new file mode 100644 index 00000000..d80bdac9 --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.mu @@ -0,0 +1,177 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "rms_norm_musa.h" +#include +#include + +// assert BLOCK_SIZE >= blockDim.x +template +static __global__ void rms_norm_padding( + Tdata *__restrict__ o_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w_, + float const epsilon) { + auto y = o_ + blockIdx.x * stride_y + threadIdx.x; + auto x = x_[blockIdx.x * stride_x + threadIdx.x]; + auto w = w_[threadIdx.x]; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); + } + __syncthreads(); + + *y = rms * x * (Tdata)w; +} + +template 
+static __global__ void rms_norm_folding( + Tdata *__restrict__ y, + unsigned int const stride_y, + Tdata const *__restrict__ x, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const items_size) { + y += blockIdx.x * stride_y; + x += blockIdx.x * stride_x; + + float thread_data[ITEMS_PER_THREAD]; + { + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage temp_storage; + BlockOp(temp_storage).Load(x, thread_data, items_size, 0.f); + } + + float squared[ITEMS_PER_THREAD]; +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + squared[i] = thread_data[i] * thread_data[i]; + } + + float acc; + { + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(items_size) + epsilon)); + } + __syncthreads(); + +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + if (auto j = i + threadIdx.x * ITEMS_PER_THREAD; j < items_size) { + y[j] = Tdata(float(rms) * float(thread_data[i]) * float(w[j])); + } + } +} + +template +static __global__ void rms_norm_standard( + Tdata *__restrict__ y_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const d) { + auto y = y_ + blockIdx.x * stride_y; + auto x = x_ + blockIdx.x * stride_x; + + __shared__ float partial_sum[BLOCK_SIZE]; + + float sum = 0.0f; + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + sum += float(x[i]) * float(x[i]); + } + + partial_sum[threadIdx.x] = sum; + __syncthreads(); + for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + float row_sum = partial_sum[0]; + rms = Tdata(rsqrtf(row_sum / float(d) + epsilon)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + y[i] = rms * x[i] * (Tdata)w[i]; + } +} + +void rms_norm_mt_gpu_f16(RMSNormMusaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; + + // Get strides in terms of elements + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + + auto musa_stream = reinterpret_cast(stream); + unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } else { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t musaRMSNorm(RMSNormMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, 
void const *w, + void *stream){ + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)){ + rms_norm_mt_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc index fae458d9..317e7ef2 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -1,85 +1,187 @@ #include "../utils.h" +#include "operators.h" #include "ops/rms_norm/rms_norm.h" #ifdef ENABLE_CPU #include "cpu/rms_norm_cpu.h" #endif #ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" #include "cuda/rms_norm.cuh" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/rms_norm_cnnl.h" +#include "../../devices/bang/bang_handle.h" #include "bang/rms_norm_bang.h" #endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rms_norm_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rms_norm_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/rms_norm_musa.h" +#endif -struct RMSNormDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateRMSNormDescriptor( + infiniopHandle_t handle, + infiniopRMSNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle, + (RMSNormAclnnDescriptor_t *) desc_ptr, + y_desc, + x_desc, + w_desc, + epsilon); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C void *createRMSNormDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (RMSNormDescriptor *) (new RMSNormCpuDescriptor{device}); + return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (RMSNormDescriptor *) (new RMSNormCudaDescriptor{device}); + case DevNvGpu: { + return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (RMSNormDescriptor *) (new RMSNormBangDescriptor(device)); + return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size); + } +#endif 
+#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyRMSNormDescriptor(RMSNormDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (RMSNormCpuDescriptor *) (descriptor); - break; + return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (RMSNormCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (RMSNormBangDescriptor *) (descriptor); - break; + return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc, + workspace, + workspace_size, + y, + x, + w, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void rmsNorm(RMSNormDescriptor *descriptor, Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - rms_norm_cpu_f16(y, x, w, epsilon); - break; + return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - rms_norm_nv_gpu_f16(y, x, w, epsilon, stream); - break; + case DevNvGpu: { + return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // Using BANGC Kernel - rms_norm_bang_f16(y, x, w, epsilon, stream); - // rms_norm_cnnl_f16(y, x, w, epsilon, stream); - break; -#endif - default: - PANIC(UnsupportedDevice); + case DevCambriconMlu: { + return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc); + } +#endif } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding.cc b/src/ops/rotary_embedding/ascend/rotary_embedding.cc new file mode 100644 index 00000000..5908af2a --- 
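A hypothetical end-to-end driver for the exported RMS norm C API in operator.cc, showing how the create / workspace-query / compute / destroy calls chain together. The handle, tensor descriptors and device buffers are assumed to exist already, the 1e-5 epsilon is an example value, and real code would check every returned status.

```C
#include <stddef.h>
#include "ops/rms_norm/rms_norm.h" /* same public header operator.cc includes */

/* Hypothetical helper, not part of the patch. */
static infiniopStatus_t run_rms_norm(infiniopHandle_t handle,
                                     infiniopTensorDescriptor_t y_desc,
                                     infiniopTensorDescriptor_t x_desc,
                                     infiniopTensorDescriptor_t w_desc,
                                     void *y, void const *x, void const *w,
                                     void *stream) {
    infiniopRMSNormDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateRMSNormDescriptor(
        handle, &desc, y_desc, x_desc, w_desc, 1e-5f /* epsilon, example value */);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    infiniopGetRMSNormWorkspaceSize(desc, &workspace_size);
    void *workspace = NULL; /* allocate workspace_size bytes on the device if non-zero */

    status = infiniopRMSNorm(desc, workspace, workspace_size, y, x, w, stream);
    infiniopDestroyRMSNormDescriptor(desc);
    return status;
}
```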
/dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding.cc @@ -0,0 +1,99 @@ +#include "rotary_embedding.h" +#include "../../utils.h" + +infiniopStatus_t ascendCreateRoPEDescriptor(AscendHandle_t handle, + RoPEAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto seq_len = t->shape[0]; + auto nh = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + auto stride_seq = t->strides[0]; + auto stride_head = t->strides[1]; + + + if (dim % 2 != 0 || dim <= 32) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + aclDataType dt; + if (dtype_eq(t->dt, F16)) { + dt = aclDataType::ACL_FLOAT16; + } else if (dtype_eq(t->dt, F32)) { + dt = aclDataType::ACL_FLOAT; + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEAscendDescriptor{ + handle->device, + handle->device_id, + dt, + seq_len, + nh, + dim, + total_seq_len, + stride_seq, + stride_head}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendGetRoPEWorkspaceSize(RoPEAscendDescriptor_t desc, + uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRoPE(RoPEAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + auto nt = static_cast(desc->seq_len); + auto nh = static_cast(desc->nhead); + auto dh = static_cast(desc->dim); + auto stt = static_cast(desc->stride_seq); + auto sth = static_cast(desc->stride_head); + + // Set device + aclrtSetDevice(desc->device_id); + + return rope_kernel_do(t, (void *) pos_ids, (void *) sin_table, (void *) cos_table, + nt, nh, dh, stt, sth, desc->dt, stream); +} + +infiniopStatus_t ascendDestroyRoPEDescriptor(RoPEAscendDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding.h b/src/ops/rotary_embedding/ascend/rotary_embedding.h new file mode 100644 index 00000000..679b238a --- /dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding.h @@ -0,0 +1,46 @@ +#ifndef __ASCEND_ROTARY_EMBEDDING_H__ +#define __ASCEND_ROTARY_EMBEDDING_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "operators.h" + +struct RoPEAscendDescriptor { + Device device; + int device_id; + aclDataType dt; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t stride_seq; + int64_t stride_head; +}; + +typedef struct RoPEAscendDescriptor *RoPEAscendDescriptor_t; + +infiniopStatus_t ascendCreateRoPEDescriptor(AscendHandle_t handle, + RoPEAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t ascendGetRoPEWorkspaceSize(RoPEAscendDescriptor_t desc, + uint64_t *size); + 
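For reference while reading the Ascend kernel added below, a scalar sketch (not part of the patch) of the pairwise rotation that rope_kernel_do applies to one (token, head) slice of length dh at position p. Float is used for readability; the kernel itself operates on half data and realizes the same arithmetic with GatherMask over odd/even lanes.

```C
#include <stdint.h>

/* Rotate each adjacent pair (t[k], t[k+1]) by the cached sin/cos row of position p. */
static void rope_head_ref(float *t, const float *sin_table, const float *cos_table,
                          uint64_t p, int dh) {
    const float *sinp = sin_table + p * (uint64_t) dh;
    const float *cosp = cos_table + p * (uint64_t) dh;
    for (int k = 0; k < dh; k += 2) {
        float a = t[k];      /* first element of the pair  */
        float b = t[k + 1];  /* second element of the pair */
        t[k]     = a * cosp[k]     - b * sinp[k];
        t[k + 1] = a * sinp[k + 1] + b * cosp[k + 1];
    }
}
```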
+infiniopStatus_t ascendRoPE(RoPEAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t ascendDestroyRoPEDescriptor(RoPEAscendDescriptor_t desc); + +extern "C" infiniopStatus_t rope_kernel_do(void *t, void *pos, void *sin, void *cos, + int32_t nt, int32_t nh, int32_t dh, int32_t stt, + int32_t sth, int dtype, void *stream); + +#endif diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp b/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp new file mode 100644 index 00000000..989b1422 --- /dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp @@ -0,0 +1,230 @@ +#include "kernel_operator.h" +#include "../../../../include/status.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; + +template class RoPE { +public: + __aicore__ inline RoPE() {} + // Init op + // pos position vector + // t input tensor + // input tensor shape [nt, nh, dh] + // make block_num = nh, tile_len = dh + __aicore__ inline void Init(GM_ADDR t, GM_ADDR pos, GM_ADDR sin, + GM_ADDR cos, int32_t nt, int32_t nh, + int32_t dh, int32_t stt, int32_t sth); + __aicore__ inline void Process(); + +private: + // Copy a tile into UB + __aicore__ inline void CopyIn(int32_t i); + __aicore__ inline void Compute(int32_t i); + __aicore__ inline void CopyOut(int32_t i); + +private: + TPipe pipe; + TQue inQue; + TQue sinQue; + TQue cosQue; + TQue outQue; + TBuf tmpOddBuf; + TBuf tmpEvenBuf; + TBuf tmpBuf; + TBuf tmp2Buf; + TBuf tmp3Buf; + TBuf tmp4Buf; + TBuf tmpSinBuf; + TBuf tmpCosBuf; + + GlobalTensor xGm; + GlobalTensor pGm; + GlobalTensor sinGm; + GlobalTensor cosGm; + GlobalTensor oGm; + + // TODO: Change to uint64_t + uint32_t _block_idx; + uint32_t _tile_len; + + // t[nt, nh, dh] + // nt num of tokens + // nh num of heads + // dh dimension of each head + int32_t nt; + int32_t nh; + int32_t dh; + int32_t sth; + int32_t stt; +}; + +template +__aicore__ inline void RoPE::Init(GM_ADDR t, GM_ADDR pos, GM_ADDR sin, + GM_ADDR cos, int32_t nt, int32_t nh, + int32_t dh, int32_t stt, int32_t sth) { + this->nt = nt; + this->nh = nh; + this->dh = dh; + this->stt = stt; + this->sth = sth; + + _block_idx = GetBlockIdx(); + _tile_len = dh; + + // Init global buffer + xGm.SetGlobalBuffer((__gm__ T *) t); + pGm.SetGlobalBuffer((__gm__ uint64_t *) pos); + sinGm.SetGlobalBuffer((__gm__ float *) sin); + cosGm.SetGlobalBuffer((__gm__ float *) cos); + oGm.SetGlobalBuffer((__gm__ T *) t); + + // Init Queue buffer + pipe.InitBuffer(inQue, BUFFER_NUM, _tile_len * sizeof(T)); + pipe.InitBuffer(outQue, BUFFER_NUM, _tile_len * sizeof(T)); + pipe.InitBuffer(sinQue, BUFFER_NUM, _tile_len * sizeof(float)); + pipe.InitBuffer(cosQue, BUFFER_NUM, _tile_len * sizeof(float)); + pipe.InitBuffer(tmpOddBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpEvenBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp2Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp3Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp4Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpSinBuf, _tile_len * sizeof(T)); + pipe.InitBuffer(tmpCosBuf, _tile_len * sizeof(T)); +} + +template +__aicore__ inline void RoPE::CopyIn(int32_t i) { + LocalTensor inputUb = inQue.AllocTensor(); + LocalTensor sinUb = sinQue.AllocTensor(); + LocalTensor cosUb = cosQue.AllocTensor(); + // Get idx of current tile in total input + auto idx = i * stt + 
_block_idx * sth; + // Copy tile current tile into UB + DataCopy(inputUb, xGm[idx], _tile_len); + // Copy sin cos tile + auto pos_idx = pGm(i); + // Cast sin cos to T type + DataCopy(sinUb, sinGm[pos_idx * dh], _tile_len); + DataCopy(cosUb, cosGm[pos_idx * dh], _tile_len); + // Push in operands + inQue.EnQue(inputUb); + sinQue.EnQue(sinUb); + cosQue.EnQue(cosUb); +} + +template +__aicore__ inline void RoPE::Compute(int32_t i) { + LocalTensor inputUb = inQue.DeQue(); + LocalTensor sinUb = sinQue.DeQue(); + LocalTensor cosUb = cosQue.DeQue(); + LocalTensor outUb = outQue.AllocTensor(); + + // Choose odd and even position + LocalTensor tmpOdd = tmpOddBuf.Get(); + LocalTensor tmpEven = tmpEvenBuf.Get(); + LocalTensor tmpUb = tmpBuf.Get(); + LocalTensor tmp2Ub = tmp2Buf.Get(); + LocalTensor tmp3Ub = tmp3Buf.Get(); + LocalTensor tmp4Ub = tmp4Buf.Get(); + LocalTensor tmpSinUb = tmpSinBuf.Get(); + LocalTensor tmpCosUb = tmpCosBuf.Get(); + + // Cast from float to T + Cast(tmpSinUb, sinUb, RoundMode::CAST_FLOOR, _tile_len); + Cast(tmpCosUb, cosUb, RoundMode::CAST_FLOOR, _tile_len); + PipeBarrier(); + + // Select odd & even numbers + uint64_t rsvdCnt = 0; + GatherMaskParams gMaskParams = { + 1, + static_cast((_tile_len * sizeof(T) + 255) / 256), + 8, + 8, + }; + GatherMask(tmpOdd, inputUb, 1, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmpEven, inputUb, 2, false, 0, gMaskParams, rsvdCnt); + + // Calc odd position + GatherMask(tmpUb, tmpCosUb, 1, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmp2Ub, tmpSinUb, 1, false, 0, gMaskParams, rsvdCnt); + PipeBarrier(); + tmpUb = tmpOdd * tmpUb; + tmp2Ub = tmpEven * tmp2Ub; + PipeBarrier(); + tmpUb = tmpUb - tmp2Ub; + + // Calc even position + GatherMask(tmp3Ub, tmpSinUb, 2, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmp4Ub, tmpCosUb, 2, false, 0, gMaskParams, rsvdCnt); + PipeBarrier(); + tmp3Ub = tmpOdd * tmp3Ub; + tmp4Ub = tmpEven * tmp4Ub; + PipeBarrier(); + tmp3Ub = tmp3Ub + tmp4Ub; + + // Scatter + // Scatter(outUb, tmpUb, tmpOffsetUb, (uint32_t)sizeof(T), tile_len / 2); + for (uint32_t i = 0; i < _tile_len / 2; i += 1) { + outUb(i * 2 + 1) = tmp3Ub(i); + outUb(i * 2) = tmpUb(i); + } + + outQue.EnQue(outUb); + inQue.FreeTensor(inputUb); + sinQue.FreeTensor(sinUb); + cosQue.FreeTensor(cosUb); +} + +template +__aicore__ inline void RoPE::CopyOut(int32_t i) { + LocalTensor outUb = outQue.DeQue(); + auto idx = i * stt + _block_idx * sth; + // DataCopy(oGm[idx], outUb, _tile_len); + DataCopyExtParams dcep = { + 1, + static_cast(_tile_len * sizeof(T)), + 0, 0, 0}; + DataCopyPad(oGm[idx], outUb, dcep); + outQue.FreeTensor(outUb); +} + +template __aicore__ inline void RoPE::Process() { + + for (int32_t i = 0; i < nt; ++i) { + CopyIn(i); + Compute(i); + CopyOut(i); + } +} + +// Kernel func +__global__ __aicore__ void rope_kernel_fp16(GM_ADDR t, GM_ADDR pos, + GM_ADDR sin, GM_ADDR cos, + int32_t nt, int32_t nh, + int32_t dh, int32_t stt, + int32_t sth) { + RoPE op; + op.Init(t, pos, sin, cos, nt, nh, dh, stt, sth); + op.Process(); +} + +extern "C" infiniopStatus_t rope_kernel_do(void *t, void *pos, void *sin, void *cos, + int32_t nt, int32_t nh, int32_t dh, + int32_t stt, int32_t sth, + int dtype, void *stream) { + switch (dtype) { + case 0:// ACL_FLOAT32 + // TODO: + break; + case 1:// ACL_FLOAT16 + rope_kernel_fp16<<>>(t, pos, sin, cos, nt, nh, dh, stt, sth); + return STATUS_SUCCESS; + default: + break; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc 
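Illustrative host-side picture (not part of the patch) of how the kernel above partitions work: one AI core per head, each core iterating over all tokens of its head and addressing its tile with the same index arithmetic as CopyIn/CopyOut. The serial loops and names are for exposition only.

```C
#include <stdint.h>

/* Layout sketch: block_idx plays the role of GetBlockIdx(), the inner loop the
 * role of Process(); each tile is dh contiguous elements at t + tile_offset. */
static void rope_layout_sketch(int nt, int nh, int dh, int stt, int sth,
                               const uint64_t *pos_ids) {
    for (int block_idx = 0; block_idx < nh; ++block_idx) {
        for (int i = 0; i < nt; ++i) {
            int tile_offset = i * stt + block_idx * sth;            /* CopyIn/CopyOut idx */
            uint64_t table_offset = pos_ids[i] * (uint64_t) dh;     /* sin/cos row start  */
            /* dh elements are staged from t + tile_offset, rotated with the
               sin/cos rows at table_offset, then written back in place. */
            (void) tile_offset;
            (void) table_offset;
        }
    }
}
```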
b/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc new file mode 100644 index 00000000..c5c51449 --- /dev/null +++ b/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc @@ -0,0 +1,74 @@ +#include "rotary_embedding_bang.h" +#include "../../utils.h" + + +infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle, + RoPEBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int stride_0 = static_cast(t->strides[0]); + int stride_1 = static_cast(t->strides[1]); + *desc_ptr = new RoPEBangDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + stride_0, stride_1}; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.h b/src/ops/rotary_embedding/bang/rotary_embedding_bang.h new file mode 100644 index 00000000..4ede6d33 --- /dev/null +++ b/src/ops/rotary_embedding/bang/rotary_embedding_bang.h @@ -0,0 +1,44 @@ +#ifndef __BANG_ROTARY_EMBEDDING_H__ +#define __BANG_ROTARY_EMBEDDING_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct RoPEBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int stride_0; + int stride_1; +}; + + +typedef struct RoPEBangDescriptor *RoPEBangDescriptor_t; + +infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle, + RoPEBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRoPE(RoPEBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc); + + +#endif// __BANG_RMS_NORM_H__ diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu b/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu new file mode 100644 index 00000000..b7d3658e --- /dev/null +++ 
b/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu @@ -0,0 +1,451 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "rotary_embedding_bang.h" +#include "../../../devices/bang/common_bang.h" +#include "../../utils.h" + +const int SRC_MAX_SIZE = 1024 * 8;//8 = 256/32 +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void RoPE(T *destination, uint64_t const *pos_ids, float const *sin_table, float const *cos_table, int stride_0, int stride_1, int nt, int nh, int dimsize) {//axis=-1 + + const int maxNum = SRC_MAX_SIZE/sizeof(float); + + int othersize = nt * nh; + + int segsize = sizeof(T); + int srcStrideL = 2 * sizeof(T); + int destStrideL = 1 * sizeof(T); + + int srcStrideW = 1 * sizeof(T); + int destStrideW = 2 * sizeof(T); + + int segsize_table = sizeof(float); + int srcStrideL_table = 2 * sizeof(float); + int destStrideL_table = 1 * sizeof(float); + + + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); + + if(nt < maxNum){ + char *nram_buffer1 = nram_buffer + nt * sizeof(uint64_t); + uint64_t *srcP = (uint64_t *)nram_buffer;//[nt] + + __memcpy(srcP, pos_ids, nt * sizeof(uint64_t), GDRAM2NRAM); + + if(dimsize >= maxNum){ + int dSize = 2 * maxNum; + char *nram_buffer2 = nram_buffer1 + (2 * dSize + 14 * maxNum) * sizeof(float); + float *srcSin = (float *)nram_buffer1;//[dSize] + float *srcCos = srcSin + dSize;//[dSize] + float *sin0 = srcCos + dSize;//[3 * maxNum] + float *cos0 = sin0 + 3 * maxNum;//[3 * maxNum] + float *sin1 = cos0 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *cos1 = sin1 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *tmpa = cos1 + 3 * maxNum;//[maxNum] + float *tmpb = tmpa + maxNum;//[maxNum] + + + T *srca = (T *)nram_buffer2;//[maxNum] + T *srcb = srca + maxNum;//[3 * maxNum] + T *src = srcb + 3 * maxNum;//[dSize] + + + int segnum = 2 * maxNum; + + int remain = dimsize % dSize; + int repeat = (dimsize - remain) / dSize; + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + int index = srcP[(indi % nt)] * dimsize; + for(int s = 0; s < repeat; s++){ + __memcpy(srcSin, sin_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + s * dSize, dSize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, 
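/* srca/srcb hold the even- and odd-indexed halves of the tile (gathered by the
   stride-2 __memcpy calls above) and are promoted to float in tmpa/tmpb. The
   rotation being completed here mirrors the CPU reference rotary_embedding_cpu_f16:
       out[2k]   = x[2k] * cos[2k]   - x[2k+1] * sin[2k]
       out[2k+1] = x[2k] * sin[2k+1] + x[2k+1] * cos[2k+1]
   with the even results accumulated in cos0 and the odd results in cos1, before
   both are cast back to half and re-interleaved into src for the copy to GDRAM. */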
sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd + s * dSize, src, dSize * sizeof(T), NRAM2GDRAM); + + + } + if(remain){ + __memcpy(srcSin, sin_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + repeat * dSize, remain * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(destination + indd + repeat * dSize, src, remain * sizeof(T), NRAM2GDRAM); + + + } + } + + } + else{ + + int segnum = dimsize; + int dh = dimsize / 2; + + char *nram_buffer2 = nram_buffer1 + (2 * dimsize + 14 * dh) * sizeof(float); + float *srcSin = (float *)nram_buffer1;//[dimsize] + float *srcCos = srcSin + dimsize;//[dimsize] + float *sin0 = srcCos + dimsize;//[dh] + float *cos0 = sin0 + 3 * dh;//[dh] + float *sin1 = cos0 + 3 * dh;//[dh] + float *cos1 = sin1 + 3 * dh;//[dh] + float *tmpa = cos1 + 3 * dh;//[dh] + float *tmpb = tmpa + dh;//[dh] + + T *srca = (T *)nram_buffer2;//[dh] + T *srcb = srca + dh;//[dh] + T *src = srcb + 3 * dh;//[dimsize] + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + + int index = srcP[(indi % nt)] * dimsize; + + __memcpy(srcSin, sin_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(srcCos, cos_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(src, destination + indd, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + + __bang_half2float(tmpa, srca, dh); + __bang_half2float(tmpb, srcb, dh); + + + + __bang_mul(cos0, tmpa, cos0, dh); + __bang_mul(sin0, tmpb, sin0, dh); + __bang_sub(cos0, cos0, sin0, 
dh);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, dh); + __bang_mul(cos1, tmpb, cos1, dh); + __bang_add(cos1, sin1, cos1, dh); + + __bang_float2half_dn(srca, cos0, dh); + __bang_float2half_dn(srcb, cos1, dh); + + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + + + + } + + } + } + else{ + + if(dimsize >= maxNum){ + int dSize = 2 * maxNum; + char *nram_buffer1 = nram_buffer + (2 * dSize + 14 * maxNum) * sizeof(float); + float *srcSin = (float *)nram_buffer;//[dSize] + float *srcCos = srcSin + dSize;//[dSize] + float *sin0 = srcCos + dSize;//[3 *maxNum] + float *cos0 = sin0 + 3 * maxNum;//[3 * maxNum] + float *sin1 = cos0 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *cos1 = sin1 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *tmpa = cos1 + 3 * maxNum;//[maxNum] + float *tmpb = tmpa + maxNum;//[maxNum] + + + T *srca = (T *)nram_buffer1;//[maxNum] + T *srcb = srca + maxNum;//[3 * maxNum] + T *src = srcb + 3 * maxNum;//[dSize] + + + int segnum = 2 * maxNum; + + int remain = dimsize % dSize; + int repeat = (dimsize - remain) / dSize; + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + int index = pos_ids[(indi % nt)] * dimsize; + for(int s = 0; s < repeat; s++){ + __memcpy(srcSin, sin_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + s * dSize, dSize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd + s * dSize, src, dSize * sizeof(T), NRAM2GDRAM); + + + } + if(remain){ + __memcpy(srcSin, sin_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + repeat * dSize, remain * sizeof(T), 
GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(destination + indd + repeat * dSize, src, remain * sizeof(T), NRAM2GDRAM); + + + } + } + + } + else{ + + int segnum = dimsize; + int dh = dimsize / 2; + + char *nram_buffer1 = nram_buffer + (2 * dimsize + 14 * dh) * sizeof(float); + float *srcSin = (float *)nram_buffer;//[dimsize] + float *srcCos = srcSin + dimsize;//[dimsize] + float *sin0 = srcCos + dimsize;//[dh] + float *cos0 = sin0 + 3 * dh;//[dh] + float *sin1 = cos0 + 3 * dh;//[dh] + float *cos1 = sin1 + 3 * dh;//[dh] + float *tmpa = cos1 + 3 * dh;//[dh] + float *tmpb = tmpa + dh;//[dh] + + T *srca = (T *)nram_buffer1;//[dh] + T *srcb = srca + dh;//[dh] + T *src = srcb + 3 * dh;//[dimsize] + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + + int index = pos_ids[(indi % nt)] * dimsize; + + __memcpy(srcSin, sin_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(srcCos, cos_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(src, destination + indd, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + + __bang_half2float(tmpa, srca, dh); + __bang_half2float(tmpb, srcb, dh); + + + + __bang_mul(cos0, tmpa, cos0, dh); + __bang_mul(sin0, tmpb, sin0, dh); + __bang_sub(cos0, cos0, sin0, dh);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, dh); + __bang_mul(cos1, tmpb, cos1, dh); + __bang_add(cos1, sin1, cos1, dh); + + __bang_float2half_dn(srca, cos0, dh); + __bang_float2half_dn(srcb, cos1, dh); + + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + + + + } + + } + } + +} + +template +void RoPEUnion(cnrtQueue_t queue, void *destination, void const *pos_ids, void const *sin_table, void const *cos_table, int stride_0, int stride_1, int nt, int nh, int dimsize) { + + auto pos_ = reinterpret_cast(pos_ids); + auto sin_ = reinterpret_cast(sin_table); + auto cos_ = reinterpret_cast(cos_table); + auto t_ = reinterpret_cast(destination); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = 
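/* Fixed launch configuration: k_dim = {4, 1, 1} with a UNION1 task type, i.e. the
   kernel is dispatched as 4 parallel tasks. Inside the kernel, each task handles a
   contiguous chunk of the othersize = nt * nh (token, head) rows, with the chunk
   bounds derived from taskId and taskDim. */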
CNRT_FUNC_TYPE_UNION1; + + + RoPE<<>>(t_, pos_, sin_, cos_, stride_0, stride_1, nt, nh, dimsize); + cnrtQueueSync(queue); + + + +} + + +void RoPE_bang_f16(RoPEBangDescriptor_t desc, void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, void *stream) { + auto queue = reinterpret_cast(stream); + int nt = static_cast(desc->seq_len); + int nh = static_cast(desc->nhead); + int dimsize = static_cast(desc->dim); + auto stride_0 = desc->stride_0; + auto stride_1 = desc->stride_1; + + RoPEUnion(queue, t, pos_ids, sin_table, cos_table, stride_0, stride_1, nt, nh, dimsize); + +} + +infiniopStatus_t bangRoPE(RoPEBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + RoPE_bang_f16(desc, t, + pos_ids, + sin_table, + cos_table, stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc b/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc deleted file mode 100644 index c6d66faa..00000000 --- a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include "rotary_embedding_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RotaryEmbeddingBangDescriptor::RotaryEmbeddingBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rotary_embedding_cnnl_f16(RotaryEmbeddingBangDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); - ASSERT_EQ(pos.layout->shape[0], t.layout->shape[0]); - - auto nt = static_cast(t.layout->shape[0]), - nh = static_cast(t.layout->shape[1]), - dh = static_cast(t.layout->shape[2]); - - int inDim[4] = {nt, 1, nh, dh}; - int inDimStride[4] = {static_cast(t.layout->strides[0] / t.layout->dt.size), - 0, - static_cast(t.layout->strides[1] / t.layout->dt.size), - static_cast(t.layout->strides[2] / t.layout->dt.size)}; - int posDim[2] = {nt, 1}; - int thetaDim[2] = {1, dh / 2}; - int freqDim[2] = {nt, dh / 2}; - int freqConcatDim[2] = {nt, dh}; - int scalerDim[1] = {1}; - - cnnlTensorDescriptor_t inDesc, posDesc, thetaDesc, freqDesc, freqConcatDesc, scalerDesc; - cnnlCreateTensorDescriptor(&inDesc); - cnnlCreateTensorDescriptor(&posDesc); - cnnlCreateTensorDescriptor(&thetaDesc); - cnnlCreateTensorDescriptor(&freqDesc); - cnnlCreateTensorDescriptor(&freqConcatDesc); - cnnlCreateTensorDescriptor(&scalerDesc); - - cnnlSetTensorDescriptor(posDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32, 2, posDim); - cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, 4, inDim, inDimStride); - cnnlSetTensorDescriptor(thetaDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, thetaDim); - cnnlSetTensorDescriptor(freqDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, freqDim); - cnnlSetTensorDescriptor(freqConcatDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, freqConcatDim); - cnnlSetTensorDescriptor(scalerDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 1, scalerDim); - - void *thetaData, *freqData, *freqConcatData, *scalerData; - cnrtMalloc(&thetaData, dh / 2 * sizeof(float) + nt * dh / 2 * 
sizeof(float) + nt * dh * sizeof(float) + sizeof(float)); - freqData = static_cast(thetaData) + dh / 2 * sizeof(float); - freqConcatData = static_cast(freqData) + nt * dh / 2 * sizeof(float); - scalerData = static_cast(freqConcatData) + nt * dh * sizeof(float); - - void *powWorkspace, *outerWorkspace, *concatWorkspace; - float zero = 0.0f, one = 1.0f; - float scaler = -2.0f / dh; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - cnrtMemcpy(scalerData, &scaler, sizeof(float), cnrtMemcpyHostToDev); - - void *workspace; - size_t workspaceSize = 0; - size_t powWorkspaceSize; - cnnlGetPowWorkspaceSize(handle, scalerDesc, thetaDesc, - thetaDesc, &powWorkspaceSize); - workspaceSize += powWorkspaceSize; - - // Use Broadcast Mul to calc t * theta_n - size_t outerWorkspaceSize; - cnnlGetOpTensorWorkspaceSize_v2(handle, descriptor->outerDesc, &one, - posDesc, pos.data, - &one, thetaDesc, thetaData, - &zero, freqDesc, freqData, - &outerWorkspaceSize); - workspaceSize += outerWorkspaceSize; - - // Concat two freqs to get [freq, freq] - size_t concatWorkspaceSize; - cnnlGetConcatWorkspaceSize(handle, 2, &concatWorkspaceSize); - workspaceSize += concatWorkspaceSize; - - cnrtMalloc(&workspace, workspaceSize); - powWorkspace = workspace; - outerWorkspace = static_cast(powWorkspace) + powWorkspaceSize; - concatWorkspace = static_cast(outerWorkspace) + outerWorkspaceSize; - - // Use Arange to get [0, 1, 2, ..., dh / 2] - cnnlArange_v2(handle, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &zero, - &scaler, thetaDesc, thetaData); - - // Use PowR to calc ((theta)^(-2/d))^n - cnrtMemcpy(scalerData, &theta, sizeof(float), cnrtMemcpyHostToDev); - - - cnnlPow(handle, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, - scalerDesc, scalerData, thetaDesc, thetaData, - powWorkspace, powWorkspaceSize, thetaDesc, thetaData); - - - cnnlOpTensor(handle, descriptor->outerDesc, &one, - posDesc, pos.data, - &one, thetaDesc, thetaData, - outerWorkspace, outerWorkspaceSize, - &zero, freqDesc, freqData); - - - cnnlTensorDescriptor_t concatDescs[2] = {freqDesc, freqDesc}; - void *const concatData[2] = {freqData, freqData}; - - cnnlConcat(handle, 2, -1, concatDescs, concatData, - concatWorkspace, concatWorkspaceSize, - freqConcatDesc, freqConcatData); - - // Do RotaryEmbedding with t(fp16) and [freq, freq](fp32) - cnnlRotaryEmbedding_v2(handle, descriptor->ropeDesc, inDesc, t.data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - freqConcatDesc, freqConcatData, - nullptr, nullptr, nullptr, 0, - inDesc, t.data, nullptr, nullptr); - }); - - cnrtFree(thetaData); - cnrtFree(powWorkspace); - - cnnlDestroyTensorDescriptor(inDesc); - cnnlDestroyTensorDescriptor(posDesc); - cnnlDestroyTensorDescriptor(thetaDesc); - cnnlDestroyTensorDescriptor(freqDesc); - cnnlDestroyTensorDescriptor(freqConcatDesc); - cnnlDestroyTensorDescriptor(scalerDesc); -} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h b/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h deleted file mode 100644 index a83a525d..00000000 --- a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __CNNL_ROTARY_EMBEDDING_H__ -#define __CNNL_ROTARY_EMBEDDING_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RotaryEmbeddingBangDescriptor { - Device device; - cnnlOpTensorDescriptor_t outerDesc; - cnnlRotaryEmbeddingDescriptor_t ropeDesc; - - RotaryEmbeddingBangDescriptor(Device device); - void createCnnlDescriptors() { - cnnlCreateOpTensorDescriptor(&outerDesc); - 
cnnlCreateRotaryEmbeddingDescriptor(&ropeDesc); - cnnlSetOpTensorDescriptor(outerDesc, CNNL_OP_TENSOR_MUL, - CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN); - cnnlSetRotaryEmbeddingDescriptor_v2(ropeDesc, false, true, - false, false, CNNL_SEQDATA_TNBC); - } - void destroyCnnlDescriptors() { - cnnlDestroyOpTensorDescriptor(outerDesc); - cnnlDestroyRotaryEmbeddingDescriptor(ropeDesc); - } -}; - -void rotary_embedding_cnnl_f16(RotaryEmbeddingBangDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream); - -#endif// __CNNL_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc index 31c26de0..f433ed20 100644 --- a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc +++ b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc @@ -3,33 +3,136 @@ #include "../../utils.h" #include -void rotary_embedding_cpu_f16(Tensor t, Tensor pos, float theta) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); +struct RoPECpuDescriptor { + Device device; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; - auto nt = t.layout->shape[0], - nh = t.layout->shape[1], - dh = t.layout->shape[2] / 2; +void rotary_embedding_cpu_f16(RoPECpuDescriptor_t desc, + void *t, + uint64_t const *pos_ids, + float const *sin_table, + float const *cos_table) { + auto nt = desc->seq_len, + nh = desc->nhead, + dim = desc->dim, + dk = dim / 2; - ASSERT_EQ(pos.layout->shape[0], nt); - - auto stride_0 = t.layout->strides[0]; - auto stride_1 = t.layout->strides[1]; + auto stride_0 = desc->strides[0]; + auto stride_1 = desc->strides[1]; for (int i = 0; i < nt; ++i) { - auto pos_ = reinterpret_cast(pos.data) + i; + auto sin_ = sin_table + pos_ids[i] * dim; + auto cos_ = cos_table + pos_ids[i] * dim; for (int j = 0; j < nh; ++j) { - auto t_ = reinterpret_cast(reinterpret_cast(t.data) + i * stride_0 + j * stride_1); - for (int k = 0; k < dh; ++k) { + auto t_ = reinterpret_cast(t) + i * stride_0 + j * stride_1; + for (int k = 0; k < dk; ++k) { auto a = f16_to_f32(t_[2 * k]); auto b = f16_to_f32(t_[2 * k + 1]); - auto pos__ = *pos_; - float freq = float(pos__) / powf(theta, float(k) / float(dh)); - float sin = sinf(freq); - float cos = cosf(freq); - t_[2 * k] = f32_to_f16(a * cos - b * sin); - t_[2 * k + 1] = f32_to_f16(a * sin + b * cos); + float sin0 = sin_[k * 2], cos0 = cos_[k * 2]; + float sin1 = sin_[k * 2 + 1], cos1 = cos_[k * 2 + 1]; + t_[2 * k] = f32_to_f16(a * cos0 - b * sin0); + t_[2 * k + 1] = f32_to_f16(a * sin1 + b * cos1); } } } } + + +infiniopStatus_t cpuCreateRoPEDescriptor(CpuHandle_t handle, + RoPECpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) 
+ return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPECpuDescriptor{ + handle->device, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuGetRoPEWorkspaceSize(RoPECpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuRoPE(RoPECpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_cpu_f16(desc, t, + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table)); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuDestroyRoPEDescriptor(RoPECpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h index 15a1831a..8957b8c5 100644 --- a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h +++ b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h @@ -2,11 +2,31 @@ #define __CPU_ROTARY_EMBEDDING_H__ #include "operators.h" +#include "../../../devices/cpu/cpu_handle.h" -struct RotaryEmbeddingCpuDescriptor { - Device device; -}; +struct RoPECpuDescriptor; + +typedef struct RoPECpuDescriptor *RoPECpuDescriptor_t; + +infiniopStatus_t cpuCreateRoPEDescriptor(CpuHandle_t handle, + RoPECpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t cpuGetRoPEWorkspaceSize(RoPECpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRoPE(RoPECpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t cpuDestroyRoPEDescriptor(RoPECpuDescriptor_t desc); -void rotary_embedding_cpu_f16(Tensor t, Tensor pos, float theta); #endif// __CPU_RMS_NORM_H__ diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cc b/src/ops/rotary_embedding/cuda/rotary_embedding.cc new file mode 100644 index 00000000..102eb474 --- /dev/null +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, + RoPECudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + 
sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPECudaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaDestroyRoPEDescriptor(RoPECudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu index 373abcb1..62579c3d 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu @@ -2,41 +2,69 @@ #include "rotary_embedding.cuh" #include -static __global__ void padding( - half2 *__restrict__ x_, - unsigned int const *__restrict__ pos_, - float const theta, - unsigned int const leading_dim) { - auto dh = blockDim.x; +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; - auto &x = x_[blockIdx.x * leading_dim + blockIdx.y * dh + k]; - auto pos = float(pos_[blockIdx.x]); + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} - float sin, cos; - sincosf(pos / powf(theta, float(k) / float(dh)), &sin, &cos); - x = x * half2(cos, cos) + half2(-x.y, x.x) * half2(sin, sin); -} +void rotary_embedding_nv_gpu_f16( + RoPECudaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; -constexpr static int - BLOCK_SIZE = 1024; + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; -void rotary_embedding_nv_gpu_f16(Tensor t, Tensor pos, float theta, void *stream) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); + auto cuda_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, sin_, cos_, stride0, stride1); +} - auto nt = t.layout->shape[0], - nh = t.layout->shape[1], - dh = t.layout->shape[2]; +infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || 
sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; - ASSERT_EQ(pos.layout->shape[0], nt); - ASSERT(dh < BLOCK_SIZE); + checkCudaError(cudaSetDevice(desc->device_id)); - auto t_ptr = reinterpret_cast(t.data); - auto pos_ptr = reinterpret_cast(pos.data); - auto leading_dim = t.layout->strides[0] / 4; + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_nv_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } - auto cuda_stream = reinterpret_cast(stream); - padding<<>>(t_ptr, pos_ptr, theta, leading_dim); + return STATUS_SUCCESS; } diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh index 83ee010e..36b14194 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh @@ -1,12 +1,40 @@ #ifndef __NV_GPU_ROTARY_EMBEDDING_H__ #define __NV_GPU_ROTARY_EMBEDDING_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" -struct RotaryEmbeddingCudaDescriptor { +struct RoPECudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; }; -void rotary_embedding_nv_gpu_f16(Tensor t, Tensor pos, float theta, void *stream); +typedef struct RoPECudaDescriptor *RoPECudaDescriptor_t; + +infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, + RoPECudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t cudaDestroyRoPEDescriptor(RoPECudaDescriptor_t desc); #endif// __NV_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc b/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc new file mode 100644 index 00000000..171f1c57 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateRoPEDescriptor(MacaHandle_t handle, + RoPEMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + 
cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEMacaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRoPEWorkspaceSize(RoPEMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t macaDestroyRoPEDescriptor(RoPEMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.h b/src/ops/rotary_embedding/maca/rotary_embedding_maca.h new file mode 100644 index 00000000..f5de3b14 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.h @@ -0,0 +1,40 @@ +#ifndef __METAX_GPU_ROTARY_EMBEDDING_H__ +#define __METAX_GPU_ROTARY_EMBEDDING_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RoPEMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; + +typedef struct RoPEMacaDescriptor *RoPEMacaDescriptor_t; + +infiniopStatus_t macaCreateRoPEDescriptor(MacaHandle_t handle, + RoPEMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t macaGetRoPEWorkspaceSize(RoPEMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRoPE(RoPEMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t macaDestroyRoPEDescriptor(RoPEMacaDescriptor_t desc); + +#endif// __METAX_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca b/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca new file mode 100644 index 00000000..aaa52250 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca @@ -0,0 +1,70 @@ +#include "../../utils.h" +#include "rotary_embedding_maca.h" +#include + +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; + auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; + + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} + + +void rotary_embedding_mc_gpu_f16( + RoPEMacaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; + + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; + + auto maca_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, 
sin_, cos_, stride0, stride1); +} + +infiniopStatus_t macaRoPE(RoPEMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + checkMacaError(hcSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_mc_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc b/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc new file mode 100644 index 00000000..9ba0547d --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateRoPEDescriptor(MusaHandle_t handle, + RoPEMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEMusaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRoPEWorkspaceSize(RoPEMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t musaDestroyRoPEDescriptor(RoPEMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.h b/src/ops/rotary_embedding/musa/rotary_embedding_musa.h new file mode 100644 index 00000000..7a14daea --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.h @@ -0,0 +1,40 @@ +#ifndef __MUSA_ROTARY_EMBEDDING_H__ +#define __MUSA_ROTARY_EMBEDDING_H__ + +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" + +struct RoPEMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; + +typedef struct RoPEMusaDescriptor *RoPEMusaDescriptor_t; + +infiniopStatus_t 
musaCreateRoPEDescriptor(MusaHandle_t handle, + RoPEMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t musaGetRoPEWorkspaceSize(RoPEMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRoPE(RoPEMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t musaDestroyRoPEDescriptor(RoPEMusaDescriptor_t desc); + +#endif// __MT_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu b/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu new file mode 100644 index 00000000..bac7ad47 --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu @@ -0,0 +1,68 @@ +#include "../../utils.h" +#include "rotary_embedding_musa.h" +#include + +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; + auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; + + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} + + +void rotary_embedding_mt_gpu_f16( + RoPEMusaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; + + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; + + auto musa_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, sin_, cos_, stride0, stride1); +} + +infiniopStatus_t musaRoPE(RoPEMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_mt_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/operator.cc b/src/ops/rotary_embedding/operator.cc index dcfd1282..bc2dbc09 100644 --- a/src/ops/rotary_embedding/operator.cc +++ b/src/ops/rotary_embedding/operator.cc @@ -2,85 +2,209 @@ #include "ops/rotary_embedding/rotary_embedding.h" #ifdef ENABLE_CPU +#include "../../devices/cpu/cpu_handle.h" #include "cpu/rotary_embedding_cpu.h" #endif #ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" #include "cuda/rotary_embedding.cuh" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/rotary_embedding_cnnl.h" +#include "bang/rotary_embedding_bang.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rotary_embedding.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rotary_embedding_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include 
"musa/rotary_embedding_musa.h" #endif -struct RotaryEmbeddingDescriptor { +struct RoPEDescriptor { Device device; }; -__C void *createRotaryEmbeddingDescriptor(Device device, void *config) { - switch (device) { + +__C infiniopStatus_t infiniopCreateRoPEDescriptor(infiniopHandle_t handle, + infiniopRoPEDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + switch (handle->device) { #ifdef ENABLE_CPU case DevCpu: - return (RotaryEmbeddingDescriptor *) (new RotaryEmbeddingCpuDescriptor{device}); + return cpuCreateRoPEDescriptor((CpuHandle_t) handle, (RoPECpuDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (RotaryEmbeddingDescriptor *) (new RotaryEmbeddingCudaDescriptor{device}); + case DevNvGpu: { + return cudaCreateRoPEDescriptor((CudaHandle_t) handle, (RoPECudaDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = new RotaryEmbeddingBangDescriptor(device); - bangDescriptor->createCnnlDescriptors(); - return (RotaryEmbeddingDescriptor *) (bangDescriptor); + return bangCreateRoPEDescriptor((BangHandle_t) handle, (RoPEBangDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendCreateRoPEDescriptor((AscendHandle_t) handle, + (RoPEAscendDescriptor_t *) desc_ptr, + t, + pos_ids, + sin_table, + cos_table); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRoPEDescriptor((MacaHandle_t) handle, + (RoPEMacaDescriptor_t *) desc_ptr, + t, + pos_ids, + sin_table, + cos_table); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRoPEDescriptor((MusaHandle_t) handle, (RoPEMusaDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; -}; + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetRoPEWorkspaceSize((RoPECudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendGetRoPEWorkspaceSize((RoPEAscendDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRoPEWorkspaceSize((RoPEMacaDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C void destroyRotaryEmbeddingDescriptor(RotaryEmbeddingDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (RotaryEmbeddingCpuDescriptor *) (descriptor); - break; + return cpuRoPE((RoPECpuDescriptor_t) desc, workspace, workspace_size, t, pos_ids, 
sin_table, cos_table, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (RotaryEmbeddingCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaRoPE((RoPECudaDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = (RotaryEmbeddingBangDescriptor *) (descriptor); - bangDescriptor->destroyCnnlDescriptors(); - delete bangDescriptor; - break; + return bangRoPE((RoPEBangDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendRoPE((RoPEAscendDescriptor_t) desc, + workspace, + workspace_size, + t, + pos_ids, + sin_table, + cos_table, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRoPE((RoPEMacaDescriptor_t) desc, + workspace, + workspace_size, + t, + pos_ids, + sin_table, + cos_table, + stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRoPE((RoPEMusaDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void rotaryEmbedding(RotaryEmbeddingDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - rotary_embedding_cpu_f16(t, pos, theta); - break; + return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - rotary_embedding_nv_gpu_f16(t, pos, theta, stream); - break; + case DevNvGpu: { + return cudaDestroyRoPEDescriptor((RoPECudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - rotary_embedding_cnnl_f16((RotaryEmbeddingBangDescriptor *) (descriptor), t, pos, theta, stream); - break; + case DevCambriconMlu: { + return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendDestroyRoPEDescriptor((RoPEAscendDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRoPEDescriptor((RoPEMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t) desc); + } #endif - default: - PANIC(UnsupportedDevice); } -}; + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/swiglu/ascend/swiglu.cc b/src/ops/swiglu/ascend/swiglu.cc new file mode 100644 index 00000000..ff2ee514 --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu.cc @@ -0,0 +1,71 @@ +#include "swiglu.h" + + +infiniopStatus_t ascendCreateSwiGLUDescriptor(AscendHandle_t handle, + SwiGLUAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + uint64_t ndim = c_desc->ndim; + DT dtype = c_desc->dt; + + aclDataType dt; + if (dtype_eq(dtype, F16)) { + dt = aclDataType::ACL_FLOAT16; + } else if (dtype_eq(dtype, F32)) { + dt = aclDataType::ACL_FLOAT; + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (c_desc->strides[1] != 1 || a_desc->strides[1] != 1 || b_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int32_t seq_len = 
static_cast(c_desc->shape[0]), + di = static_cast(c_desc->shape[1]); + + int32_t sta = static_cast(a_desc->strides[0]); + int32_t stb = static_cast(b_desc->strides[0]); + int32_t stc = static_cast(c_desc->strides[0]); + + *desc_ptr = new SwiGLUAscendDescriptor{ + handle->device, + handle->device_id, + dt, + seq_len, + di, + sta, + stb, + stc}; + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendSwiGLU(SwiGLUAscendDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto seq_len = desc->seq_len, + di = desc->di; + + auto sta = desc->sta, + stb = desc->stb, + stc = desc->stc; + + auto dt = desc->dtype; + + // Set device + aclrtSetDevice(desc->device_id); + + return swiglu_kernel_do(c, (void *) a, (void *) b, 1.0, seq_len, di, sta, stb, stc, dt, stream); +} + +infiniopStatus_t ascendDestroySwiGLUDescriptor(SwiGLUAscendDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/ascend/swiglu.h b/src/ops/swiglu/ascend/swiglu.h new file mode 100644 index 00000000..be02a318 --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu.h @@ -0,0 +1,45 @@ +#ifndef __ACLNN_SWIGLU_H__ +#define __ACLNN_SWIGLU_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include "../../utils.h" +#include +#include + + +struct SwiGLUAscendDescriptor { + Device device; + int device_id; + aclDataType dtype; + int32_t seq_len; + int32_t di; + int32_t sta; + int32_t stb; + int32_t stc; +}; + +typedef struct SwiGLUAscendDescriptor *SwiGLUAscendDescriptor_t; + +infiniopStatus_t ascendCreateSwiGLUDescriptor(AscendHandle_t handle, + SwiGLUAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t ascendSwiGLU(SwiGLUAscendDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t ascendDestroySwiGLUDescriptor(SwiGLUAscendDescriptor_t desc); + +extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void *stream); + +#endif diff --git a/src/ops/swiglu/ascend/swiglu_kernel.cpp b/src/ops/swiglu/ascend/swiglu_kernel.cpp new file mode 100644 index 00000000..3dab674f --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu_kernel.cpp @@ -0,0 +1,181 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BLOCK_NUM = 8; + +template class KernelSwiGLU { +public: + __aicore__ inline KernelSwiGLU() {} + // Init SwiGLU + // c output tensor, support only 2 dim + // a up tensor + // b gate tensor + // formular: b = a x silu(b) + // a, b, c has same tensor shape + __aicore__ inline void Init(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base); + __aicore__ inline void Process(); + +private: + __aicore__ inline void CopyIn(int32_t i); + __aicore__ inline void Compute(int32_t i); + __aicore__ inline void CopyOut(int32_t i); + +private: + TPipe pipe; + TQue aQue; + TQue bQue; + TQue cQue; + // Used in GatherMask + // TBuf outBuf; + + GlobalTensor aGm; + GlobalTensor bGm; + GlobalTensor cGm; + + uint32_t _block_idx; + uint32_t _tile_len; + uint32_t _copy_len; + + // c[nt, dh] + // strides = [stx, 1] + int32_t nt; + int32_t dh; + int32_t sta; + 
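/* sta, stb and stc are the row strides, in elements, of the up tensor a, the gate
   tensor b and the output c (strides[0] of each descriptor, see
   ascendCreateSwiGLUDescriptor). Each of the nt rows is tiled across the launch
   blocks: _tile_len is base = dh / BLOCK_NUM elements, plus one extra element for
   the first dh % BLOCK_NUM blocks. */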
int32_t stb; + int32_t stc; + float beta; +}; + + +template +__aicore__ inline void KernelSwiGLU::Init(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + + this->nt = nt; + this->dh = dh; + this->beta = beta; + this->sta = sta; + this->stb = stb; + this->stc = stc; + + _block_idx = GetBlockIdx(); + _tile_len = _block_idx < remainder ? base + 1 : base; + _copy_len = _tile_len * sizeof(T) % 32 == 0 + ? _tile_len + : (_tile_len * sizeof(T) + 31) / 32 * 32 / sizeof(T); + + // Set global tensor + aGm.SetGlobalBuffer((__gm__ T *) a); + bGm.SetGlobalBuffer((__gm__ T *) b); + cGm.SetGlobalBuffer((__gm__ T *) c); + + // Pipe alloc memory to queue, the unit is bytes + pipe.InitBuffer(aQue, BUFFER_NUM, _copy_len * sizeof(T)); + pipe.InitBuffer(bQue, BUFFER_NUM, _copy_len * sizeof(T)); + pipe.InitBuffer(cQue, BUFFER_NUM, _copy_len * sizeof(T)); +} + +template +__aicore__ inline void KernelSwiGLU::CopyIn(int32_t i) { + // Alloc tensor from queue memory + LocalTensor aUb = aQue.AllocTensor(); + LocalTensor bUb = bQue.AllocTensor(); + // Get idx of current tile + auto idxa = i * sta + _block_idx * _tile_len; + auto idxb = i * stb + _block_idx * _tile_len; + // Copy process_th tile from global tensor to local tensor + // See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/opdevgapi/atlasascendc_api_07_0105.html + // DataCopy cut down if _tile_len * sizeof(T) / 32 != 0 + DataCopy(aUb, aGm[idxa], _copy_len); + DataCopy(bUb, bGm[idxb], _copy_len); + + // Enque input tensor to VECIN queue + aQue.EnQue(aUb); + bQue.EnQue(bUb); +} + +template +__aicore__ inline void KernelSwiGLU::Compute(int32_t i) { + // Deque input tensors from VECIN queue + LocalTensor aUb = aQue.DeQue(); + LocalTensor bUb = bQue.DeQue(); + LocalTensor cUb = cQue.AllocTensor(); + // Call SwiGLU ascend api + SwiGLU(cUb, aUb, bUb, beta); + // Enque result and free input + cQue.EnQue(cUb); + aQue.FreeTensor(aUb); + bQue.FreeTensor(bUb); +} + +template +__aicore__ inline void KernelSwiGLU::CopyOut(int32_t i) { + // Deque output tensor from VECOUT queue + LocalTensor cUb = cQue.DeQue(); + auto idxc = i * stc + _block_idx * _tile_len; + // Copy progress_th tile from local tensor to global tensor + // Use Gather mask if _tile_len * sizeof(T) % 32 != 0 + if (_tile_len * sizeof(T) % 32 != 0) { + DataCopyExtParams dcep = {1, static_cast(_tile_len * sizeof(T)), 0, 0, 0}; + DataCopyPad(cGm[idxc], cUb, dcep); + } + DataCopy(cGm[idxc], cUb, _tile_len); + // Free output Local tensor + cQue.FreeTensor(cUb); +} + +template +__aicore__ inline void KernelSwiGLU::Process() { + for (int32_t i = 0; i < nt; ++i) { + CopyIn(i); + Compute(i); + CopyOut(i); + } +} + +__global__ __aicore__ void swiglu_kernel_f16(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + KernelSwiGLU op; + op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + op.Process(); +} + +__global__ __aicore__ void swiglu_kernel_f32(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + KernelSwiGLU op; + op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + op.Process(); +} + +extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void 
*stream) { + + // Tiling params + auto base = static_cast(dh / BLOCK_NUM); + auto remainder = static_cast(dh % BLOCK_NUM); + + switch (dtype) { + case 0: + swiglu_kernel_f32<<>>( + c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + return STATUS_SUCCESS; + case 1: + swiglu_kernel_f16<<>>( + c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/bang/swiglu_bang.cc b/src/ops/swiglu/bang/swiglu_bang.cc new file mode 100644 index 00000000..7654bf4f --- /dev/null +++ b/src/ops/swiglu/bang/swiglu_bang.cc @@ -0,0 +1,50 @@ +#include "swiglu_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateSwiGLUDescriptor(BangHandle_t handle, + SwiGLUBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUBangDescriptor{handle->device, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroySwiGLUDescriptor(SwiGLUBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/bang/swiglu_bang.h b/src/ops/swiglu/bang/swiglu_bang.h index 7e81ebee..5eabc103 100644 --- a/src/ops/swiglu/bang/swiglu_bang.h +++ b/src/ops/swiglu/bang/swiglu_bang.h @@ -1,10 +1,35 @@ #ifndef __BANG_SWIGLU_H__ #define __BANG_SWIGLU_H__ +#include "../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" #include "operators.h" -void swiglu_bang_f16(Tensor gate, Tensor up, void *stream); +struct SwiGLUBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUBangDescriptor *SwiGLUBangDescriptor_t; + +infiniopStatus_t bangCreateSwiGLUDescriptor(BangHandle_t handle, + SwiGLUBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t bangSwiGLU(SwiGLUBangDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t bangDestroySwiGLUDescriptor(SwiGLUBangDescriptor_t desc); #endif// __BANG_SWIGLU_H__ diff --git a/src/ops/swiglu/bang/swiglu_bang.mlu b/src/ops/swiglu/bang/swiglu_bang.mlu index e1323236..b43c5e10 100644 --- a/src/ops/swiglu/bang/swiglu_bang.mlu +++ b/src/ops/swiglu/bang/swiglu_bang.mlu @@ -3,125 +3,20 @@ #include "cnrt.h" #include "swiglu_bang.h" #include "../../../devices/bang/common_bang.h" -const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 -__nram__ char nram_buffer[NRAM_MAX_SIZE]; +#include "../../utils.h" -template -__mlu_device__ void swigluKernel(T *gate, int *gate_stride, T const 
*up, int *up_stride, int *shape, int othersize, int dimsize, int ndim){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - - if(dimsize >= maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * up_stride[j]; - indd += (indi % shape[j]) * gate_stride[j]; - indi /= shape[j]; - } - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{ - T *src = (T *)nram_buffer;//[dimsize] - T *dest = src + dimsize; //[dimsize] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * up_stride[j]; - indd += (indi % shape[j]) * gate_stride[j]; - indi /= shape[j]; - } - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate - - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - - } -} -template -__mlu_global__ void swigluUnion1(T *gate, int *gate_stride, T const *up, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { +const int SRC_MAX_SIZE = 1024 * 32;//至少大于等于128字节 +__nram__ char nram_buffer[NRAM_MAX_SIZE]; - swigluKernel(gate, gate_stride, up, up_stride, shape, othersize, dimsize, ndim); -} -template -void swiglu(cnrtQueue_t queue, void *gate, int *gate_stride, void const *up, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - - swigluUnion1<<>>(y_, gate_stride, x_, up_stride, shape, othersize, dimsize, ndim); - // cnrtQueueSync(queue); - -} -void swiglu_fp16(cnrtQueue_t queue, void *gate, void *up, int *gate_stride, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { - - swiglu(queue, gate, gate_stride, up, up_stride, shape, othersize, dimsize, ndim); - -} template -__mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD_f, int othersize, int dimsize){ +__mlu_global__ void swigluDim_2(T const *a_, T const *b_, T *c_, int stride_a, int stride_b, int stride_c, int othersize, int dimsize){ const int maxNum = SRC_MAX_SIZE/sizeof(T); if(dimsize >= maxNum){ T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] + T *dest = src + 3 * maxNum; //[maxNum] int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; @@ -130,33 +25,46 @@ __mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int tid_a; + int tid_b; + int tid_c; for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; + int ind_a = 0; + int ind_b = 0; + int ind_c = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + ind_a += (indi % othersize) * stride_a; + ind_b += (indi % othersize) * stride_b; + ind_c += (indi % othersize) * stride_c; + for(int s = 0; s < repeat + 2; s++){ + + if(s < repeat){ + tid_a = ind_a + s * maxNum; + tid_b = ind_b + s * maxNum; + __memcpy_async(src + s % 3 * maxNum, a_ + tid_a, maxNum * sizeof(T), 
GDRAM2NRAM); + __memcpy_async(dest + s % 3 * maxNum, b_ + tid_b, maxNum * sizeof(T), GDRAM2NRAM); + } + if(s > 0 && s < repeat + 1){ + __bang_mul(src + (s - 1) % 3 * maxNum, src + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//a_ = a_ * b_ + __bang_active_sigmoid(dest + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//b_ = sigmoid(b_) + __bang_mul(src + (s - 1) % 3 * maxNum, src + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//a_ = a_ * b_ + } + if(s > 1){ + tid_c = ind_c + (s - 2) * maxNum; + __memcpy_async(c_ + tid_c, src + (s - 2) % 3 * maxNum, maxNum * sizeof(T), NRAM2GDRAM); + } + __sync_all_ipu(); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); + tid_a = ind_a + repeat * maxNum; + tid_b = ind_b + repeat * maxNum; + tid_c = ind_c + repeat * maxNum; + __memcpy(src, a_ + tid_a, remain * sizeof(T), GDRAM2NRAM); + __memcpy(dest, b_ + tid_b, remain * sizeof(T), GDRAM2NRAM); + __bang_mul(src, src, dest, remain);//a_ = a_ * b_ + __bang_active_sigmoid(dest, dest, remain);//b_ = sigmoid(b_) + __bang_mul(src, src, dest, remain);//a_ = a_ * b_ + __memcpy(c_ + tid_c, src, remain * sizeof(T), NRAM2GDRAM); } } } @@ -170,29 +78,32 @@ __mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; + int ind_a = 0; + int ind_b = 0; + int ind_c = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); + ind_a += (indi % othersize) * stride_a; + ind_b += (indi % othersize) * stride_b; + ind_c += (indi % othersize) * stride_c; + __memcpy(src, a_ + ind_a, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(dest, b_ + ind_b, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate + __bang_mul(src, src, dest, dimsize);//a_ = a_ * b_ + __bang_active_sigmoid(dest, dest, dimsize);//b_ = sigmoid(b_) + __bang_mul(src, src, dest, dimsize);//a_ = a_ * b_ - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + __memcpy(c_ + ind_c, src, dimsize * sizeof(T), NRAM2GDRAM); } } } template -void swigluUnionDim_2(cnrtQueue_t queue, void *gate, void const *up, int strideS_f, int strideD_f, int othersize, int dimsize) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); +void swigluUnionDim_2(cnrtQueue_t queue, void const *a, void const *b, void *c, int stride_a, int stride_b, int stride_c, int othersize, int dimsize) { + auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -201,156 +112,35 @@ void swigluUnionDim_2(cnrtQueue_t queue, void *gate, void const *up, int strideS k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - swigluDim_2<<>>(y_, x_, strideS_f, strideD_f, 
othersize, dimsize); - // cnrtQueueSync(queue); + swigluDim_2<<>>(a_, b_, c_, stride_a, stride_b, stride_c, othersize, dimsize); } -template -__mlu_global__ void swigluDim_3(T *gate, T const *up, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - int startDim = othersize / middle; - if(dimsize >= maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{ - T *src = (T *)nram_buffer;//[dimsize] - T *dest = src + dimsize; //[dimsize] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate +void swiglu_bang_f16(SwiGLUBangDescriptor_t desc, void const *a, void const *b, void *c, void *stream) { + auto queue = reinterpret_cast(stream); + auto seq_len = desc->seq_len, + di = desc->di; - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - - } -} -template -void swigluUnionDim_3(cnrtQueue_t queue, void *gate, void const *up, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - swigluDim_3<<>>(y_, x_, strideS_f, strideS_m, strideD_f, strideD_m, othersize, middle, dimsize); - // cnrtQueueSync(queue); + swigluUnionDim_2(queue, a, b, c, stride_a, stride_b, stride_c, seq_len, di); + } -void swiglu_bang_f16(Tensor gate, Tensor up, void *stream) { - auto queue = reinterpret_cast(stream); - int num = 1; - int ndim = gate.layout->ndim; - int gate_stride[ndim], up_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - gate_stride[i] = gate.layout->strides[i] / gate.layout->dt.size; - up_stride[i] = up.layout->strides[i] / up.layout->dt.size; - shape[i] = gate.layout->shape[i]; - num *= shape[i]; - } - if(ndim == 2){ - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); - auto n = gate.layout->shape[0], - d = gate.layout->shape[1]; - int strideS_f = up_stride[0]; - int strideD_f = gate_stride[0]; - swigluUnionDim_2(queue, gate.data, up.data, strideS_f, strideD_f, n, d); +infiniopStatus_t bangSwiGLU(SwiGLUBangDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream){ + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + swiglu_bang_f16(desc, a, b, c, stream); + return STATUS_SUCCESS; } - else if(ndim == 3){ - int strideS_f = up_stride[0]; - int strideD_f = gate_stride[0]; - int strideS_m = up_stride[1]; - int strideD_m = gate_stride[1]; - int middle = shape[1]; - int d = shape[ndim - 1]; - int n = num / d; - swigluUnionDim_3(queue, gate.data, up.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d); - } - else{ - int d = shape[ndim - 1]; - int n = num / d; - int *mlu_stride_gate, *mlu_stride_up, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_stride_gate, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_stride_up, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_stride_gate, gate_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_stride_up, up_stride, 
ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - - swiglu_fp16(queue, gate.data, up.data, mlu_stride_gate, mlu_stride_up, mlu_shape, n, d, ndim); - - CNRT_CHECK(cnrtFree(mlu_stride_gate)); - CNRT_CHECK(cnrtFree(mlu_stride_up)); - CNRT_CHECK(cnrtFree(mlu_shape)); - } - + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/swiglu/bang/swiglu_cnnl.cc b/src/ops/swiglu/bang/swiglu_cnnl.cc deleted file mode 100644 index 64f062b6..00000000 --- a/src/ops/swiglu/bang/swiglu_cnnl.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include "swiglu_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -SwigluBangDescriptor::SwigluBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void swiglu_cnnl_f16(SwigluBangDescriptor *descriptor, Tensor gate, Tensor up, void *stream) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); - - cnnlTensorDescriptor_t gateDesc, inDesc; - cnnlCreateTensorDescriptor(&gateDesc); - cnnlCreateTensorDescriptor(&inDesc); - - setCnnlTensor(gateDesc, gate.layout); - - std::vector dims(gate.layout->ndim); - size_t inputSizeInBytes = 1; - for (uint64_t i = 0; i < gate.layout->ndim; i++) { - dims[i] = static_cast(gate.layout->shape[i]); - inputSizeInBytes *= dims[i]; - } - dims[gate.layout->ndim - 1] *= 2; - inputSizeInBytes *= (2 * sizeof(uint16_t)); - cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, - dims.size(), dims.data()); - - void *input; - cnrtMalloc(&input, inputSizeInBytes); - - void *concatWorkspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t concatWorkspaceSize; - cnnlGetConcatWorkspaceSize(handle, 2, &concatWorkspaceSize); - cnrtMalloc(&concatWorkspace, concatWorkspaceSize); - - cnnlTensorDescriptor_t inputsDesc[2] = {gateDesc, gateDesc}; - const void *const inputsData[2] = {gate.data, up.data}; - cnnlConcat(handle, 2, -1, inputsDesc, inputsData, - concatWorkspace, concatWorkspaceSize, inDesc, input); - - cnnlBiasActivationGluForward_v2(handle, descriptor->opDesc, inDesc, input, - nullptr, nullptr, gateDesc, gate.data); - }); - - cnrtFree(concatWorkspace); - cnrtFree(input); - - cnnlDestroyTensorDescriptor(gateDesc); - cnnlDestroyTensorDescriptor(inDesc); -} diff --git a/src/ops/swiglu/bang/swiglu_cnnl.h b/src/ops/swiglu/bang/swiglu_cnnl.h deleted file mode 100644 index f729c425..00000000 --- a/src/ops/swiglu/bang/swiglu_cnnl.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __CNNL_SWIGLU_H__ -#define __CNNL_SWIGLU_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct SwigluBangDescriptor { - Device device; - cnnlActivationDescriptor_t actDesc; - cnnlBiasActivationGluDescriptor_t opDesc; - - SwigluBangDescriptor(Device device); - void createCnnlDescriptors() { - cnnlCreateActivationDescriptor(&actDesc); - cnnlCreateBiasActivationGluDescriptor(&opDesc); - cnnlSetActivationDescriptor_v6(actDesc, CNNL_ACTIVATION_SILU, - CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, - 0.0, 0, 0.0, 0.0, true, true); - cnnlSetBiasActivationGluDescriptor(opDesc, actDesc, - CNNL_BIAS_ACTIVATION_GLU_ALGO_V2); - } - void destroyCnnlDescriptors() { - cnnlDestroyActivationDescriptor(actDesc); - cnnlDestroyBiasActivationGluDescriptor(opDesc); - } -}; - -void 
swiglu_cnnl_f16(SwigluBangDescriptor *descriptor, Tensor gate, Tensor up, void *stream); - -#endif// __CNNL_SWIGLU_H__ diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cc b/src/ops/swiglu/cpu/swiglu_cpu.cc index 899f0793..4e0fd574 100644 --- a/src/ops/swiglu/cpu/swiglu_cpu.cc +++ b/src/ops/swiglu/cpu/swiglu_cpu.cc @@ -3,30 +3,89 @@ #include "../../utils.h" #include -inline float sigmoid(float x) { - return 1.0f / (1.0f + expf(-x)); + +infiniopStatus_t cpuCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUCpuDescriptor{DevCpu, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; } -void swiglu_cpu_f16(Tensor gate, Tensor up) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); +inline float silu(float x) { + return x / (1.0f + expf(-x)); +} - auto seq_len = gate.layout->shape[0], - di = gate.layout->shape[1]; +void swiglu_cpu_f16(SwiGLUCpuDescriptor_t desc, void *c, void const *a, void const *b) { - auto stride_gate = gate.layout->strides[0], - stride_up = up.layout->strides[0]; + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; for (int i = 0; i < seq_len; ++i) { - auto gate_ = reinterpret_cast(gate.data) + i * stride_gate; - auto up_ = reinterpret_cast(up.data) + i * stride_up; + auto a_ = reinterpret_cast(a) + i * stride_a; + auto b_ = reinterpret_cast(b) + i * stride_b; + auto c_ = reinterpret_cast(c) + i * stride_c; for (int j = 0; j < di; ++j) { - auto x = f16_to_f32(gate_[j]); - auto y = f16_to_f32(up_[j]); + auto a__ = f16_to_f32(a_[j]); + auto b__ = f16_to_f32(b_[j]); - gate_[j] = f32_to_f16(x * sigmoid(x) * y); + c_[j] = f32_to_f16(a__ * silu(b__)); } } } + +infiniopStatus_t cpuSwiGLU(SwiGLUCpuDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + swiglu_cpu_f16(desc, c, a, b); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} + +infiniopStatus_t cpuDestroySwiGLUDescriptor(SwiGLUCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/cpu/swiglu_cpu.h b/src/ops/swiglu/cpu/swiglu_cpu.h index 7fd768e5..a853ccf8 100644 --- a/src/ops/swiglu/cpu/swiglu_cpu.h +++ b/src/ops/swiglu/cpu/swiglu_cpu.h @@ -3,10 +3,30 @@ #include "operators.h" -struct SwigluCpuDescriptor { +struct SwiGLUCpuDescriptor { Device device; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; }; -void 
swiglu_cpu_f16(Tensor gate, Tensor up); +typedef struct SwiGLUCpuDescriptor *SwiGLUCpuDescriptor_t; + +infiniopStatus_t cpuCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t cpuSwiGLU(SwiGLUCpuDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cpuDestroySwiGLUDescriptor(SwiGLUCpuDescriptor_t desc); #endif// __CPU_SWIGLU_H__ diff --git a/src/ops/swiglu/cuda/swiglu.cu b/src/ops/swiglu/cuda/swiglu.cu index aa55e63d..fdd3f16b 100644 --- a/src/ops/swiglu/cuda/swiglu.cu +++ b/src/ops/swiglu/cuda/swiglu.cu @@ -1,9 +1,10 @@ +#include "../../../devices/cuda/common_cuda.h" #include "../../utils.h" #include "swiglu.cuh" #include -static __forceinline__ __device__ float sigmoid(float x) { - return fdividef(1, 1 + expf(-x)); +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); } inline int gcd(int a, int b) { @@ -16,37 +17,54 @@ inline int gcd(int a, int b) { } template -static __global__ void swiglu( - Tdata *__restrict__ gate_, - int const stride_gate, - Tdata const *__restrict__ up_, - int const stride_up) { - auto i = blockIdx.y * stride_gate + blockIdx.x * blockDim.x + threadIdx.x, - j = blockIdx.y * stride_up + blockIdx.x * blockDim.x + threadIdx.x; - auto x = float(gate_[i]), - y = float(up_[j]); - gate_[i] = Tdata(x * sigmoid(x) * y); +static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); } -constexpr static int BLOCK_SIZE = 1024; +void swiglu_nv_gpu_f16(SwiGLUCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { -void swiglu_nv_gpu_f16(Tensor gate, Tensor up, void *stream) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); + auto seq_len = desc->seq_len, + di = desc->di; - auto seq_len = gate.layout->shape[0], - di = gate.layout->shape[1]; + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; - dim3 block_dims = gcd(BLOCK_SIZE, di); + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); dim3 grid_dims = dim3(di / block_dims.x, seq_len); - auto gate_ptr = reinterpret_cast(gate.data); - auto up_ptr = reinterpret_cast(up.data); + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); auto cuda_stream = reinterpret_cast(stream); swiglu<<>>( - gate_ptr, gate.layout->strides[0] / 2, up_ptr, up.layout->strides[0] / 2); + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + +infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + swiglu_nv_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/swiglu/cuda/swiglu.cuh 
b/src/ops/swiglu/cuda/swiglu.cuh index 617ecff9..9b3bdcb5 100644 --- a/src/ops/swiglu/cuda/swiglu.cuh +++ b/src/ops/swiglu/cuda/swiglu.cuh @@ -1,12 +1,36 @@ -#ifndef __NV_GPU_SWIGLU_H__ -#define __NV_GPU_SWIGLU_H__ - +#ifndef __CUDA_SWIGLU_H__ +#define __CUDA_SWIGLU_H__ +#include "../../../devices/cuda/cuda_handle.h" +#include "../../utils.h" #include "operators.h" -struct SwigluCudaDescriptor { +struct SwiGLUCudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; }; -void swiglu_nv_gpu_f16(Tensor gate, Tensor up, void *stream); +typedef struct SwiGLUCudaDescriptor *SwiGLUCudaDescriptor_t; + +infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle, + SwiGLUCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cudaDestroySwiGLUDescriptor(SwiGLUCudaDescriptor_t desc); + +void swiglu_nv_gpu_f16(SwiGLUCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); #endif// __NV_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/cuda/swiglu_cuda.cc b/src/ops/swiglu/cuda/swiglu_cuda.cc new file mode 100644 index 00000000..16d70503 --- /dev/null +++ b/src/ops/swiglu/cuda/swiglu_cuda.cc @@ -0,0 +1,51 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "swiglu.cuh" + +infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle, + SwiGLUCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUCudaDescriptor{DevNvGpu, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroySwiGLUDescriptor(SwiGLUCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/maca/swiglu_maca.cc b/src/ops/swiglu/maca/swiglu_maca.cc new file mode 100644 index 00000000..71c2af70 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.cc @@ -0,0 +1,51 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "swiglu_maca.h" + +infiniopStatus_t macaCreateSwiGLUDescriptor(MacaHandle_t handle, + SwiGLUMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || 
b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUMacaDescriptor{DevMetaxGpu, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroySwiGLUDescriptor(SwiGLUMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/maca/swiglu_maca.h b/src/ops/swiglu/maca/swiglu_maca.h new file mode 100644 index 00000000..3ea7c661 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.h @@ -0,0 +1,36 @@ +#ifndef __MACA_SWIGLU_H__ +#define __MACA_SWIGLU_H__ +#include "../../../devices/maca/maca_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct SwiGLUMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUMacaDescriptor *SwiGLUMacaDescriptor_t; + +infiniopStatus_t macaCreateSwiGLUDescriptor(MacaHandle_t handle, + SwiGLUMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t macaSwiGLU(SwiGLUMacaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t macaDestroySwiGLUDescriptor(SwiGLUMacaDescriptor_t desc); + +void swiglu_mc_gpu_f16(SwiGLUMacaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); + +#endif// __MC_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/maca/swiglu_maca.maca b/src/ops/swiglu/maca/swiglu_maca.maca new file mode 100644 index 00000000..68692c04 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.maca @@ -0,0 +1,70 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "swiglu_maca.h" +#include + +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); +} + +inline int gcd(int a, int b) { + while (b != 0) { + int rem = a % b; + a = b; + b = rem; + } + return a; +} + +template +static __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); +} + +void swiglu_mc_gpu_f16(SwiGLUMacaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; + + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); + dim3 grid_dims = dim3(di / block_dims.x, seq_len); + + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); + + auto maca_stream = reinterpret_cast(stream); + + swiglu<<>>( + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + 
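All of the per-device kernels touched by this patch (CPU, CUDA, BANG, Ascend, MACA, MUSA) sit behind the same create/compute/destroy entry points dispatched in `src/ops/swiglu/operator.cc` further below. The following is a minimal sketch of how a caller drives them, assuming a valid `infiniopHandle_t` already exists, that `a`, `b`, `c` point to `seq_len × di` half-precision buffers on the target device, and that `F16` is the `DataLayout` constant from `data_type.h` (its definition is not shown in this hunk); include paths are the ones used inside the repository.

```c
#include "ops/swiglu/swiglu.h"        /* infiniopCreateSwiGLUDescriptor, infiniopSwiGLU, ... */
#include "tensor/tensor_descriptor.h" /* infiniopCreateTensorDescriptor, ... */
#include <stddef.h>
#include <stdint.h>

infiniopStatus_t run_swiglu(infiniopHandle_t handle,
                            void *c, void const *a, void const *b,
                            uint64_t seq_len, uint64_t di, void *stream) {
    uint64_t shape[2] = {seq_len, di};
    infiniopTensorDescriptor_t a_desc, b_desc, c_desc;

    /* NULL strides ask the library to fill in contiguous row-major strides. */
    infiniopCreateTensorDescriptor(&a_desc, 2, shape, NULL, F16);
    infiniopCreateTensorDescriptor(&b_desc, 2, shape, NULL, F16);
    infiniopCreateTensorDescriptor(&c_desc, 2, shape, NULL, F16);

    infiniopSwiGLUDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateSwiGLUDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status == STATUS_SUCCESS) {
        /* Computes c = a * silu(b); the CPU backend ignores stream. */
        status = infiniopSwiGLU(desc, c, a, b, stream);
        infiniopDestroySwiGLUDescriptor(desc);
    }

    infiniopDestroyTensorDescriptor(a_desc);
    infiniopDestroyTensorDescriptor(b_desc);
    infiniopDestroyTensorDescriptor(c_desc);
    return status;
}
```

The contiguous default strides satisfy the `strides[1] == 1` check that every backend's create function performs, and `strides[0]` becomes the per-row stride stored in the descriptor; for GPU backends `stream` is the device queue or stream handle passed straight through to the kernel launch.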
+infiniopStatus_t macaSwiGLU(SwiGLUMacaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + checkMacaError(hcSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + swiglu_mc_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/musa/swiglu.mu b/src/ops/swiglu/musa/swiglu.mu new file mode 100644 index 00000000..259e5c6f --- /dev/null +++ b/src/ops/swiglu/musa/swiglu.mu @@ -0,0 +1,68 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "swiglu_musa.h" +#include + +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); +} + +inline int gcd(int a, int b) { + while (b != 0) { + int rem = a % b; + a = b; + b = rem; + } + return a; +} + +template +static __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); +} + +void swiglu_mt_gpu_f16(SwiGLUMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; + + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); + dim3 grid_dims = dim3(di / block_dims.x, seq_len); + + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); + + auto musa_stream = reinterpret_cast(stream); + + swiglu<<>>( + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + +infiniopStatus_t musaSwiGLU(SwiGLUMusaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + swiglu_mt_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/musa/swiglu_musa.cc b/src/ops/swiglu/musa/swiglu_musa.cc new file mode 100644 index 00000000..a1d5719b --- /dev/null +++ b/src/ops/swiglu/musa/swiglu_musa.cc @@ -0,0 +1,50 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "swiglu_musa.h" + +infiniopStatus_t musaCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUMusaDescriptor{DevMthreadsGpu, + dtype, + seq_len, + di, + stride_a, + stride_b, + 
stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroySwiGLUDescriptor(SwiGLUMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/musa/swiglu_musa.h b/src/ops/swiglu/musa/swiglu_musa.h new file mode 100644 index 00000000..00ae1155 --- /dev/null +++ b/src/ops/swiglu/musa/swiglu_musa.h @@ -0,0 +1,34 @@ +#ifndef __MUSA_SWIGLU_H__ +#define __MUSA_SWIGLU_H__ + +#include "operators.h" + +struct SwiGLUMusaDescriptor { + Device device; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUMusaDescriptor *SwiGLUMusaDescriptor_t; + +infiniopStatus_t musaCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t musaSwiGLU(SwiGLUMusaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t musaDestroySwiGLUDescriptor(SwiGLUMusaDescriptor_t desc); + +void swiglu_mt_gpu_f16(SwiGLUMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); + +#endif// __MT_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/operator.cc b/src/ops/swiglu/operator.cc index 8f351242..3ea0bedc 100644 --- a/src/ops/swiglu/operator.cc +++ b/src/ops/swiglu/operator.cc @@ -1,4 +1,5 @@ #include "../utils.h" +#include "operators.h" #include "ops/swiglu/swiglu.h" #ifdef ENABLE_CPU @@ -9,80 +10,127 @@ #endif #ifdef ENABLE_CAMBRICON_MLU #include "bang/swiglu_bang.h" -#include "bang/swiglu_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/swiglu.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/swiglu_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/swiglu_musa.h" #endif -struct SwigluDescriptor { - Device device; -}; - -__C void *createSwigluDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, + infiniopSwiGLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + switch (handle->device) { #ifdef ENABLE_CPU - case DevCpu: - return (SwigluDescriptor *) (new SwigluCpuDescriptor{device}); + case DevCpu: + return cpuCreateSwiGLUDescriptor(handle, (SwiGLUCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (SwigluDescriptor *) (new SwigluCudaDescriptor{device}); + case DevNvGpu: + return cudaCreateSwiGLUDescriptor((CudaHandle_t) handle, (SwiGLUCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - auto bangDescriptor = new SwigluBangDescriptor(device); - bangDescriptor->createCnnlDescriptors(); - return (SwigluDescriptor *) (bangDescriptor); - } + case DevCambriconMlu: { + return bangCreateSwiGLUDescriptor((BangHandle_t) handle, + (SwiGLUBangDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendCreateSwiGLUDescriptor((AscendHandle_t) handle, + (SwiGLUAscendDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateSwiGLUDescriptor((MacaHandle_t) handle, + (SwiGLUMacaDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaCreateSwiGLUDescriptor(handle, 
(SwiGLUMusaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; }; -__C void destroySwigluDescriptor(SwigluDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (SwigluCpuDescriptor *) (descriptor); - break; + return cpuSwiGLU((SwiGLUCpuDescriptor_t) desc, c, a, b, stream); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - delete (SwigluCudaDescriptor *) (descriptor); - break; + return cudaSwiGLU((SwiGLUCudaDescriptor_t) desc, c, a, b, stream); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = (SwigluBangDescriptor *) (descriptor); - bangDescriptor->destroyCnnlDescriptors(); - delete bangDescriptor; - break; + return bangSwiGLU((SwiGLUBangDescriptor_t) desc, c, a, b, stream); } #endif - default: - PANIC(UnsupportedDevice); +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: + return macaSwiGLU((SwiGLUMacaDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaSwiGLU((SwiGLUMusaDescriptor_t) desc, c, a, b, stream); +#endif } + return STATUS_BAD_DEVICE; } -__C void swiglu(SwigluDescriptor *descriptor, Tensor gate, Tensor up, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - swiglu_cpu_f16(gate, up); - break; + return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - swiglu_nv_gpu_f16(gate, up, stream); - break; + return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t) desc); #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // swiglu_cnnl_f16((SwigluBangDescriptor *) (descriptor), gate, up, stream); - swiglu_bang_f16(gate, up, stream); - break; + case DevCambriconMlu: { + return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: + return macaDestroySwiGLUDescriptor((SwiGLUMacaDescriptor_t) desc); +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t) desc); #endif - default: - PANIC(UnsupportedDevice); } -}; + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/utils.h b/src/ops/utils.h index 01b5e81f..b48cf419 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -1,8 +1,14 @@ #ifndef __UTILS_H__ #define __UTILS_H__ +#include "data_type.h" +#include "tensor.h" +#include +#include +#include #include #include +#include /* This file contains some useful macros and helper functions */ @@ -23,4 +29,225 @@ inline void assert_true(int expr, const char *msg, const char *file, int line) { exit(EXIT_FAILURE) #define ROUND_UP_DIV(x, y) ((x + y - 1) / y) + +#define CHECK_ERROR(call, target, errCode) \ + do { \ + if (auto value = (call); value == (target)) { \ + std::cerr << "Error: expected " << (target) \ + << " but got " << value \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return 
(errCode); \ + } \ + } while (0) + +#define CREATE_CHECK_ERROR(expr, value, target, errCode) \ + expr; \ + CHECK_ERROR(value, target, errCode) + +#define CHECK_STATUS(call, target) \ + do { \ + if (auto value = (call); value != (target)) { \ + std::cerr << "Error: expected " << (target) \ + << " but got " << value \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return value; \ + } \ + } while (0) + +// check if two data layouts (types) are equal +inline bool dtype_eq(DataLayout a, DataLayout b) { + union TypePun { + DataLayout layout; + int i; + } pun; + pun.layout = a; + auto a_ = pun.i; + pun.layout = b; + auto b_ = pun.i; + return a_ == b_; +} + +inline std::vector get_byte_strides(infiniopTensorDescriptor_t desc) { + int64_t dsize = desc->dt.size; + std::vector strides(desc->ndim); + for (uint64_t i = 0; i < desc->ndim; i++) { + strides[i] = dsize * desc->strides[i]; + } + + return strides; +} + +// calculate the broadcasted shape for two tensors +inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, + const uint64_t *shape2, uint64_t ndim2, + uint64_t *broadcast_shape, uint64_t *padded_shape1, + uint64_t *padded_shape2, uint64_t max_rank) { + // prepending and initializing + std::fill(padded_shape1, padded_shape1 + max_rank, 1); + std::fill(padded_shape2, padded_shape2 + max_rank, 1); + std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1); + std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2); + + // compute broadcasted shape + for (size_t i = 0; i < max_rank; ++i) { + if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) { + broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]); + } else { + return false; + } + } + + return true; +} + +// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c, + uint64_t broadcast_ndim) { + std::vector + broadcast_shape_(broadcast_ndim), + padded_shape1_(broadcast_ndim), + padded_shape2_(broadcast_ndim); + auto broadcast_shape = broadcast_shape_.data(), + padded_shape1 = padded_shape1_.data(), + padded_shape2 = padded_shape2_.data(); + if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) { + return false; + } + return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape); +} + +// check if the shape of tensor src can be validly broadcasted to that of the tensor dst +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) { + if (dst->ndim < src->ndim) { + return false; + } + std::vector padded_shape_(dst->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_shape, padded_shape + dst->ndim, 1); + std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim); + for (size_t i = 0; i < dst->ndim; ++i) { + if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) { + return false; + } + } + return true; +} + +// check if the shape of tensor c is valid after broadcasting tensors a and b +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) { + return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim)); +} + +inline uint64_t 
get_byte_size(infiniopTensorDescriptor_t desc) { + uint64_t dsize = desc->dt.size; + uint64_t size = 1; + for (uint64_t i = 0; i < desc->ndim; i++) { + size *= desc->shape[i]; + } + return size * dsize; +} + +// permute the dimensions of a tensor descriptor +inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector &order) { + uint64_t ndim = desc->ndim; + if (order.size() != ndim) { + return nullptr; + } + uint64_t *shape = new uint64_t[ndim]; + int64_t *strides = new int64_t[ndim]; + for (uint64_t i = 0; i < ndim; i++) { + if (std::find(order.begin(), order.end(), i) == order.end()) { + return nullptr; + } + shape[i] = desc->shape[order[i]]; + strides[i] = desc->strides[order[i]]; + } + return new TensorDescriptor{ + desc->dt, ndim, shape, strides}; +} + +// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous +inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, uint64_t dim_start, uint64_t dim_end) { + for (size_t i = dim_start + 1; i <= dim_end; i++) { + if (desc->strides[i - 1] != static_cast(desc->shape[i]) * desc->strides[i]) { + return false; + } + } + return true; +} + +inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) { + if (desc->ndim == 0) { + return true; + } + return is_contiguous(desc, 0, desc->ndim - 1); +} + +// merge the dimensions [dim_start, dim_end] of a tensor descriptor +inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uint64_t dim_start, uint64_t dim_end) { + uint64_t ndim = desc->ndim; + if (dim_start > dim_end || dim_end >= ndim) { + return nullptr; + } + + uint64_t new_ndim = ndim - (dim_end - dim_start); + uint64_t *new_shape = new uint64_t[new_ndim]; + int64_t *new_strides = new int64_t[new_ndim]; + uint64_t index = 0; + for (size_t i = 0; i < dim_start; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + if (!is_contiguous(desc, dim_start, dim_end)) { + return nullptr; + } + new_shape[index] = 1; + for (size_t i = dim_start; i <= dim_end; i++) { + new_shape[index] *= desc->shape[i]; + } + new_strides[index] = desc->strides[dim_end]; + index++; + for (size_t i = dim_end + 1; i < ndim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + return new TensorDescriptor{ + desc->dt, new_ndim, new_shape, new_strides}; +} + +// split the dimension dim of a tensor descriptor into multiple dimensions +inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector &dims) { + uint64_t ndim = desc->ndim; + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) { + return nullptr; + } + uint64_t new_ndim = ndim + dims.size() - 1; + uint64_t *new_shape = new uint64_t[new_ndim]; + int64_t *new_strides = new int64_t[new_ndim]; + uint64_t index = 0; + for (size_t i = 0; i < dim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + for (size_t i = 0; i < dims.size(); i++) { + new_shape[index] = dims[i]; + new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies()); + index++; + } + for (size_t i = dim + 1; i < ndim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + return new TensorDescriptor{ + desc->dt, new_ndim, new_shape, new_strides}; +} + #endif// __UTILS_H__ diff --git 
a/src/tensor/tensor_descriptor.cc b/src/tensor/tensor_descriptor.cc index a6397206..57afe92d 100644 --- a/src/tensor/tensor_descriptor.cc +++ b/src/tensor/tensor_descriptor.cc @@ -1,16 +1,26 @@ #include "tensor/tensor_descriptor.h" #include -__C __export void createTensorDescriptor(TensorDescriptor* desc_ptr, uint64_t ndim, uint64_t *shape_, int64_t *strides_, DataLayout datatype) { +__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, uint64_t ndim, uint64_t const *shape_, int64_t const *strides_, DataLayout datatype) { uint64_t *shape = new uint64_t[ndim]; int64_t *strides = new int64_t[ndim]; std::memcpy(shape, shape_, ndim * sizeof(uint64_t)); - std::memcpy(strides, strides_, ndim * sizeof(int64_t)); - *desc_ptr = new TensorLayout{datatype, ndim, shape, strides}; + if (strides_) { + std::memcpy(strides, strides_, ndim * sizeof(int64_t)); + } else { + int64_t dsize = 1; + for (int i = ndim - 1; i >= 0; i--) { + strides[i] = dsize; + dsize *= shape[i]; + } + } + *desc_ptr = new TensorDescriptor{datatype, ndim, shape, strides}; + return STATUS_SUCCESS; } -__C __export void destroyTensorDescriptor(TensorDescriptor desc){ +__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc) { delete[] desc->shape; delete[] desc->strides; delete desc; + return STATUS_SUCCESS; } diff --git a/xmake.lua b/xmake.lua index e508eae4..f9e6f3dc 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,4 +1,8 @@ add_rules("mode.debug", "mode.release") +-- Define color codes +local GREEN = '\27[0;32m' +local YELLOW = '\27[1;33m' +local NC = '\27[0m' -- No Color add_includedirs("include") @@ -9,6 +13,12 @@ option("cpu") add_defines("ENABLE_CPU") option_end() +option("omp") + set_default(false) + set_showmenu(true) + set_description("Enable or disable OpenMP support for cpu kernel") +option_end() + option("nv-gpu") set_default(false) set_showmenu(true) @@ -23,6 +33,36 @@ option("cambricon-mlu") add_defines("ENABLE_CAMBRICON_MLU") option_end() +option("ascend-npu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Ascend NPU kernel") + add_defines("ENABLE_ASCEND_NPU") +option_end() + +option("metax-gpu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Metax GPU kernel") + add_defines("ENABLE_METAX_GPU") +option_end() + + +option("mthreads-gpu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable MThreads GPU kernel") + add_defines("ENABLE_MTHREADS_GPU") +option_end() + +option("sugon-dcu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Sugon DCU kernel") + add_defines("ENABLE_SUGON_DCU") + add_defines("ENABLE_NV_GPU") +option_end() + if is_mode("debug") then add_cxflags("-g -O0") add_defines("DEBUG_MODE") @@ -32,6 +72,7 @@ if has_config("cpu") then add_defines("ENABLE_CPU") target("cpu") + on_install(function (target) end) set_kind("static") if not is_plat("windows") then @@ -40,32 +81,52 @@ if has_config("cpu") then set_languages("cxx17") add_files("src/devices/cpu/*.cc", "src/ops/*/cpu/*.cc") - add_cxflags("-fopenmp") - add_ldflags("-fopenmp") + if has_config("omp") then + add_cxflags("-fopenmp") + add_ldflags("-fopenmp") + end target_end() end -if has_config("nv-gpu") then - +if has_config("nv-gpu", "sugon-dcu") then add_defines("ENABLE_NV_GPU") + if has_config("sugon-dcu") then + add_defines("ENABLE_SUGON_DCU") + end + local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") + local 
CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + end + if CUDNN_ROOT ~= nil then + add_includedirs(CUDNN_ROOT .. "/include") + end + target("nv-gpu") set_kind("static") + on_install(function (target) end) set_policy("build.cuda.devlink", true) set_toolchains("cuda") add_links("cublas") + add_links("cudnn") add_cugencodes("native") if is_plat("windows") then add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "\\lib\\x64") + end else add_cuflags("-Xcompiler=-fPIC") add_culdflags("-Xcompiler=-fPIC") + add_cxxflags("-fPIC") end set_languages("cxx17") add_files("src/devices/cuda/*.cc", "src/ops/*/cuda/*.cu") + add_files("src/ops/*/cuda/*.cc") target_end() end @@ -96,7 +157,7 @@ if has_config("cambricon-mlu") then local includedirs = table.concat(target:get("includedirs"), " ") local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/neuware/include", "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"} - + for _, includedir in ipairs(target:get("includedirs")) do table.insert(args, "-I" .. includedir) end @@ -105,11 +166,11 @@ if has_config("cambricon-mlu") then table.insert(target:objectfiles(), objectfile) end) -rule_end() - + rule_end() target("cambricon-mlu") set_kind("static") + on_install(function (target) end) set_languages("cxx17") add_files("src/devices/bang/*.cc", "src/ops/*/bang/*.cc") add_files("src/ops/*/bang/*.mlu", {rule = "mlu"}) @@ -118,7 +179,162 @@ rule_end() end -target("operators") +if has_config("mthreads-gpu") then + + add_defines("ENABLE_MTHREADS_GPU") + local musa_home = os.getenv("MUSA_INSTALL_PATH") + -- Add include dirs + add_includedirs(musa_home .. "/include") + -- Add shared lib + add_linkdirs(musa_home .. "/lib") + add_links("libmusa.so") + add_links("libmusart.so") + add_links("libmudnn.so") + add_links("libmublas.so") + + rule("mu") + set_extensions(".mu") + on_load(function (target) + target:add("includedirs", "include") + end) + + on_build_file(function (target, sourcefile) + local objectfile = target:objectfile(sourcefile) + os.mkdir(path.directory(objectfile)) + + local mcc = "/usr/local/musa/bin/mcc" + local includedirs = table.concat(target:get("includedirs"), " ") + local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/musa/include", "-O3", "-fPIC", "-Wall", "-std=c++17", "-pthread"} + for _, includedir in ipairs(target:get("includedirs")) do + table.insert(args, "-I" .. includedir) + end + + os.execv(mcc, args) + table.insert(target:objectfiles(), objectfile) + end) + rule_end() + + target("mthreads-gpu") + set_kind("static") + set_languages("cxx17") + add_files("src/devices/musa/*.cc", "src/ops/*/musa/*.cc") + add_files("src/ops/*/musa/*.mu", {rule = "mu"}) + add_cxflags("-lstdc++ -Wall -fPIC") + target_end() + +end + +if has_config("ascend-npu") then + + add_defines("ENABLE_ASCEND_NPU") + local ASCEND_HOME = os.getenv("ASCEND_HOME") + local SOC_VERSION = os.getenv("SOC_VERSION") + + -- Add include dirs + add_includedirs(ASCEND_HOME .. "/include") + add_includedirs(ASCEND_HOME .. "/include/aclnn") + add_linkdirs(ASCEND_HOME .. "/lib64") + add_links("libascendcl.so") + add_links("libnnopbase.so") + add_links("libopapi.so") + add_links("libruntime.so") + add_linkdirs(ASCEND_HOME .. 
"/../../driver/lib64/driver") + add_links("libascend_hal.so") + local builddir = string.format( + "%s/build/%s/%s/%s", + os.projectdir(), + get_config("plat"), + get_config("arch"), + get_config("mode") + ) + rule("ascend-kernels") + before_link(function () + local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") + os.cd(ascend_build_dir) + os.exec("make") + os.exec("cp $(projectdir)/src/devices/ascend/build/lib/libascend_kernels.a "..builddir.."/") + os.cd(os.projectdir()) + + end) + after_clean(function () + local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") + os.cd(ascend_build_dir) + os.exec("make clean") + os.cd(os.projectdir()) + os.rm(builddir.. "/libascend_kernels.a") + + end) + rule_end() + + target("ascend-npu") + -- Other configs + set_kind("static") + set_languages("cxx17") + on_install(function (target) end) + -- Add files + add_files("src/devices/ascend/*.cc", "src/ops/*/ascend/*.cc") + add_cxflags("-lstdc++ -Wall -Werror -fPIC") + + -- Add operator + add_rules("ascend-kernels") + add_links(builddir.."/libascend_kernels.a") + + target_end() +end + +if has_config("metax-gpu") then + + add_defines("ENABLE_METAX_GPU") + local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT") + + add_includedirs(MACA_ROOT .. "/include") + add_linkdirs(MACA_ROOT .. "/lib") + -- add_linkdirs(MACA_ROOT .. "htgpu_llvm/lib") + add_links("libhcdnn.so") + add_links("libhcblas.so") + add_links("libhcruntime.so") + + rule("maca") + set_extensions(".maca") + + on_load(function (target) + target:add("includedirs", "include") + end) + + on_build_file(function (target, sourcefile) + local objectfile = target:objectfile(sourcefile) + os.mkdir(path.directory(objectfile)) + local htcc = "/opt/hpcc/htgpu_llvm/bin/htcc" + + local includedirs = table.concat(target:get("includedirs"), " ") + local args = { "-x", "hpcc", "-c", sourcefile, "-o", objectfile, "-I/opt/hpcc/include", "-O3", "-fPIC", "-Werror", "-std=c++17"} + + for _, includedir in ipairs(target:get("includedirs")) do + table.insert(args, "-I" .. 
includedir) + end + + os.execv(htcc, args) + table.insert(target:objectfiles(), objectfile) + end) + rule_end() + + target("metax-gpu") + set_kind("static") + on_install(function (target) end) + set_languages("cxx17") + add_files("src/devices/maca/*.cc", "src/ops/*/maca/*.cc") + add_files("src/ops/*/maca/*.maca", {rule = "maca"}) + add_cxflags("-lstdc++ -Werror -fPIC") + target_end() + +end + + +toolchain("sugon-dcu-linker") + set_toolset("sh", "nvcc") +toolchain_end() + +target("infiniop") set_kind("shared") if has_config("cpu") then @@ -127,44 +343,41 @@ target("operators") if has_config("nv-gpu") then add_deps("nv-gpu") end + if has_config("sugon-dcu") then + local builddir = string.format( + "build/%s/%s/%s", + get_config("plat"), + get_config("arch"), + get_config("mode") + ) + add_shflags("-s", "-shared", "-fPIC") + add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl") + -- Using -lnv-gpu will fail, manually link the target using full path + add_deps("nv-gpu", {inherit = false}) + add_links(builddir.."/libnv-gpu.a") + set_toolchains("sugon-dcu-linker") + end + if has_config("cambricon-mlu") then add_deps("cambricon-mlu") end + if has_config("ascend-npu") then + add_deps("ascend-npu") + end + if has_config("metax-gpu") then + add_deps("metax-gpu") + end + if has_config("mthreads-gpu") then + add_deps("mthreads-gpu") + end set_languages("cxx17") + add_files("src/devices/handle.cc") add_files("src/ops/*/operator.cc") add_files("src/tensor/*.cc") -target_end() + after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end) -target("main") - set_kind("binary") - add_deps("operators") + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) + add_installfiles("include/(**/*.h)", {prefixdir = "include"}) + add_installfiles("include/*.h", {prefixdir = "include"}) - set_languages("c11") - add_files("src/main.c") target_end() - -task("install-operators") - set_menu { - usage = "xmake install-operators", - description = "Build and install the operators", - options = {} - } - on_run(function () - os.exec("xmake --root") - os.exec("mkdir -p $(projectdir)/lib/") - os.exec("cp $(projectdir)/build/linux/x86_64/release/liboperators.so $(projectdir)/lib/") - os.exec("cp -r $(projectdir)/include $(projectdir)/lib/") - -- Define color codes - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - - -- Get the current directory - local current_dir = os.curdir() - - -- Output messages with colors - os.exec("echo -e '" .. GREEN .. "Compilation completed successfully." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "To set the environment variable, please run the following command:" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export INFINI_ROOT=" .. current_dir .. "/lib\" >> ~/.bashrc" .. NC .. "'") - - end)
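The tensor-descriptor change above (default strides when `strides_` is `NULL`) is easiest to see from the caller's side. The following C sketch is illustrative only and not part of the diff: it follows the argument order implemented in `src/tensor/tensor_descriptor.cc` above, and it assumes that `F32` is one of the predefined `DataLayout` constants in `data_type.h`, that `STATUS_SUCCESS` is visible through the public headers, and that the header is included as `tensor/tensor_descriptor.h` from the tree installed under `INFINI_ROOT`.

```C
#include "tensor/tensor_descriptor.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* A 2x3x4 tensor; the shape values are illustrative. */
    uint64_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;

    /* Argument order as implemented above: (desc_ptr, ndim, shape, strides, datatype).
       Passing NULL for the strides makes the library fill in contiguous row-major
       strides, i.e. {12, 4, 1} for this shape. */
    if (infiniopCreateTensorDescriptor(&desc, 3, shape, NULL, F32) != STATUS_SUCCESS) {
        fprintf(stderr, "failed to create tensor descriptor\n");
        return 1;
    }

    /* ... hand desc to the operator's descriptor-creation call ... */

    infiniopDestroyTensorDescriptor(desc);
    return 0;
}
```

The NULL-strides path covers the common contiguous case, so callers only need to pass explicit strides for non-contiguous layouts.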