diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 00000000..84108c51 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,91 @@ +name: CI + +on: + push: + branches: + - main + - dev + pull_request: + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: | + pip install numpy + pip install torch + + - name: Install xmake + uses: xmake-io/github-action-setup-xmake@v1 + with: + xmake-version: latest + + - name: configure xmake + run: xmake f --cpu=true -cv + + - name: Set INFINI_ROOT + run: | + export INFINI_ROOT=$GITHUB_WORKSPACE/.infini + mkdir -p $INFINI_ROOT + echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV + + - name: Build with XMake + run: xmake build && xmake install + + - name: Run Python Tests + run: | + GREEN='\033[0;32m' + RED='\033[0;31m' + NC='\033[0m' # No Color + + PASSED_TESTS=() + FAILED_TESTS=() + for script in operatorspy/tests/*.py; do + if [ "$(basename $script)" != "__init__.py" ] && [ "$(basename $script)" != "test_utils.py" ]; then + echo "Running $script" + START_TIME=$(date +%s) + if ! python3 $script --cpu; then + echo "$script failed" + FAILED_TESTS+=($script) + else + echo "$script passed" + PASSED_TESTS+=($script) + fi + END_TIME=$(date +%s) + DURATION=$(( END_TIME - START_TIME )) + MINUTES=$(( DURATION / 60 )) + SECONDS=$(( DURATION % 60 )) + echo "Execution time for $script: ${MINUTES}m ${SECONDS}s" + fi + done + + if [ ${#FAILED_TESTS[@]} -ne 0 ]; then + echo "The following tests passed:" + for test in "${PASSED_TESTS[@]}"; do + echo -e "${GREEN}$test${NC}" + done + echo "The following tests failed:" + for test in "${FAILED_TESTS[@]}"; do + echo -e "${RED}$test${NC}" + done + exit 1 + else + echo "The following tests passed:" + for test in "${PASSED_TESTS[@]}"; do + echo -e "${GREEN}$test${NC}" + done + echo "${GREEN}All tests passed${NC}" + fi + env: + INFINI_ROOT: ${{ env.INFINI_ROOT }} diff --git a/.gitignore b/.gitignore index 45efbbb4..024cd682 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,13 @@ __pycache__/ # Lib lib/ +out/ + +# Log +*.log + +# Cache +cache/ + +# Json +*.json diff --git a/README.md b/README.md index c2778312..674a874f 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,77 @@ -# 算子库 +# InfiniOperators 算子库 -跨平台高性能通用算子库。形式为 C 接口动态库。 +跨平台高性能统一算子库。形式为 C 接口动态库。 -采用二段式算子设计,每个算子都实现并对外暴露以下的 C 接口: +## 简介 -- 第一阶段:构造算子 Descriptor。用户提供的算子名称、硬件、以及算子配置(如计算的数据类型、计算排布等),相应模组会被 load 到硬件上。 +### 算子接口设计 + +采用3+1段式算子设计,每个算子都实现并对外暴露以下的 C 接口: + +- 第一阶段:构造硬件控柄(Handle)。用户提供控柄地址、硬件类型以及硬件序号。控柄所在的内存空间由用户管理。 ```C - void* createOpDescriptor(Device, void *config); + infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, int device, int device_id); ``` -- 第二阶段:计算。根据一阶段的 Descriptor,执行相应计算,用户需要提供输入输出张量,以及硬件计算流(CPU 为 NULL)。 +- 第二阶段:构造算子描述(Descriptor)。用户提供描述符地址、硬件控柄、以及算子涉及的张量描述(含张量数据类型、形状和步长)。这一步会完成算子所需的与张量数据无关的预计算。 ```C - void op(void *descriptor, Tensor output, Tensor input, void *stream); + infiniopStatus_t infiniopCreateOpDescriptor(infiniopHandle_t handle, infiniopOpDescriptor_t *desc_ptr, infiniopTensorDescriptor_t t, ...); ``` -- 销毁 Descriptor。 +- 第三阶段(可选):计算额外工作空间。根据算子描述,计算算子所需的额外工作空间大小,并存储于用户提供的位置。具体空间分配由用户负责。 ```C - void destroyOpDescriptor(void *descriptor); + infiniopStatus_t infiniopGetOpWorkspaceSize(infiniopOpDescriptor_t desc, uint64_t *size); ``` +- 
第四阶段:计算。根据算子描述符,在指定的硬件上执行相应计算,用户需要提供输入输出的数据,以及硬件计算流(CPU 为 NULL)。 + + ```C + infiniopStatus_t infiniopGetOp(infiniopOpDescriptor_t desc, [void *workspace, uint64_t workspace_size,] void *output_data, void *input_data, ..., void *stream); + ``` + +- 销毁描述和硬件控柄。 + + ```C + infiniopStatus_t infiniopDestroyOpDescriptor(infiniopOpDescriptor_t desc); + infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle); + ``` + +### 张量(Tensor)描述设计 + +张量描述由以下几个部分组成: + +1.数据类型,由打包大小(即一个元素代表几个数据)、符号位、元素大小、尾数位数、指数位数共4字节表示。定义如下: + +```C +typedef struct DataLayout { + unsigned short + packed : 8, + sign : 1, + size : 7, + mantissa : 8, + exponent : 8; +} DataLayout; +``` + +2.维度信息。张量有多少个维度。类型为uint64_t。 + +3.张量形状。张量每个维度的大小。类型为uint64_t*。 + +4.张量步长。张量每个维度的步长。类型为uint64_t*。 + +创建和销毁张量描述符的接口: + +```C +infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, DataLayout layout, uint64_t ndim, uint64_t *shape, uint64_t *strides); +infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc); +``` + ## 一、使用说明 -### 配置 +### 1. 配置 #### 查看当前配置 @@ -52,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv xmake f --cambricon-mlu=true -cv ``` -### 编译 +#### 配置 NPU + +````xmake +xmake f --ascend-npu=true -cv +```` + +### 2. 编译安装 ```xmake -xmake +xmake build && xmake install ``` -### 将编译好的算子库添加至环境变量 `INFINI_ROOT` +### 3. 设置环境变量 -```bash -export INFINI_ROOT=[PATH_TO_LIBRARY] -``` +按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。 -### 运行算子测试 +### 4. 运行算子测试 ```bash cd operatorspy/tests -python operator_name.py +python operator_name.py [--cpu | --cuda | --cambricon | --ascend] ``` ## 二、开发说明 @@ -82,6 +133,8 @@ python operator_name.py │   │   ├── [operator_name].h # 对外暴露的算子 C 接口定义,descriptor 定义 │   ├── tensor │   │   ├── tensor_descriptor.h # 对外暴露的张量 descriptor 定义 +│   ├── handle +│   │   ├── handle_export.h # 对外暴露的硬件 handle 定义 │   ├── *.h # 对外暴露的核心结构体定义 ├── src │   ├── devices @@ -105,7 +158,7 @@ python operator_name.py - 在 `src/device.h` 和 `operatorspy/devices.py` 中增加新的硬件类型,注意两者需要一一对应; - 在 `xmake.lua` 中增加新硬件的编译选项以及编译方式; -- 在 `src/ops/devices/[device_name]` 下编写特定硬件的通用代码; +- 在 `src/ops/devices/[device_name]` 下编写特定硬件的handle实现和通用代码; - 实现该硬件的算子; ### 增加新的算子 diff --git a/include/data_type.h b/include/data_type.h index 7767693f..e2f24c4f 100644 --- a/include/data_type.h +++ b/include/data_type.h @@ -8,8 +8,28 @@ typedef struct DataLayout { size : 7, mantissa : 8, exponent : 8; + +#ifdef __cplusplus + bool operator==(const DataLayout &other) const { + union TypePun { + DataLayout layout; + unsigned int i; + } pun; + pun.layout = *this; + auto a_ = pun.i; + pun.layout = other; + auto b_ = pun.i; + return a_ == b_; + } + + bool operator!=(const DataLayout &other) const { + return !(*this == other); + } +#endif } DataLayout; +typedef struct DataLayout DT; + // clang-format off const static struct DataLayout I8 = {1, 1, 1, 7, 0}, diff --git a/include/device.h b/include/device.h index d7f714e0..bdeb1dc9 100644 --- a/include/device.h +++ b/include/device.h @@ -2,9 +2,14 @@ #define __DEVICE_H__ enum DeviceEnum { - DevCpu, - DevNvGpu, - DevCambriconMlu, + DevCpu = 0, + DevNvGpu = 1, + DevCambriconMlu = 2, + DevAscendNpu = 3, + DevMetaxGpu = 4, + DevMthreadsGpu = 5, }; +typedef enum DeviceEnum Device; + #endif// __DEVICE_H__ diff --git a/include/handle.h b/include/handle.h new file mode 100644 index 00000000..d4eeee28 --- /dev/null +++ b/include/handle.h @@ -0,0 +1,12 @@ +#ifndef INFINIOP_HANDLE_H +#define INFINIOP_HANDLE_H + +#include "device.h" + +typedef struct HandleStruct { + Device device; 
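+    // device type associated with this handle (the Device value passed to infiniopCreateHandle)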
+} HandleStruct; + +typedef HandleStruct *infiniopHandle_t; + +#endif diff --git a/include/handle/handle_export.h b/include/handle/handle_export.h new file mode 100644 index 00000000..e6f38cf9 --- /dev/null +++ b/include/handle/handle_export.h @@ -0,0 +1,12 @@ +#ifndef INFINIOP_HANDLE_EXPORT_H +#define INFINIOP_HANDLE_EXPORT_H +#include "../status.h" +#include "../handle.h" +#include "../export.h" +#include "../device.h" + +__C __export infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id); + +__C __export infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle); + +#endif // INFINIOP_HANDLE_EXPORT_H diff --git a/include/infini_operators.h b/include/infini_operators.h index 1167c037..9a5a2555 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -1,6 +1,18 @@ +#include "handle/handle_export.h" +#include "ops/add/add.h" +#include "ops/attention/attention.h" +#include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" +#include "ops/global_avg_pool/global_avg_pool.h" +#include "ops/expand/expand.h" +#include "ops/gemm/gemm.h" +#include "ops/conv/conv.h" #include "ops/matmul/matmul.h" -#include "ops/reform/reform.h" +#include "ops/max_pool/max_pool.h" +#include "ops/mlp/mlp.h" +#include "ops/random_sample/random_sample.h" +#include "ops/rearrange/rearrange.h" +#include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" diff --git a/include/operators.h b/include/operators.h index 1a57a88c..989a1602 100644 --- a/include/operators.h +++ b/include/operators.h @@ -1,11 +1,9 @@ #ifndef __OPERATORS_H__ #define __OPERATORS_H__ -#include "data_type.h" #include "device.h" #include "tensor.h" - -typedef enum DeviceEnum Device; -typedef struct DataLayout DT; +#include "handle.h" +#include "status.h" #endif// __OPERATORS_H__ diff --git a/include/ops/add/add.h b/include/ops/add/add.h new file mode 100644 index 00000000..70da8cd2 --- /dev/null +++ b/include/ops/add/add.h @@ -0,0 +1,27 @@ +#ifndef ADD_H +#define ADD_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct AddDescriptor { + Device device; +} AddDescriptor; + +typedef AddDescriptor *infiniopAddDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); + +#endif diff --git a/include/ops/attention/attention.h b/include/ops/attention/attention.h new file mode 100644 index 00000000..913ca792 --- /dev/null +++ b/include/ops/attention/attention.h @@ -0,0 +1,39 @@ +#ifndef ATTENTION_H +#define ATTENTION_H + +#include "../../export.h" +#include "../../operators.h" +#include "../matmul/matmul.h" +#include "../swiglu/swiglu.h" + +typedef struct AttentionDescriptor { + Device device; +} AttentionDescriptor; + +typedef AttentionDescriptor *infiniopAttentionDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle, + infiniopAttentionDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t q_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, + 
infiniopTensorDescriptor_t k_cache_desc, + infiniopTensorDescriptor_t v_cache_desc, + uint64_t pos); + +__C __export infiniopStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *out, + void const *q, + void const *k, + void const *v, + void *k_cache, + void *v_cache, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc); +#endif diff --git a/include/ops/avg_pool/avg_pool.h b/include/ops/avg_pool/avg_pool.h new file mode 100644 index 00000000..39a4ce3c --- /dev/null +++ b/include/ops/avg_pool/avg_pool.h @@ -0,0 +1,28 @@ +#ifndef AVG_POOL_H +#define AVG_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct AvgPoolDescriptor { + Device device; +} AvgPoolDescriptor; +typedef AvgPoolDescriptor *infiniopAvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); +#endif diff --git a/include/ops/causal_softmax/causal_softmax.h b/include/ops/causal_softmax/causal_softmax.h index 9607374b..86c700f0 100644 --- a/include/ops/causal_softmax/causal_softmax.h +++ b/include/ops/causal_softmax/causal_softmax.h @@ -4,11 +4,25 @@ #include "../../export.h" #include "../../operators.h" -typedef struct CausalSoftmaxDescriptor CausalSoftmaxDescriptor; +typedef struct CausalSoftmaxDescriptor { + Device device; +} CausalSoftmaxDescriptor; -__C __export CausalSoftmaxDescriptor *createCausalSoftmaxDescriptor(Device, void *config); -__C __export void destroyCausalSoftmaxDescriptor(CausalSoftmaxDescriptor *descriptor); -__C __export void causalSoftmax(CausalSoftmaxDescriptor *descriptor, Tensor y, void *stream); +typedef CausalSoftmaxDescriptor *infiniopCausalSoftmaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(infiniopHandle_t handle, + infiniopCausalSoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +__C __export infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc); #endif diff --git a/include/ops/conv/conv.h b/include/ops/conv/conv.h new file mode 100644 index 00000000..12e1b289 --- /dev/null +++ b/include/ops/conv/conv.h @@ -0,0 +1,30 @@ +#ifndef CONV_H +#define CONV_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ConvDescriptor { + Device device; +} ConvDescriptor; + +typedef ConvDescriptor *infiniopConvDescriptor_t; + +__C __export infiniopStatus_t 
infiniopCreateConvDescriptor(infiniopHandle_t handle, + infiniopConvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void *pads, + void *strides, + void *dilations, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream); + +__C __export infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc); + + +#endif diff --git a/include/ops/expand/expand.h b/include/ops/expand/expand.h new file mode 100644 index 00000000..ee28b70c --- /dev/null +++ b/include/ops/expand/expand.h @@ -0,0 +1,25 @@ +#ifndef EXPAND_H +#define EXPAND_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ExpandDescriptor { + Device device; +} ExpandDescriptor; + +typedef ExpandDescriptor *infiniopExpandDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateExpandDescriptor(infiniopHandle_t handle, + infiniopExpandDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniopStatus_t infiniopExpand(infiniopExpandDescriptor_t desc, + void *y, + void const *x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc); + +#endif diff --git a/include/ops/gemm/gemm.h b/include/ops/gemm/gemm.h new file mode 100644 index 00000000..a6eac566 --- /dev/null +++ b/include/ops/gemm/gemm.h @@ -0,0 +1,36 @@ +#ifndef GEMM_H +#define GEMM_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GEMMDescriptor { + Device device; +} GEMMDescriptor; + +typedef GEMMDescriptor *infiniopGEMMDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t handle, + infiniopGEMMDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t c_desc, + float alpha, + float beta, + char transA, + char transB); + +__C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopGEMM(infiniopGEMMDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *a, + void const *b, + void const *c, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGEMMDescriptor(infiniopGEMMDescriptor_t desc); +#endif diff --git a/include/ops/global_avg_pool/global_avg_pool.h b/include/ops/global_avg_pool/global_avg_pool.h new file mode 100644 index 00000000..ba839ecc --- /dev/null +++ b/include/ops/global_avg_pool/global_avg_pool.h @@ -0,0 +1,26 @@ +#ifndef GLOBAL_AVG_POOL_H +#define GLOBAL_AVG_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GlobalAvgPoolDescriptor { + Device device; +} GlobalAvgPoolDescriptor; + +typedef GlobalAvgPoolDescriptor *infiniopGlobalAvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle, + infiniopGlobalAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc, 
+ void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc); + +#endif diff --git a/include/ops/matmul/matmul.h b/include/ops/matmul/matmul.h index 6c80d761..67285683 100644 --- a/include/ops/matmul/matmul.h +++ b/include/ops/matmul/matmul.h @@ -4,12 +4,30 @@ #include "../../export.h" #include "../../operators.h" -typedef struct MatmulDescriptor MatmulDescriptor; +typedef struct MatmulDescriptor { + Device device; +} MatmulDescriptor; -__C __export MatmulDescriptor *createMatmulDescriptor(Device, void *config); +typedef MatmulDescriptor *infiniopMatmulDescriptor_t; -__C __export void destroyMatmulDescriptor(MatmulDescriptor *descriptor); +__C __export infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, + infiniopMatmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); -__C __export void matmul(MatmulDescriptor *descriptor, Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); +__C __export infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc); #endif diff --git a/include/ops/max_pool/max_pool.h b/include/ops/max_pool/max_pool.h new file mode 100644 index 00000000..8828c2c5 --- /dev/null +++ b/include/ops/max_pool/max_pool.h @@ -0,0 +1,28 @@ +#ifndef MAX_POOL_H +#define MAX_POOL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MaxPoolDescriptor { + Device device; +} MaxPoolDescriptor; +typedef MaxPoolDescriptor *infiniopMaxPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n); + +__C __export infiniopStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); +#endif diff --git a/include/ops/mlp/mlp.h b/include/ops/mlp/mlp.h new file mode 100644 index 00000000..9c4c7dd2 --- /dev/null +++ b/include/ops/mlp/mlp.h @@ -0,0 +1,36 @@ +#ifndef MLP_H +#define MLP_H + +#include "../../export.h" +#include "../../operators.h" +#include "../matmul/matmul.h" +#include "../swiglu/swiglu.h" + +typedef struct MLPDescriptor { + Device device; +} MLPDescriptor; + +typedef MLPDescriptor *infiniopMLPDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle, + infiniopMLPDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w12_desc, + infiniopTensorDescriptor_t w3_desc, + float alpha, + char residual); + +__C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size); + +__C 
__export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w12, + void const *w3, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc); +#endif diff --git a/include/ops/random_sample/random_sample.h b/include/ops/random_sample/random_sample.h new file mode 100644 index 00000000..e48cb7cc --- /dev/null +++ b/include/ops/random_sample/random_sample.h @@ -0,0 +1,31 @@ +#ifndef RANDOM_SAMPLE_H +#define RANDOM_SAMPLE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct RandomSampleDescriptor { + Device device; +} RandomSampleDescriptor; + +typedef RandomSampleDescriptor *infiniopRandomSampleDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); + +__C __export infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc); + + +#endif diff --git a/include/ops/rearrange/rearrange.h b/include/ops/rearrange/rearrange.h new file mode 100644 index 00000000..742c4696 --- /dev/null +++ b/include/ops/rearrange/rearrange.h @@ -0,0 +1,20 @@ +#ifndef REARRANGE_H +#define REARRANGE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct RearrangeDescriptor { + Device device; +} RearrangeDescriptor; +typedef RearrangeDescriptor *infiniopRearrangeDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRearrangeDescriptor(infiniopHandle_t handle, + infiniopRearrangeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +__C __export infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc); +#endif diff --git a/include/ops/reform/reform.h b/include/ops/reform/reform.h deleted file mode 100644 index 1a2af372..00000000 --- a/include/ops/reform/reform.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef REFORM_H -#define REFORM_H - -#include "../../export.h" -#include "../../operators.h" -typedef struct ReformDescriptor ReformDescriptor; - -__C __export ReformDescriptor *createReformDescriptor(Device, void *config); -__C __export void destroyReformDescriptor(ReformDescriptor *descriptor); -__C __export void reform(ReformDescriptor *descriptor, Tensor y, Tensor x, void *stream); - -#endif diff --git a/include/ops/relu/relu.h b/include/ops/relu/relu.h new file mode 100644 index 00000000..9f639b9b --- /dev/null +++ b/include/ops/relu/relu.h @@ -0,0 +1,25 @@ +#ifndef RELU_H +#define RELU_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReluDescriptor { + Device device; +} ReluDescriptor; + +typedef ReluDescriptor *infiniopReluDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle, + infiniopReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t 
x); + +__C __export infiniopStatus_t infiniopRelu(infiniopReluDescriptor_t desc, + void *y, + void const *x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc); + +#endif diff --git a/include/ops/rms_norm/rms_norm.h b/include/ops/rms_norm/rms_norm.h index 71aeffbc..19dc8ad5 100644 --- a/include/ops/rms_norm/rms_norm.h +++ b/include/ops/rms_norm/rms_norm.h @@ -4,10 +4,25 @@ #include "../../export.h" #include "../../operators.h" -typedef struct RMSNormDescriptor RMSNormDescriptor; +typedef struct RMSNormDescriptor { + Device device; +} RMSNormDescriptor; -__C __export void *createRMSNormDescriptor(Device, void *config); -__C __export void destroyRMSNormDescriptor(RMSNormDescriptor *descriptor); -__C __export void rmsNorm(RMSNormDescriptor *descriptor, Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +typedef RMSNormDescriptor *infiniopRMSNormDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateRMSNormDescriptor( + infiniopHandle_t handle, + infiniopRMSNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +__C __export infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream); + +__C __export infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc); #endif diff --git a/include/ops/rotary_embedding/rotary_embedding.h b/include/ops/rotary_embedding/rotary_embedding.h index 103b3101..48b85bdd 100644 --- a/include/ops/rotary_embedding/rotary_embedding.h +++ b/include/ops/rotary_embedding/rotary_embedding.h @@ -4,10 +4,29 @@ #include "../../export.h" #include "../../operators.h" -typedef struct RotaryEmbeddingDescriptor RotaryEmbeddingDescriptor; +typedef struct RoPEDescriptor RoPEDescriptor; +typedef RoPEDescriptor *infiniopRoPEDescriptor_t; -__C __export void *createRotaryEmbeddingDescriptor(Device, void *config); -__C __export void destroyRotaryEmbeddingDescriptor(RotaryEmbeddingDescriptor *descriptor); -__C __export void rotaryEmbedding(RotaryEmbeddingDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream); +__C __export infiniopStatus_t infiniopCreateRoPEDescriptor( + infiniopHandle_t handle, + infiniopRoPEDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +__C __export infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopRoPE( + infiniopRoPEDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc); #endif diff --git a/include/ops/swiglu/swiglu.h b/include/ops/swiglu/swiglu.h index b181ef87..58ae73b6 100644 --- a/include/ops/swiglu/swiglu.h +++ b/include/ops/swiglu/swiglu.h @@ -4,10 +4,24 @@ #include "../../export.h" #include "../../operators.h" -typedef struct SwigluDescriptor SwigluDescriptor; +typedef struct SwiGLUDescriptor { + Device device; +} SwiGLUDescriptor; -__C __export void *createSwigluDescriptor(Device, void *config); -__C __export 
void destroySwigluDescriptor(SwigluDescriptor *descriptor); -__C __export void swiglu(SwigluDescriptor *descriptor, Tensor gate, Tensor up, void *stream); +typedef SwiGLUDescriptor *infiniopSwiGLUDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, + infiniopSwiGLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +__C __export infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc); #endif diff --git a/include/status.h b/include/status.h new file mode 100644 index 00000000..54acb02a --- /dev/null +++ b/include/status.h @@ -0,0 +1,16 @@ +#ifndef INFINIOP_STATUS_H +#define INFINIOP_STATUS_H + +typedef enum { + STATUS_SUCCESS = 0, + STATUS_EXECUTION_FAILED = 1, + STATUS_BAD_PARAM = 2, + STATUS_BAD_TENSOR_DTYPE = 3, + STATUS_BAD_TENSOR_SHAPE = 4, + STATUS_BAD_TENSOR_STRIDES = 5, + STATUS_MEMORY_NOT_ALLOCATED = 6, + STATUS_INSUFFICIENT_WORKSPACE = 7, + STATUS_BAD_DEVICE = 8, +} infiniopStatus_t; + +#endif diff --git a/include/tensor.h b/include/tensor.h index abe51434..3cc28922 100644 --- a/include/tensor.h +++ b/include/tensor.h @@ -4,20 +4,17 @@ #include "data_type.h" #include -struct TensorLayout { - struct DataLayout dt; +struct TensorDescriptor { + // Datatype + DT dt; + // Number of dimensions uint64_t ndim; + // Shape of the tensor, ndim elements uint64_t *shape; + // Stride of each dimension in elements, ndim elements int64_t *strides; }; -typedef struct TensorLayout *TensorDescriptor; - -struct TensorTuple { - TensorDescriptor const layout; - void *data; -}; - -typedef struct TensorTuple Tensor; +typedef struct TensorDescriptor *infiniopTensorDescriptor_t; #endif// __TENSOR_H__ diff --git a/include/tensor/tensor_descriptor.h b/include/tensor/tensor_descriptor.h index 87b4dd94..2fb9fc1d 100644 --- a/include/tensor/tensor_descriptor.h +++ b/include/tensor/tensor_descriptor.h @@ -3,9 +3,10 @@ #include "../export.h" #include "../tensor.h" +#include "../status.h" -__C __export void createTensorDescriptor(TensorDescriptor* desc_ptr, uint64_t ndim, uint64_t *shape_, int64_t *strides_, DataLayout datatype); +__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, uint64_t ndim, uint64_t const *shape_, int64_t const *strides_, DataLayout datatype); -__C __export void destroyTensorDescriptor(TensorDescriptor desc); +__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc); #endif// TENSOR_DESCRIPTOR_H diff --git a/operatorspy/__init__.py b/operatorspy/__init__.py index f4935b7f..abb67be9 100644 --- a/operatorspy/__init__.py +++ b/operatorspy/__init__.py @@ -1,5 +1,7 @@ import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))) -from .liboperators import open_lib, to_tensor, CTensor +from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t from .devices import DeviceEnum +from .utils import * +from .data_layout import * diff --git a/operatorspy/devices.py b/operatorspy/devices.py index 446bc37f..23bd2a5c 100644 --- a/operatorspy/devices.py +++ b/operatorspy/devices.py @@ -2,3 +2,6 @@ class DeviceEnum: DEVICE_CPU = 0 DEVICE_CUDA = 1 DEVICE_BANG = 2 + DEVICE_ASCEND = 3 + DEVICE_MACA = 4 + DEVICE_MUSA = 5 diff --git 
a/operatorspy/liboperators.py b/operatorspy/liboperators.py index 80bb640f..0909c0cf 100644 --- a/operatorspy/liboperators.py +++ b/operatorspy/liboperators.py @@ -1,35 +1,50 @@ import os import platform import ctypes -from ctypes import c_void_p, c_int, c_int64, c_uint64, Structure, POINTER +from ctypes import c_int, c_int64, c_uint64, Structure, POINTER from .data_layout import * +from .devices import * Device = c_int Optype = c_int -LIB_OPERATORS_DIR = "INFINI_ROOT" +LIB_OPERATORS_DIR = os.path.join(os.environ.get("INFINI_ROOT"), "lib") -class TensorLayout(Structure): +class TensorDescriptor(Structure): _fields_ = [ ("dt", DataLayout), ("ndim", c_uint64), ("shape", POINTER(c_uint64)), - ("pattern", POINTER(c_int64)), + ("strides", POINTER(c_int64)), ] + def invalidate(self): + for i in range(self.ndim): + self.shape[i] = 0 + self.strides[i] = 0 -TensorDescriptor = ctypes.POINTER(TensorLayout) +infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor) -class CTensor(Structure): - _fields_ = [("layout", TensorDescriptor), ("data", c_void_p)] + +class CTensor: + def __init__(self, desc, data): + self.descriptor = desc + self.data = data + + +class Handle(Structure): + _fields_ = [("device", c_int)] + + +infiniopHandle_t = POINTER(Handle) # Open operators library def open_lib(): def find_library_in_ld_path(library_name): - ld_library_path = os.environ.get(LIB_OPERATORS_DIR, "") + ld_library_path = LIB_OPERATORS_DIR paths = ld_library_path.split(os.pathsep) for path in paths: full_path = os.path.join(path, library_name) @@ -39,64 +54,25 @@ def find_library_in_ld_path(library_name): system_name = platform.system() # Load the library - if system_name == 'Windows': - library_path = find_library_in_ld_path("operators.dll") - elif system_name == 'Linux': - library_path = find_library_in_ld_path("liboperators.so") + if system_name == "Windows": + library_path = find_library_in_ld_path("infiniop.dll") + elif system_name == "Linux": + library_path = find_library_in_ld_path("libinfiniop.so") assert ( library_path is not None - ), f"Cannot find operators.dll or liboperators.so. Check if {LIB_OPERATORS_DIR} is set correctly." + ), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly." 
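+    # Load the shared library; the argtypes/restype registrations that follow tell ctypes the
+    # C signatures from include/, so 64-bit sizes and pointers are marshalled correctly.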
lib = ctypes.CDLL(library_path) - lib.createTensorDescriptor.argtypes = [ - POINTER(POINTER(TensorLayout)), + lib.infiniopCreateTensorDescriptor.argtypes = [ + POINTER(infiniopTensorDescriptor_t), c_uint64, POINTER(c_uint64), POINTER(c_int64), DataLayout, ] - return lib + lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int] + lib.infiniopCreateHandle.restype = c_int + lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t] + lib.infiniopDestroyHandle.restype = c_int - -# Convert PyTorch tensor to library Tensor -def to_tensor(tensor, lib, shape = None, strides = None): - import torch - - ndim = tensor.ndimension() - if shape is None: - shape = (ctypes.c_uint64 * ndim)(*tensor.shape) - else: - shape = (ctypes.c_uint64 * ndim)(*shape) - # Get strides in bytes - if strides is None: - strides = (ctypes.c_int64 * ndim)( - *(s * tensor.element_size() for s in tensor.stride()) - ) - else: - strides = (ctypes.c_int64 * ndim)(*strides) - data_ptr = tensor.data_ptr() - # fmt: off - dt = ( - I8 if tensor.dtype == torch.int8 else - I16 if tensor.dtype == torch.int16 else - I32 if tensor.dtype == torch.int32 else - I64 if tensor.dtype == torch.int64 else - U8 if tensor.dtype == torch.uint8 else - F16 if tensor.dtype == torch.float16 else - BF16 if tensor.dtype == torch.bfloat16 else - F32 if tensor.dtype == torch.float32 else - F64 if tensor.dtype == torch.float64 else - # TODO: These following types may not be supported by older - # versions of PyTorch. - U16 if tensor.dtype == torch.uint16 else - U32 if tensor.dtype == torch.uint32 else - U64 if tensor.dtype == torch.uint64 else - None - ) - # fmt: on - assert dt is not None - # Create TensorDecriptor - tensor_desc = TensorDescriptor() - lib.createTensorDescriptor(ctypes.byref(tensor_desc), ndim, shape, strides, dt) - # Create Tensor - return CTensor(tensor_desc, ctypes.c_void_p(data_ptr)) + return lib diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py new file mode 100644 index 00000000..da9c58c9 --- /dev/null +++ b/operatorspy/tests/add.py @@ -0,0 +1,180 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +class AddDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAddDescriptor_t = POINTER(AddDescriptor) + + +def add(x, y): + return torch.add(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + ans = add(a, b) + + a_tensor = to_tensor(a, lib) + 
b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopAddDescriptor_t() + + check_error( + lib.infiniopCreateAddDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + c_tensor.descriptor.contents.invalidate() + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) + ) + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyAddDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "musa", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "musa", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), + ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateAddDescriptor.restype = c_int32 
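+    # Argument and return types must mirror include/ops/add/add.h so ctypes marshals the calls
+    # correctly (the c_int32 restype carries the infiniopStatus_t value).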
+ lib.infiniopCreateAddDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAddDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopAdd.restype = c_int32 + lib.infiniopAdd.argtypes = [ + infiniopAddDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAddDescriptor.restype = c_int32 + lib.infiniopDestroyAddDescriptor.argtypes = [ + infiniopAddDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/attention.py b/operatorspy/tests/attention.py new file mode 100644 index 00000000..f5449aaa --- /dev/null +++ b/operatorspy/tests/attention.py @@ -0,0 +1,417 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args +import torch +import torch.nn.functional as F + + +class AttentionDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor) + + +def causal_softmax(x): + type = x.dtype + mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) + y = x.clone() + masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32)) + return torch.nn.functional.softmax(masked, dim=-1).to(type) + + +def attention(q, k, v, k_cache, v_cache, pos): + type = q.dtype + + n_q_head = q.shape[0] + n_kv_head = k.shape[0] + + # Concatenate key and value caches + k_cache = k_cache[:, :pos, :] # (n_kv_head, pos, head_dim) + v_cache = v_cache[:, :pos, :] # (n_kv_head, pos, head_dim) + k = torch.cat([k_cache, k], dim=1) # (n_kv_head, total_seq_len, head_dim) + v = torch.cat([v_cache, v], dim=1) # (n_kv_head, total_seq_len, head_dim) + + total_seq_len = k.shape[1] + + head_dim = v.shape[-1] + + if n_q_head != n_kv_head: + q = q.reshape( + n_kv_head, -1, head_dim + ) # (n_kv_head, n_group * seq_len, head_dim) + + # Scaled dot-product attention + attn_scores = ( + torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32)) + .to(type) + .reshape(n_q_head, -1, total_seq_len) + ) # (n_q_head, seq_len, total_seq_len) + attn_scores = attn_scores / (head_dim**0.5) + + attn_weights = causal_softmax(attn_scores).reshape( + n_kv_head, -1, total_seq_len + ) # (n_kv_head, seq_len, total_seq_len) + + # Weighted sum of values + attn_output = ( + torch.einsum( + "hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32) + ) + .to(type) + .reshape(n_q_head, -1, head_dim) + .permute(1, 0, 2) + ) # ([seq_len, n_q_head, head_dim]) + + return attn_output + + +def test( + lib, + handle, + torch_device, + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype=torch.float16, + q_stride=None, + k_stride=None, + v_stride=None, + k_cache_stride=None, + v_cache_stride=None, +): + print( + f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} 
seq_len:{seq_len} head_dim:{head_dim} pos:{pos} " + f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}" + ) + + out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device) + q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 + k_cache = ( + torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device) + * 0.1 + ) + v_cache = ( + torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device) + * 0.1 + ) + + ans = attention(q, k, v, k_cache, v_cache, pos) + + if q_stride is not None: + q = rearrange_tensor(q, q_stride) + if k_stride is not None: + k = rearrange_tensor(k, k_stride) + if v_stride is not None: + v = rearrange_tensor(v, v_stride) + if k_cache_stride is not None: + k_cache = rearrange_tensor(k_cache, k_cache_stride) + if v_cache_stride is not None: + v_cache = rearrange_tensor(v_cache, v_cache_stride) + + out_tensor = to_tensor(out, lib) + q_tensor = to_tensor(q, lib) + k_tensor = to_tensor(k, lib) + v_tensor = to_tensor(v, lib) + k_cache_tensor = to_tensor(k_cache, lib) + v_cache_tensor = to_tensor(v_cache, lib) + + descriptor = infiniopAttentionDescriptor_t() + check_error( + lib.infiniopCreateAttentionDescriptor( + handle, + ctypes.byref(descriptor), + out_tensor.descriptor, + q_tensor.descriptor, + k_tensor.descriptor, + v_tensor.descriptor, + k_cache_tensor.descriptor, + v_cache_tensor.descriptor, + pos, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + out_tensor.descriptor.contents.invalidate() + q_tensor.descriptor.contents.invalidate() + k_tensor.descriptor.contents.invalidate() + v_tensor.descriptor.contents.invalidate() + k_cache_tensor.descriptor.contents.invalidate() + v_cache_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, out.device) + + check_error( + lib.infiniopAttention( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + out_tensor.data, + q_tensor.data, + k_tensor.data, + v_tensor.data, + k_cache_tensor.data, + v_cache_tensor.data, + None, + ) + ) + + assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2) + + check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + test( + lib, + handle, + "cpu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + 
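+        # same configurations as the CPU run, executed on the CUDA backend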
test( + lib, + handle, + "cuda", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) in test_cases: + test( + lib, + handle, + "mlu", + n_q_head, + n_kv_head, + seq_len, + head_dim, + pos, + k_cache_buf_len, + v_cache_buf_len, + dtype, + q_stride, + k_stride, + v_stride, + k_cache_stride, + v_cache_stride, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # prefill + ( + 32, # n_q_head + 4, # n_kv_head + 5, # seq_len + 64, # head_dim + 0, # pos + 2048, # k_cache_buf_len + 2048, # v_cache_buf_len + torch.float16, # dtype + [64, 2560, 1], # q_stride + [64, 2560, 1], # k_stride + [64, 2560, 1], # v_stride + [64, 11264, 1], # k_cache_stride + [64, 11264, 1], # v_cache_stride + ), + # decode + ( + 32, # n_q_head + 4, # n_kv_head + 1, # seq_len + 64, # head_dim + 3, # pos + 2048, # k_cache_buf_len + 2048, # v_cache_buf_len + torch.float16, # dtype + [64, 2560, 1], # q_stride + [64, 2560, 1], # k_stride + [64, 2560, 1], # v_stride + [64, 11264, 1], # k_cache_stride + [64, 11264, 1], # v_cache_stride + ), + # for test + ( + 8, # n_q_head + 4, # n_kv_head + 2, # seq_len + 16, # head_dim + 1, # pos + 8, # k_cache_buf_len + 8, # v_cache_buf_len + torch.float16, # dtype + None, # q_stride + None, # k_stride + None, # v_stride + None, # k_cache_stride + None, # v_cache_stride + ), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateAttentionDescriptor.restype = c_int32 + lib.infiniopCreateAttentionDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAttentionDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_uint64, + ] + + lib.infiniopGetAttentionWorkspaceSize.restype = c_int32 + lib.infiniopGetAttentionWorkspaceSize.argtypes = [ + infiniopAttentionDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopAttention.restype = c_int32 + lib.infiniopAttention.argtypes = [ + infiniopAttentionDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAttentionDescriptor.restype = c_int32 + lib.infiniopDestroyAttentionDescriptor.argtypes = [ + infiniopAttentionDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py new file mode 100644 index 00000000..9c240789 --- /dev/null +++ b/operatorspy/tests/avg_pool.py @@ -0,0 +1,239 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + 
destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class AvgPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor) + + +def pool(x, k, padding, stride, dilation = 1): + pooling_layers = { + 1: torch.nn.AvgPool1d, + 2: torch.nn.AvgPool2d, + 3: torch.nn.AvgPool3d, + } + + ndim = len(x.shape) - 2 + if ndim not in pooling_layers: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + if ndim == 3 and x.dtype == torch.float16: + ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16) + else: + ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x) + if PROFILE: + torch.cuda.synchronize() + return ans + + +def inferShape(x_shape, kernel_shape, padding, strides): + assert ( + len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides) + ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel" + input_shape = x_shape[2:] + output_shape = [] + + for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides): + output_dim = (dim + 2 * p - k) // s + 1 + output_shape.append(output_dim) + + return x_shape[:2] + tuple(output_shape) + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + k_shape, + padding, + strides, + tensor_dtype=torch.float16, +): + print( + f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pool(x, k_shape, padding, strides) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopAvgPoolDescriptor_t() + + check_error( + lib.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_void_p(k_shape), + tuple_to_void_p(padding), + tuple_to_void_p(strides), + len(k_shape), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + 
y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, kernel_shape, padding, strides + ((1, 1, 10), (3,), (1,), (1,)), + ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), + ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopAvgPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopAvgPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopAvgPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopAvgPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/causal_softmax.py b/operatorspy/tests/causal_softmax.py index 09c15fec..b7cabc4a 100644 --- a/operatorspy/tests/causal_softmax.py +++ b/operatorspy/tests/causal_softmax.py @@ -1,20 +1,34 @@ -from ctypes import c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p import ctypes import sys import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + 
check_error, + rearrange_tensor, + create_workspace, ) from operatorspy.tests.test_utils import get_args import torch +class CausalSoftmaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor) + + def causal_softmax(x): type = x.dtype mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) @@ -23,49 +37,142 @@ def causal_softmax(x): return torch.nn.functional.softmax(masked, dim=-1).to(type) -def test(lib, descriptor, torch_device): - x = torch.rand((32, 20, 512), dtype=torch.float16).to(torch_device) +def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16): + print( + f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}" + ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) + if x_stride is not None: + x = rearrange_tensor(x, x_stride) ans = causal_softmax(x) - lib.causalSoftmax(descriptor, to_tensor(x, lib), None) - assert torch.allclose(x, ans, atol=0, rtol=1e-3) - print("Test passed!") + x_tensor = to_tensor(x, lib) + descriptor = infiniopCausalSoftmaxDescriptor_t() + check_error( + lib.infiniopCreateCausalSoftmaxDescriptor( + handle, ctypes.byref(descriptor), x_tensor.descriptor + ) + ) + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetCausalSoftmaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + workspace = create_workspace(workspace_size.value, x.device) + check_error( + lib.infiniopCausalSoftmax( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + x_tensor.data, + None, + ) + ) + assert torch.allclose(x, ans, atol=0, rtol=1e-2) + check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) -def test_cpu(lib): + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createCausalSoftmaxDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "cpu", x_shape, x_stride) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createCausalSoftmaxDescriptor(device, config) - test(lib, descriptor, "cuda") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "cuda", x_shape, x_stride) + destroy_handle(lib, handle) + -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu + device = DeviceEnum.DEVICE_BANG - descriptor = lib.createCausalSoftmaxDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroyCausalSoftmaxDescriptor(descriptor) + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "mlu", x_shape, x_stride) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "npu", x_shape, x_stride) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, 
handle, "cuda", x_shape, x_stride) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + + handle = create_handle(lib, device) + for x_shape, x_stride in test_cases: + test(lib, handle, "musa", x_shape, x_stride) + + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # x_shape, x_stride + ((32, 20, 512), None), + ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 + ] args = get_args() lib = open_lib() - lib.createCausalSoftmaxDescriptor.restype = c_void_p - lib.destroyCausalSoftmaxDescriptor.argtypes = [c_void_p] - lib.causalSoftmax.argtypes = [ + lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 + lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopCausalSoftmaxDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopCausalSoftmax.restype = c_int32 + lib.infiniopCausalSoftmax.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + c_void_p, + c_uint64, c_void_p, - CTensor, c_void_p, ] + lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32 + lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [ + infiniopCausalSoftmaxDescriptor_t, + ] + if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py new file mode 100644 index 00000000..7e7ea953 --- /dev/null +++ b/operatorspy/tests/conv.py @@ -0,0 +1,297 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +import math +import ctypes +from torch.nn import functional as F +from typing import List, Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ConvDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopConvDescriptor_t = POINTER(ConvDescriptor) + + +def conv(x, w, stride, padding, dilation): + match len(x.shape) - 2: + case 1: + return F.conv1d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case 2: + return F.conv2d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case 3: + return F.conv3d( + x, w, stride=stride, padding=padding, dilation=dilation + ) + case _: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + +# infer the shape of the output given the inputs for a N-ary convolution +def inferShape( + x_shape: List[int], + w_shape: List[int], + pads: List[int], + 
strides: List[int], + dilations: List[int], +) -> Tuple[int, ...]: + assert ( + len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2 + ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2" + output_dims = [ + math.floor( + (x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1) + / strides[i] + + 1 + ) + for i in range(len(pads)) + ] + return (x_shape[0], w_shape[0]) + tuple(output_dims) + + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + lib, + handle, + torch_device, + x_shape, + w_shape, + pads, + strides, + dilations, + tensor_stride=None, + tensor_dtype=torch.float16, +): + assert len(pads) == len(strides) == len(dilations) + print( + f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}" + ) + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device) + y = torch.zeros( + inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype + ).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = conv(x, w, strides, pads, dilations) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = conv(x, w, strides, pads, dilations) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + w_tensor = to_tensor(w, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopConvDescriptor_t() + + check_error( + lib.infiniopCreateConvDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w_tensor.descriptor, + tuple_to_void_p(pads), + tuple_to_void_p(strides), + tuple_to_void_p(dilations), + len(pads), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + w_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopConv( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if (tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + else: + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyConvDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, 
dilations, x_strides in test_cases: + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, w_shape, pads, strides, dilations, x_strides + ( + (32, 3, 4), + (32, 3, 5), + (1,), + (1,), + (1,), + None, + ), + ( + (1, 3, 4, 4), + (2, 3, 3, 3), + (1, 1), + (1, 2), + (2, 1), + None, + ), + ( + (32, 3, 128, 128), + (64, 3, 5, 5), + (2, 2), + (2, 2), + (1, 1), + None, + ), + ( + (1, 1, 4, 4, 4), + (1, 1, 5, 5, 5), + (1, 1, 1), + (1, 1, 1), + (1, 1, 1), + None, + ), + ( + (32, 3, 32, 32, 32), + (64, 3, 5, 5, 5), + (3, 2, 2), + (4, 3, 3), + (2, 2, 1), + None, + ), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateConvDescriptor.restype = c_int32 + lib.infiniopCreateConvDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopConvDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopConv.restype = c_int32 + lib.infiniopConv.argtypes = [ + infiniopConvDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyConvDescriptor.restype = c_int32 + lib.infiniopDestroyConvDescriptor.argtypes = [ + infiniopConvDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/expand.py b/operatorspy/tests/expand.py new file mode 100644 index 00000000..87365c05 --- /dev/null +++ b/operatorspy/tests/expand.py @@ -0,0 +1,191 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ExpandDescriptor(Structure): + _fields_ = 
[("device", c_int32)] + + +infiniopExpandDescriptor_t = POINTER(ExpandDescriptor) + + +def expand(x, y): + if PROFILE: + ans = x.expand_as(y).clone() + torch.cuda.synchronize() + return ans + return x.expand_as(y) + + +def test( + lib, + handle, + torch_device, + y_shape, + x_shape, + y_stride=None, + x_stride=None, + tensor_dtype=torch.float16, +): + print( + f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device) + + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = expand(x, y) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = expand(x, y) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopExpandDescriptor_t() + + check_error( + lib.infiniopCreateExpandDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyExpandDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for y_shape, x_shape, y_stride, x_stride in test_cases: + test(lib, handle, "musa", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) + test(lib, handle, "musa", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) + destroy_handle(lib, handle) 
+ + +if __name__ == "__main__": + test_cases = [ + # y_shape, x_shape, y_stride, x_stride + ((), (), None, None), + ((3, 3), (1,), None, None), + ((5, 4, 3), (4, 3,), None, (6, 1)), + ((99, 111), (111,), None, None), + ((2, 4, 3), (1, 3), None, None), + ((2, 20, 3), (2, 1, 3), None, None), + ((2, 3, 4, 5), (5,), None, None), + ((3, 2, 4, 5), (3, 2, 1, 1), None, None), + ((32, 256, 112, 112), (32, 256, 112, 1), None, None), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateExpandDescriptor.restype = c_int32 + lib.infiniopCreateExpandDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopExpandDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopExpand.restype = c_int32 + lib.infiniopExpand.argtypes = [ + infiniopExpandDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpandDescriptor.restype = c_int32 + lib.infiniopDestroyExpandDescriptor.argtypes = [ + infiniopExpandDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py new file mode 100644 index 00000000..5da99eac --- /dev/null +++ b/operatorspy/tests/gemm.py @@ -0,0 +1,374 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class GEMMDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor) + + +def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32): + A = A.T if transA else A + B = B.T if transB else B + result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype) + if C is not None: + result += beta * C if dtype != torch.float16 else C.to(torch.float32) + if PROFILE: + torch.cuda.synchronize() + return result + + +def test( + lib, + handle, + torch_device, + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride=None, + b_stride=None, + c_stride=None, + y_stride=None, + dtype=torch.float16, +): + print( + f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} " + f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} " + f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}" + ) + + a = torch.rand(a_shape, dtype=dtype).to(torch_device) + b = torch.rand(b_shape, dtype=dtype).to(torch_device) + c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None + y = torch.rand(y_shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = 
rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None and c is not None: + c = rearrange_tensor(c, c_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gemm(a, b, c, transA, transB, alpha, beta, dtype) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gemm(a, b, c, transA, transB, alpha, beta, dtype) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) if c is not None else None + y_tensor = to_tensor(y, lib) + descriptor = infiniopGEMMDescriptor_t() + check_error( + lib.infiniopCreateGEMMDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + c_tensor.descriptor if c_tensor else None, + alpha, + beta, + transA, + transB, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + if c_tensor is not None: + c_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspace_size = ctypes.c_uint64(0) + check_error( + lib.infiniopGetGEMMWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to( + torch_device + ) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGEMM( + descriptor, + workspace_ptr, + workspace_size, + y_tensor.data, + a_tensor.data, + b_tensor.data, + c_tensor.data if c_tensor else None, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-2) + check_error(lib.infiniopDestroyGEMMDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + 
destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + alpha, + beta, + transA, + transB, + a_shape, + b_shape, + c_shape, + y_shape, + a_stride, + b_stride, + c_stride, + y_stride, + ) in test_cases: + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) + test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride + ( + 1.0, + 1.0, + False, + False, + (1, 2048), + (2048, 2048), + (1, 2048), + (1, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + True, + (2048, 4), + (2048, 2048), + (4, 2048), + (4, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + False, + True, + (1, 2048), + (1000, 2048), + (1000), + (1, 1000), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + False, + (2048, 4), + (2048, 2048), + (2048), + (4, 2048), + (4096, 1), + (4096, 1), + (2,), + (4096, 1), + ), + ( + 1.0, + 1.0, + False, + False, + (3, 1, 2048), + (3, 2048, 2048), + (1,), + (3, 1, 2048), + None, + None, + None, + None, + ), + ( + 1.0, + 1.0, + True, + False, + (2048, 4), + (2048, 2048), + None, + (4, 2048), + (4096, 1), + (4096, 1), + (2,), + (4096, 1), + ), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateGEMMDescriptor.restype = c_int32 + lib.infiniopCreateGEMMDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGEMMDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + c_bool, + c_bool, + ] + + lib.infiniopGetGEMMWorkspaceSize.restype = c_int32 + lib.infiniopGetGEMMWorkspaceSize.argtypes = [ + infiniopGEMMDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopGEMM.restype = c_int32 + lib.infiniopGEMM.argtypes = [ + infiniopGEMMDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGEMMDescriptor.restype = c_int32 + lib.infiniopDestroyGEMMDescriptor.argtypes = [ + infiniopGEMMDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py new file mode 100644 index 00000000..33f7b64d --- /dev/null +++ b/operatorspy/tests/global_avg_pool.py @@ -0,0 +1,208 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch, time + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA 
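+# When PROFILE is True, each test performs NUM_PRERUN warm-up runs and then
+# averages the wall-clock time over NUM_ITERATIONS timed runs.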
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class GlobalAvgPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor) + + +def inferShape(x): + return x.shape[:2] + (1,) * (x.dim() - 2) + + +def globalAvgPool(x): + y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True) + if PROFILE: + torch.cuda.synchronize() + return y.view(*inferShape(x)) + + +def test( + lib, + handle, + torch_device, + x_shape, + tensor_dtype=torch.float16, +): + print( + f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = globalAvgPool(x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = globalAvgPool(x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopGlobalAvgPoolDescriptor_t() + + check_error( + lib.infiniopCreateGlobalAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetGlobalAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspaceSize) + ) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to( + torch_device + ) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGlobalAvgPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape in test_cases: + test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape + ((1, 3, 3)), + ((1, 3, 1, 1, 3)), + ((1, 3, 1, 1, 257)), + ((1, 2, 1, 1, 514)), + ((1, 3, 1, 1, 1025)), + ((32, 
256, 1, 112, 112)), + ((2, 3, 2048000)), + ((2, 1, 10243)), + ((2, 20, 100)), + ((3, 33, 333)), + ((32, 20, 512)), + ((3, 3, 11, 11, 11, 3, 2)), + ((32, 256, 1, 112, 112)), + ((32, 256, 112, 112)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGlobalAvgPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopGlobalAvgPool.restype = c_int32 + lib.infiniopGlobalAvgPool.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [ + infiniopGlobalAvgPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py index 9dce5f31..31076fb5 100644 --- a/operatorspy/tests/matmul.py +++ b/operatorspy/tests/matmul.py @@ -1,6 +1,8 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -8,81 +10,416 @@ to_tensor, CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, ) -from operatorspy.tests.test_utils import get_args +from operatorspy.tests.test_utils import get_args, synchronize_device import torch +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 -def matmul(c, beta, a, b, alpha): +class MatmulDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor) + +def matmul(_c, beta, _a, _b, alpha): + a = _a.clone() + b = _b.clone() + c = _c.clone() input_dtype = c.dtype - return ( + ans = ( alpha * torch.matmul(a.to(torch.float32), b.to(torch.float32)).to(input_dtype) + beta * c ) + return ans -def test(lib, descriptor, torch_device): - c = torch.zeros((1, 2048), dtype=torch.float16).to(torch_device) - a = torch.rand((1, 2048), dtype=torch.float16).to(torch_device) - b = torch.rand((2048, 2048), dtype=torch.float16).to(torch_device) +def test( + lib, + handle, + torch_device, + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride=None, + b_stride=None, + c_stride=None, + dtype=torch.float16, +): + print( + f"Testing Matmul on {torch_device} with a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape}" + f" a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" + ) - beta = 0.0 - alpha = 1.0 + a = torch.rand(a_shape, dtype=dtype).to(torch_device) + b = torch.rand(b_shape, dtype=dtype).to(torch_device) + c = torch.ones(c_shape, dtype=dtype).to(torch_device) ans = matmul(c, beta, a, b, alpha) - lib.matmul( - descriptor, - to_tensor(c, lib), - beta, - to_tensor(a, lib), - to_tensor(b, lib), - alpha, - None, + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if 
b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None: + c = rearrange_tensor(c, c_stride) + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) + descriptor = infiniopMatmulDescriptor_t() + check_error( + lib.infiniopCreateMatmulDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + alpha, + a_tensor.descriptor, + b_tensor.descriptor, + beta + ) ) - assert torch.allclose(c, ans, atol=0, rtol=1e-3) - print("Test passed!") + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, a.device) + + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + + assert torch.allclose(c, ans, atol=0, rtol=1e-2) + + if PROFILE: + for i in range(NUM_PRERUN): + _ = matmul(c, beta, a, b, alpha) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = matmul(c, beta, a, b, alpha) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + for i in range(NUM_PRERUN): + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMatmul( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + c_tensor.data, + a_tensor.data, + b_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + + check_error(lib.infiniopDestroyMatmulDescriptor(descriptor)) -def test_cpu(lib): +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroyMatmulDescriptor(descriptor) + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cpu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroyMatmulDescriptor(descriptor) + destroy_handle(lib, handle) -def test_bang(lib): + +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createMatmulDescriptor(device, None) - test(lib, 
descriptor, "mlu") - lib.destroyMatmulDescriptor(descriptor) + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "mlu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "npu", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for ( + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) in test_cases: + test( + lib, + handle, + "musa", + alpha, + beta, + a_shape, + b_shape, + c_shape, + a_stride, + b_stride, + c_stride, + dtype, + ) if __name__ == "__main__": + test_cases = [ + # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float16), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float32), + (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float16), + (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float32), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float16), + (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float32), + (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float16), + (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float32), + (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float16), + (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float32), + ] args = get_args() lib = open_lib() - lib.createMatmulDescriptor.restype = c_void_p - lib.destroyMatmulDescriptor.argtypes = [c_void_p] - lib.matmul.argtypes = [ - c_void_p, - CTensor, - c_float, - CTensor, - CTensor, + + lib.infiniopCreateMatmulDescriptor.restype = c_int32 + lib.infiniopCreateMatmulDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMatmulDescriptor_t), + infiniopTensorDescriptor_t, c_float, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float + ] + + lib.infiniopGetMatmulWorkspaceSize.restype = c_int32 + lib.infiniopGetMatmulWorkspaceSize.argtypes = [ + infiniopMatmulDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopMatmul.restype = c_int32 + lib.infiniopMatmul.argtypes = [ + infiniopMatmulDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, c_void_p, ] + + lib.infiniopDestroyMatmulDescriptor.restype = c_int32 + 
lib.infiniopDestroyMatmulDescriptor.argtypes = [ + infiniopMatmulDescriptor_t, + ] + + if args.profile: + PROFILE = True if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max_pool.py b/operatorspy/tests/max_pool.py new file mode 100644 index 00000000..ffc0bb19 --- /dev/null +++ b/operatorspy/tests/max_pool.py @@ -0,0 +1,236 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class MaxPoolDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor) + + +def pool(x, k, padding, stride, dilation = 1): + pooling_layers = { + 1: torch.nn.MaxPool1d, + 2: torch.nn.MaxPool2d, + 3: torch.nn.MaxPool3d, + } + + ndim = len(x.shape) - 2 + if ndim not in pooling_layers: + print("Error: Pytorch -> Unsupported tensor dimension") + return None + + ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x) + if PROFILE: + torch.cuda.synchronize() + return ans + + +def inferShape(x_shape, kernel_shape, padding, strides): + assert ( + len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides) + ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel" + input_shape = x_shape[2:] + output_shape = [] + + for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides): + output_dim = (dim + 2 * p - k) // s + 1 + output_shape.append(output_dim) + + return x_shape[:2] + tuple(output_shape) + +# convert a python tuple to a ctype void pointer +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + +def test( + lib, + handle, + torch_device, + x_shape, + k_shape, + padding, + strides, + tensor_dtype=torch.float16, +): + print( + f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" + ) + + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pool(x, k_shape, padding, strides) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pool(x, k_shape, padding, strides) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = 
to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopMaxPoolDescriptor_t() + + check_error( + lib.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_void_p(k_shape), + tuple_to_void_p(padding), + tuple_to_void_p(strides), + len(k_shape), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMaxPool( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for x_shape, kernel_shape, padding, strides in test_cases: + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) + test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, kernel_shape, padding, strides + ((1, 1, 10), (3,), (1,), (1,)), + ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), + ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMaxPoolDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_uint64, + ] + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopMaxPoolDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopMaxPoolDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + 
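+    # The c_void_p entries above correspond to (workspace, y, x, stream) in the
+    # lib.infiniopMaxPool call made inside test().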
lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopMaxPoolDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/mlp.py b/operatorspy/tests/mlp.py new file mode 100644 index 00000000..668d7861 --- /dev/null +++ b/operatorspy/tests/mlp.py @@ -0,0 +1,316 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args +import torch +import torch.nn as nn + + +class MLPDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMLPDescriptor_t = POINTER(MLPDescriptor) + + +def swiglu(a, b): + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) + + +def mlp(y, x, w12, w3, alpha, residual): + input_dtype = x.dtype + + intermediate_size = w3.shape[0] + + a = torch.matmul( + x.to(torch.float32), w12[:, intermediate_size:].to(torch.float32) + ).to(input_dtype) + b = torch.matmul( + x.to(torch.float32), w12[:, 0:intermediate_size].to(torch.float32) + ).to(input_dtype) + c = swiglu(a, b) + d = torch.matmul(c.to(torch.float32), alpha * w3.to(torch.float32)).to(input_dtype) + out = d + y if residual else d + return out + + +def test( + lib, + handle, + torch_device, + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype=torch.float16, + x_stride=None, + y_stride=None, + w12_stride=None, + w3_stride=None, +): + print( + f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}" + f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}" + ) + + y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01 + x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01 + w12 = ( + torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device) + * 0.01 + ) + w3 = ( + torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device) + * 0.01 + ) + + ans = mlp(y, x, w12, w3, alpha, residual) + + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + if w12_stride is not None: + w12 = rearrange_tensor(w12, w12_stride) + if w3_stride is not None: + w3 = rearrange_tensor(w3, w3_stride) + + y_tensor = to_tensor(y, lib) + x_tensor = to_tensor(x, lib) + w12_tensor = to_tensor(w12, lib) + w3_tensor = to_tensor(w3, lib) + descriptor = infiniopMLPDescriptor_t() + check_error( + lib.infiniopCreateMLPDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + w12_tensor.descriptor, + w3_tensor.descriptor, + alpha, + residual, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + y_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + 
w12_tensor.descriptor.contents.invalidate() + w3_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, x.device) + + check_error( + lib.infiniopMLP( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w12_tensor.data, + w3_tensor.data, + None, + ) + ) + assert torch.allclose(y, ans, atol=0, rtol=2e-2) + + check_error(lib.infiniopDestroyMLPDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "cpu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "cuda", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + + for ( + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) in test_cases: + test( + lib, + handle, + "mlu", + num_tokens, + hidden_size, + intermediate_size, + alpha, + residual, + dtype, + x_stride, + y_stride, + w12_stride, + w3_stride, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride + (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None), + (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None), + ( + 4, + 4096, + 11008, + 1.0, + True, + torch.float16, + None, + None, + [1, 4096], + [1, 11008], + ), + (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None), + (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateMLPDescriptor.restype = c_int32 + lib.infiniopCreateMLPDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMLPDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_bool, + ] + + lib.infiniopGetMLPWorkspaceSize.restype = c_int32 + lib.infiniopGetMLPWorkspaceSize.argtypes = [ + infiniopMLPDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopMLP.restype = c_int32 + lib.infiniopMLP.argtypes = [ + infiniopMLPDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMLPDescriptor.restype = c_int32 + lib.infiniopDestroyMLPDescriptor.argtypes = [ + infiniopMLPDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, 
test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py new file mode 100644 index 00000000..85a3c681 --- /dev/null +++ b/operatorspy/tests/random_sample.py @@ -0,0 +1,250 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, + U64, +) + +from operatorspy.tests.test_utils import get_args +import torch + + +class RandomSampleDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor) + + +def random_sample(data, random_val, topp, topk, voc, temperature, torch_device): + indices = torch.zeros([topk], dtype = torch.int64) + dataNp = data.clone().detach() + sorted_indices = torch.arange(voc) + + for i in range(topk): + for j in range(i + 1, voc): + if(dataNp[i] < dataNp[j]): + tmp = dataNp[i].clone().detach() + dataNp[i] = dataNp[j].clone().detach() + dataNp[j] = tmp + + tmpInd = sorted_indices[i].clone().detach() + sorted_indices[i] = sorted_indices[j].clone().detach() + sorted_indices[j] = tmpInd + + #sorted_indices = torch.argsort(dataNp, descending=True) + indices = sorted_indices[:topk] + + dataNp = dataNp[sorted_indices] + + globalM = dataNp[0] + dataNp = (dataNp - globalM) / temperature + dataNp = torch.softmax(dataNp.float(), dim = 0) + sum_s = 0 + for end in range(topk): + sum_s += dataNp[end] + if(sum_s >= topp): + break + if(end < topk - 1): + end += 1 + else: + end = topk + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + random_val *= sum_s + + sum_s = 0 + for i in range(end): + sum_s += dataNp[i] + if(random_val < sum_s): + return indices[i] + +def random_sample_0(data): + return torch.argmax(data) + +def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16): + print( + f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}" + ) + data = torch.arange(voc).float() * 0.0001 + _perm = torch.randperm(voc) + if (torch_device == 'maca'): + data = data[_perm].to(x_dtype).to('cuda') + else: + data = data[_perm].to(x_dtype).to(torch_device) + if(topp > 0 and topk > 1): + ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu") + else: + ans = random_sample_0(data) + if(torch_device == 'maca'): + indices = torch.zeros([1], dtype = torch.int64).to('cuda') + else: + indices = torch.zeros([1], dtype = torch.int64).to(torch_device) + x_tensor = to_tensor(data, lib) + indices_tensor = to_tensor(indices, lib) + indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64 + + descriptor = infiniopRandomSampleDescriptor_t() + check_error( + lib.infiniopCreateRandomSampleDescriptor( + handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRandomSampleWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + 
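+    # Allocate a workspace of the size reported by the library on the test device.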
workspace = create_workspace(workspace_size.value, torch_device) + check_error( + lib.infiniopRandomSample( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + indices_tensor.data, + x_tensor.data, + random_val, + topp, + topk, + temperature, + None, + ) + ) + if torch_device == "npu": + torch.npu.synchronize() + + assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] + check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "cpu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "cuda", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "mlu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "npu", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "maca", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for (voc, random_val, topp, topk, temperature) in test_cases: + test(lib, handle, "musa", voc, random_val, topp, topk, temperature) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # voc, random_val, topp, topk, temperature + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0, 1, 2.0), + (16384, 0.15, 0, 1, 2.0), + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 + lib.infiniopCreateRandomSampleDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRandomSampleDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32 + lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [ + infiniopRandomSampleDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopRandomSample.restype = c_int32 + lib.infiniopRandomSample.argtypes = [ + infiniopRandomSampleDescriptor_t, + c_void_p, + c_uint64, + c_uint64, + c_void_p, + c_float, + c_float, + c_int32, + c_float, + c_void_p, + ] + lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32 + lib.infiniopDestroyRandomSampleDescriptor.argtypes = [ + infiniopRandomSampleDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if 
args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rearrange.py b/operatorspy/tests/rearrange.py new file mode 100644 index 00000000..9709e6b3 --- /dev/null +++ b/operatorspy/tests/rearrange.py @@ -0,0 +1,181 @@ +import ctypes +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, +) + +from operatorspy.tests.test_utils import get_args +import torch + + +class RerrangeDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor) + + +def test( + lib, + handle, + torch_device, + x_shape, + x_stride, + y_shape, + y_stride, + x_dtype=torch.float16, +): + print( + f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}" + ) + x = torch.rand(x_shape, dtype=x_dtype).to(torch_device) + y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device) + if x_stride is not None: + x = rearrange_tensor(x, x_stride) + if y_stride is not None: + y = rearrange_tensor(y, y_stride) + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + + descriptor = infiniopRearrangeDescriptor_t() + check_error( + lib.infiniopCreateRearrangeDescriptor( + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None) + ) + assert torch.allclose(x, y, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_bang(lib, test_cases): + import torch_mlu + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_maca(lib, 
test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "musa", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for test_case in test_cases: + x_shape, x_stride = test_case[0] + y_shape, y_stride = test_case[1] + test(lib, handle, "musa", x_shape, x_stride, y_shape, y_stride) + destroy_handle(lib, handle) + +if __name__ == "__main__": + args = get_args() + test_cases = [ + # ((src_shape, src_stride), (dst_shape, dst_stride)) + (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), + (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), + (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), + (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), + (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), + (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), + (((64,), (1,)), ((64,), (1,))), + ] + lib = open_lib() + lib.infiniopCreateRearrangeDescriptor.restype = c_int32 + lib.infiniopCreateRearrangeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRearrangeDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopRearrange.restype = c_int32 + lib.infiniopRearrange.argtypes = [ + infiniopRearrangeDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRearrangeDescriptor.restype = c_int32 + lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t] + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reform.py b/operatorspy/tests/reform.py deleted file mode 100644 index d671c003..00000000 --- a/operatorspy/tests/reform.py +++ /dev/null @@ -1,91 +0,0 @@ -import ctypes -from ctypes import c_float, POINTER, c_void_p -import sys -import os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from operatorspy import ( - open_lib, - to_tensor, - CTensor, - DeviceEnum, -) - -from operatorspy.tests.test_utils import get_args -import torch -import time - - -def test(lib, descriptor, torch_device, x = None): - if x is None: - x = torch.rand((10, 10), dtype=torch.float16).to(torch_device) - else: - x = x.to(torch_device) - y = torch.zeros((5, 5), dtype=torch.float16).to(torch_device) - - lib.reform(descriptor, to_tensor(y, lib), to_tensor(x, lib, [5, 5], [20, 2]), None) - - return x, y - -def test_cpu(lib): - device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createReformDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyReformDescriptor(descriptor) - print("Test passed!") - -def run_cpu(lib): - device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createReformDescriptor(device, config) - x, ans = test(lib, 
descriptor, "cpu") - lib.destroyReformDescriptor(descriptor) - return x, ans - -def test_cuda(lib): - device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createReformDescriptor(device, config) - - # compare with cpu results - x, cpu_ans = run_cpu(lib) - _, cuda_ans = test(lib, descriptor, "cuda", x) - - assert torch.allclose(cuda_ans.cpu(), cpu_ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyReformDescriptor(descriptor) - -def test_bang(lib): - import torch_mlu - device = DeviceEnum.DEVICE_BANG - descriptor = lib.createReformDescriptor(device, None) - - # compare with cpu results - x, cpu_ans = run_cpu(lib) - _, bang_ans = test(lib, descriptor, "mlu", x) - - assert torch.allclose(bang_ans.cpu(), cpu_ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyReformDescriptor(descriptor) - - -if __name__ == "__main__": - args = get_args() - lib = open_lib() - lib.createReformDescriptor.restype = c_void_p - lib.destroyReformDescriptor.argtypes = [c_void_p] - lib.reform.argtypes = [ - c_void_p, - CTensor, - CTensor, - c_void_p, - ] - if args.cpu: - test_cpu(lib) - if args.cuda: - test_cuda(lib) - if args.bang: - test_bang(lib) diff --git a/operatorspy/tests/relu.py b/operatorspy/tests/relu.py new file mode 100644 index 00000000..b99706ff --- /dev/null +++ b/operatorspy/tests/relu.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +class ReluDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReluDescriptor_t = POINTER(ReluDescriptor) + + +def relu(x): + if PROFILE: + ans = torch.nn.functional.relu(x).to(x.dtype) + torch.cuda.synchronize() + return ans + return torch.nn.functional.relu(x).to(x.dtype) + + +def test( + lib, + handle, + torch_device, + tensor_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = relu(x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = relu(x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor + descriptor = infiniopReluDescriptor_t() + + check_error( + lib.infiniopCreateReluDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + 
x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReluDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "musa", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "musa", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), Inplace.OUT_OF_PLACE), + ((), Inplace.INPLACE_X), + ((1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), Inplace.INPLACE_X), + ((32, 20, 512), Inplace.INPLACE_X), + ((33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReluDescriptor.restype = c_int32 + lib.infiniopCreateReluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReluDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopRelu.restype = c_int32 + lib.infiniopRelu.argtypes = [ + infiniopReluDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReluDescriptor.restype = c_int32 + lib.infiniopDestroyReluDescriptor.argtypes = [ + infiniopReluDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rms_norm.py b/operatorspy/tests/rms_norm.py index 2442376d..46b1d0f3 100644 --- a/operatorspy/tests/rms_norm.py +++ b/operatorspy/tests/rms_norm.py @@ -1,4 +1,5 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import ctypes import sys import os @@ -6,13 +7,24 @@ from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, ) from operatorspy.tests.test_utils import get_args import torch +class RMSNormDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor) def rms_norm(x, w, eps): input_dtype = x.dtype @@ -22,61 +34,156 @@ def rms_norm(x, w, eps): return w * hidden_states.to(input_dtype) -def test(lib, descriptor, torch_device): - y = torch.zeros((16, 13312), dtype=torch.float16).to(torch_device) - x = torch.rand((16, 2048), dtype=torch.float16).to(torch_device) - w = torch.ones((2048,), dtype=torch.float16).to(torch_device) +def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16): + print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}" + f" dtype:{dtype} w_dtype:{w_dtype}") + + y = torch.zeros(y_shape, dtype=dtype).to(torch_device) + x = torch.rand(x_shape, dtype=dtype).to(torch_device) + w = torch.ones(w_shape, dtype=w_dtype).to(torch_device) eps = 1e-5 ans = rms_norm(x, w, eps) - lib.rmsNorm( - descriptor, to_tensor(y, lib, [16, 2048], [26624, 2]), to_tensor(x, lib), to_tensor(w, lib), eps, None + + y_tensor = to_tensor(y, lib) + x_tensor = to_tensor(x, lib) + w_tensor = to_tensor(w, lib) + + descriptor = infiniopRMSNormDescriptor_t() + w_dataType = 0 if w_dtype==torch.float16 else 1 + + check_error( + lib.infiniopCreateRMSNormDescriptor( + handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, + w_tensor.descriptor, eps + ) ) - # print(ans) - # print("=======================================================") - # print(y[:, :2048]) - assert torch.allclose(y[:, :2048], ans, atol=1e-3, rtol=1e-3) - print("Test passed!") + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + w_tensor.descriptor.contents.invalidate() + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRMSNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = create_workspace(workspace_size.value, y.device) + check_error( + lib.infiniopRMSNorm( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + y_tensor.data, + x_tensor.data, + w_tensor.data, + None, + ) + ) -def test_cpu(lib): - device = DeviceEnum.DEVICE_CPU - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroyRMSNormDescriptor(descriptor) + assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3) + check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor)) +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroyRMSNormDescriptor(descriptor) + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + 
test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createRMSNormDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroyRMSNormDescriptor(descriptor) + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases: + test(lib, handle, "musa", y_shape, x_shape, w_shape, dtype, w_dtype) + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # y_shape, x_shape, w_shape, dtype, w_dtype + ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16), + ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32), + ] args = get_args() lib = open_lib() - lib.createRMSNormDescriptor.restype = c_void_p - lib.destroyRMSNormDescriptor.argtypes = [c_void_p] - lib.rmsNorm.argtypes = [ - c_void_p, - CTensor, - CTensor, - CTensor, + lib.infiniopCreateRMSNormDescriptor.restype = c_int32 + lib.infiniopCreateRMSNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRMSNormDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, c_float, + ] + + lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32 + lib.infiniopGetRMSNormWorkspaceSize.argtypes = [ + infiniopRMSNormDescriptor_t, + POINTER(c_uint64), + ] + + lib.infiniopRMSNorm.restypes = c_int32 + lib.infiniopRMSNorm.argtypes = [ + infiniopRMSNormDescriptor_t, c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRMSNormDescriptor.restype = c_int32 + lib.infiniopDestroyRMSNormDescriptor.argtypes = [ + infiniopRMSNormDescriptor_t, ] + if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/rotary_embedding.py b/operatorspy/tests/rotary_embedding.py index bfa4d8db..1c1122a6 100644 --- a/operatorspy/tests/rotary_embedding.py +++ b/operatorspy/tests/rotary_embedding.py @@ -1,20 +1,35 @@ import ctypes -from ctypes import c_float, POINTER, c_void_p +from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref import sys import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 
from operatorspy import ( open_lib, to_tensor, - CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, + U64, ) from operatorspy.tests.test_utils import get_args import torch +class RoPEDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor) + + def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): ndim = x.ndim assert 0 <= 1 < ndim @@ -30,79 +45,204 @@ def rotary_embedding(t, pos, theta, torch_device): ) freqs = torch.outer(pos, freqs) freqs_cis = torch.polar(torch.ones_like(freqs), freqs) - t_ = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2)) freqs_cis = reshape_for_broadcast(freqs_cis, t_) t_out = torch.view_as_real(t_ * freqs_cis).flatten(2).to(t.dtype) return t_out +def sin_cos_table(max_seq_len, dim, torch_device, theta): + pos = torch.arange( + 0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device) + ) + freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to( + torch_device + ) + # (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2) + freqs = torch.repeat_interleave(freqs, repeats=2) + angles = torch.outer(pos, freqs) + return torch.sin(angles), torch.cos(angles) -def test(lib, descriptor, torch_device): - t = torch.rand((1, 32, 128), dtype=torch.float16).to(torch_device) - pos = torch.ones((1,), dtype=torch.int32).to(torch_device) - theta = 1e4 - ans = rotary_embedding(t, pos, theta, torch_device) - lib.rotaryEmbedding( - descriptor, to_tensor(t, lib), to_tensor(pos, lib), c_float(theta, lib), None +def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16): + print( + f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}" ) - assert torch.allclose(t, ans, atol=1, rtol=1e-3) - print("Test passed!") + t = torch.rand(shape, dtype=dtype) + if strides is not None: + t = rearrange_tensor(t, strides) + posTmp = torch.arange(0, t.shape[0]) + pos = torch.zeros(2 * posTmp.shape[0], dtype = torch.int32) + for i in range(posTmp.shape[0]): + pos[2 * i] = posTmp[i] + pos[2 * i + 1] = 0 + theta = 1e4 + if torch_device == 'mlu' or torch_device == 'npu' or torch_device == 'musa': + ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device) + pos = pos.to(torch_device) + t = t.to(torch_device) + elif torch_device == 'maca': + ans = rotary_embedding(t, posTmp, theta, "cpu").to('cuda') + pos = pos.to('cuda') + t = t.to('cuda') + else: + t = t.to(torch_device) + pos = pos.to(torch_device) + ans = rotary_embedding(t, posTmp.to(torch_device), theta, torch_device) + + descriptor = infiniopRoPEDescriptor_t() + # 2x table length for test + sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta) + t_tensor = to_tensor(t, lib) + pos_tensor = to_tensor(pos[: t.shape[0]], lib) + pos_tensor.descriptor.contents.dt = U64 + sin_table_tensor = to_tensor(sin_table, lib) + cos_table_tensor = to_tensor(cos_table, lib) + + if torch_device == "npu": + torch.npu.synchronize() + + check_error( + lib.infiniopCreateRoPEDescriptor( + handle, + byref(descriptor), + t_tensor.descriptor, + pos_tensor.descriptor, + sin_table_tensor.descriptor, + cos_table_tensor.descriptor, + ) + ) + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + t_tensor.descriptor.contents.invalidate() + pos_tensor.descriptor.contents.invalidate() + 
sin_table_tensor.descriptor.contents.invalidate() + cos_table_tensor.descriptor.contents.invalidate() + + workspace_size = c_uint64(0) + check_error( + lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size)) + ) + workspace = create_workspace(workspace_size.value, t.device) + check_error( + lib.infiniopRoPE( + descriptor, + workspace.data_ptr() if workspace is not None else None, + workspace_size.value, + t_tensor.data, + pos_tensor.data, + sin_table_tensor.data, + cos_table_tensor.data, + None, + ) + ) + assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2) + check_error(lib.infiniopDestroyRoPEDescriptor(descriptor)) -def test_cpu(lib): + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - test(lib, descriptor, "cpu") - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "cpu", shape, strides, dtype) + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - test(lib, descriptor, "cuda") - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "cuda", shape, strides, dtype) + destroy_handle(lib, handle) + -def test_bang(lib): +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - config = None - descriptor = lib.createRotaryEmbeddingDescriptor(device, config) - - # Note: BANG does not support complex calculation, compare with cpu results - t = torch.rand((1, 32, 128), dtype=torch.float16) - pos = torch.ones((1,), dtype=torch.int32) - theta = 1e4 - ans = rotary_embedding(t, pos, theta, "cpu") - - t = t.to("mlu") - pos = pos.to("mlu") - lib.rotaryEmbedding( - descriptor, to_tensor(t, lib), to_tensor(pos, lib), c_float(theta), None - ) - assert torch.allclose(t.cpu(), ans, atol=1e-3, rtol=1e-3) - print("Test passed!") - - lib.destroyRotaryEmbeddingDescriptor(descriptor) + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "mlu", shape, strides, dtype) + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases) : + import torch_npu + + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "npu", shape, strides, dtype) + destroy_handle(lib, handle) + +def test_maca(lib, test_cases) : + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "maca", shape, strides, dtype) + destroy_handle(lib, handle) + +def test_musa(lib, test_cases) : + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + for shape, strides, dtype in test_cases: + test(lib, handle, "musa", shape, strides, dtype) + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + ((1, 32, 128), None, torch.float16), + ((1, 32, 64), None, torch.float16), + # 昇腾暂不满足这个用例,最后一维度 <=32 会有问题,可能与其核心 + # 接口 GatherMask 的内部实现相关,目前 48 64 128 都可以支持 + ((4, 1, 32), None, torch.float16), + ((1, 32, 128), None, torch.float16), + + ((3, 32, 128), (8000, 200, 1), torch.float16), + ] args = get_args() lib = open_lib() - lib.createRotaryEmbeddingDescriptor.restype = c_void_p - lib.destroyRotaryEmbeddingDescriptor.argtypes = 
[c_void_p] - lib.rotaryEmbedding.argtypes = [ + lib.infiniopCreateRoPEDescriptor.restype = c_int32 + lib.infiniopCreateRoPEDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopRoPEDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoPEWorkspaceSize.restype = c_int32 + lib.infiniopGetRoPEWorkspaceSize.argtypes = [ + infiniopRoPEDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopRoPE.restype = c_int32 + lib.infiniopRoPE.argtypes = [ + infiniopRoPEDescriptor_t, + c_void_p, + c_uint64, + c_void_p, c_void_p, - CTensor, - CTensor, - c_float, c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoPEDescriptor.restype = c_int32 + lib.infiniopDestroyRoPEDescriptor.argtypes = [ + infiniopRoPEDescriptor_t, ] if args.cpu: - test_cpu(lib) + test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.ascend or args.maca or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/swiglu.py b/operatorspy/tests/swiglu.py index 1be3c437..9ca07c14 100644 --- a/operatorspy/tests/swiglu.py +++ b/operatorspy/tests/swiglu.py @@ -1,4 +1,5 @@ -from ctypes import c_float, c_void_p +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes import sys import os @@ -8,61 +9,318 @@ to_tensor, CTensor, DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, ) from operatorspy.tests.test_utils import get_args import torch -def swiglu(gate, up): - return up * torch.nn.functional.silu(gate).to(gate.dtype) +class SwiGLUDescriptor(Structure): + _fields_ = [("device", c_int32)] -def test(lib, descriptor, torch_device): - gate = torch.rand((13, 4), dtype=torch.float16).to(torch_device) - up = torch.rand((13, 4), dtype=torch.float16).to(torch_device) - ans = swiglu(gate, up) - lib.swiglu(descriptor, to_tensor(gate, lib), to_tensor(up, lib), None) - assert torch.allclose(gate, ans, atol=1e-3, rtol=1e-3) - print("Test passed!") +infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor) -def test_cpu(lib): + +def swiglu(a, b): + + return a * b / (1 + torch.exp(-b.float()).to(b.dtype)) + +def test_out_of_place( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}" + ) + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + c = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + if c_stride is not None: + c = rearrange_tensor(c, c_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + c_tensor = to_tensor(c, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # 
Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + c_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2) + print("out-of-place Test passed!") + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_in_place1( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + a_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2) + print("in-place1 Test passed!") + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_in_place2( + lib, + handle, + torch_device, + shape, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + a = torch.rand(shape, dtype=dtype).to(torch_device) + b = torch.rand(shape, dtype=dtype).to(torch_device) + + if a_stride is not None: + a = rearrange_tensor(a, a_stride) + if b_stride is not None: + b = rearrange_tensor(b, b_stride) + ans = swiglu(a, b) + + if sync is not None: + sync() + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) + descriptor = infiniopSwiGLUDescriptor_t() + check_error( + lib.infiniopCreateSwiGLUDescriptor( + handle, + ctypes.byref(descriptor), + b_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + a_tensor.descriptor.contents.invalidate() + b_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopSwiGLU( + descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + + assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2) + + check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): device = DeviceEnum.DEVICE_CPU - descriptor = lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "cpu") - lib.destroySwigluDescriptor(descriptor) + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) -def test_cuda(lib): +def test_cuda(lib, test_cases): device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) - descriptor = 
lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "cuda") - lib.destroySwigluDescriptor(descriptor) + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + destroy_handle(lib, handle) -def test_bang(lib): + +def test_bang(lib, test_cases): import torch_mlu device = DeviceEnum.DEVICE_BANG - descriptor = lib.createSwigluDescriptor(device, None) - test(lib, descriptor, "mlu") - lib.destroySwigluDescriptor(descriptor) + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) + + +def test_ascend(lib, test_cases): + import torch_npu + device = DeviceEnum.DEVICE_ASCEND + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize + ) + test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize) + test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize) + + destroy_handle(lib, handle) + +def test_maca(lib, test_cases): + device = DeviceEnum.DEVICE_MACA + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype) + test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) + +def test_musa(lib, test_cases): + import torch_musa + device = DeviceEnum.DEVICE_MUSA + handle = create_handle(lib, device) + + for shape, a_stride, b_stride, c_stride, dtype in test_cases: + test_out_of_place( + lib, handle, "musa", shape, a_stride, b_stride, c_stride, dtype + ) + test_in_place1(lib, handle, "musa", shape, a_stride, b_stride, dtype) + test_in_place2(lib, handle, "musa", shape, a_stride, b_stride, dtype) + + destroy_handle(lib, handle) if __name__ == "__main__": + test_cases = [ + # shape, a_stride, b_stride, c_stride, dtype + ((13, 4), None, None, None, torch.float16), + ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16), + ((16, 5632), None, None, None, torch.float16), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16), + ] args = get_args() lib = open_lib() - lib.createSwigluDescriptor.restype = c_void_p - lib.destroySwigluDescriptor.argtypes = [c_void_p] - lib.swiglu.argtypes = [ + + lib.infiniopCreateSwiGLUDescriptor.restype = c_int32 + lib.infiniopCreateSwiGLUDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopSwiGLUDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopSwiGLU.restype = c_int32 + lib.infiniopSwiGLU.argtypes = [ + infiniopSwiGLUDescriptor_t, + c_void_p, + c_void_p, c_void_p, - CTensor, - CTensor, c_void_p, ] + + lib.infiniopDestroySwiGLUDescriptor.restype = c_int32 + lib.infiniopDestroySwiGLUDescriptor.argtypes = [ + infiniopSwiGLUDescriptor_t, + ] + if args.cpu: - test_cpu(lib) 
+ test_cpu(lib, test_cases) if args.cuda: - test_cuda(lib) + test_cuda(lib, test_cases) if args.bang: - test_bang(lib) + test_bang(lib, test_cases) + if args.ascend: + test_ascend(lib, test_cases) + if args.maca: + test_maca(lib, test_cases) + if args.musa: + test_musa(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/test_utils.py b/operatorspy/tests/test_utils.py index 9a75d15b..6e4960d5 100644 --- a/operatorspy/tests/test_utils.py +++ b/operatorspy/tests/test_utils.py @@ -2,6 +2,11 @@ def get_args(): import argparse parser = argparse.ArgumentParser(description="Test Operator") + parser.add_argument( + "--profile", + action="store_true", + help="Whether profile tests", + ) parser.add_argument( "--cpu", action="store_true", @@ -17,5 +22,30 @@ def get_args(): action="store_true", help="Run BANG test", ) + parser.add_argument( + "--ascend", + action="store_true", + help="Run ASCEND NPU test", + ) + parser.add_argument( + "--maca", + action="store_true", + help="Run ASCEND NPU test", + ) + parser.add_argument( + "--musa", + action="store_true", + help="Run MUSA test", + ) return parser.parse_args() + + +def synchronize_device(torch_device): + import torch + if torch_device == "cuda": + torch.cuda.synchronize() + elif torch_device == "npu": + torch.npu.synchronize() + elif torch_device == "mlu": + torch.mlu.synchronize() diff --git a/operatorspy/utils.py b/operatorspy/utils.py new file mode 100644 index 00000000..bb095658 --- /dev/null +++ b/operatorspy/utils.py @@ -0,0 +1,110 @@ +import ctypes +from .data_layout import * +from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t + + +def check_error(status): + if status != 0: + raise Exception("Error code " + str(status)) + + +def to_tensor(tensor, lib): + """ + Convert a PyTorch tensor to a library Tensor(descriptor, data). + """ + import torch + + ndim = tensor.ndimension() + shape = (ctypes.c_uint64 * ndim)(*tensor.shape) + strides = (ctypes.c_int64 * ndim)(*(tensor.stride())) + data_ptr = tensor.data_ptr() + # fmt: off + dt = ( + I8 if tensor.dtype == torch.int8 else + I16 if tensor.dtype == torch.int16 else + I32 if tensor.dtype == torch.int32 else + I64 if tensor.dtype == torch.int64 else + U8 if tensor.dtype == torch.uint8 else + F16 if tensor.dtype == torch.float16 else + BF16 if tensor.dtype == torch.bfloat16 else + F32 if tensor.dtype == torch.float32 else + F64 if tensor.dtype == torch.float64 else + # TODO: These following types may not be supported by older + # versions of PyTorch. 
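+        # (on such builds, accessing torch.uint16/uint32/uint64 raises AttributeError)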
+ U16 if tensor.dtype == torch.uint16 else + U32 if tensor.dtype == torch.uint32 else + U64 if tensor.dtype == torch.uint64 else + None + ) + # fmt: on + assert dt is not None + # Create TensorDecriptor + tensor_desc = infiniopTensorDescriptor_t() + lib.infiniopCreateTensorDescriptor( + ctypes.byref(tensor_desc), ndim, shape, strides, dt + ) + # Create Tensor + return CTensor(tensor_desc, data_ptr) + +def create_workspace(size, torch_device): + if size == 0: + return None + import torch + if (torch_device == 'maca'): + return torch.zeros(size=(size,), dtype=torch.uint8, device='cuda') + return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device) + +def create_handle(lib, device, id=0): + handle = infiniopHandle_t() + check_error(lib.infiniopCreateHandle(ctypes.byref(handle), device, id)) + return handle + + +def destroy_handle(lib, handle): + check_error(lib.infiniopDestroyHandle(handle)) + + +def rearrange_tensor(tensor, new_strides): + """ + Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides. + """ + import torch + + shape = tensor.shape + + new_size = [0] * len(shape) + left = 0 + right = 0 + for i in range(len(shape)): + if new_strides[i] > 0: + new_size[i] = (shape[i] - 1) * new_strides[i] + 1 + right += new_strides[i] * (shape[i] - 1) + else: # TODO: Support negative strides in the future + # new_size[i] = (shape[i] - 1) * (-new_strides[i]) + 1 + # left += new_strides[i] * (shape[i] - 1) + raise ValueError("Negative strides are not supported yet") + + # Create a new tensor with zeros + new_tensor = torch.zeros( + (right - left + 1,), dtype=tensor.dtype, device=tensor.device + ) + + # Generate indices for original tensor based on original strides + indices = [torch.arange(s) for s in shape] + mesh = torch.meshgrid(*indices, indexing="ij") + + # Flatten indices for linear indexing + linear_indices = [m.flatten() for m in mesh] + + # Calculate new positions based on new strides + new_positions = sum( + linear_indices[i] * new_strides[i] for i in range(len(shape)) + ).to(tensor.device) + offset = -left + new_positions += offset + + # Copy the original data to the new tensor + new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1)) + new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides)) + + return new_tensor diff --git a/src/devices/ascend/CMakeLists.txt b/src/devices/ascend/CMakeLists.txt new file mode 100644 index 00000000..8cc7f7f8 --- /dev/null +++ b/src/devices/ascend/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.16.0) + +# project information +project(Ascend_C) +set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory") +set(RUN_MODE "npu" CACHE STRING "run mode: npu") +set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR 
${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") +endif() + +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascend_kernels STATIC + ../../ops/swiglu/ascend/swiglu_kernel.cpp + ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp + ../../ops/random_sample/ascend/random_sample_kernel.cpp +) + diff --git a/src/devices/ascend/Makefile b/src/devices/ascend/Makefile new file mode 100644 index 00000000..7af26076 --- /dev/null +++ b/src/devices/ascend/Makefile @@ -0,0 +1,10 @@ +.PHONY: build clean + +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MKFILE_DIR := $(dir $(MKFILE_PATH)) + +build: + mkdir -p build && cd build && cmake .. && make -j8 + +clean: + rm -rf build diff --git a/src/devices/ascend/ascend_handle.cc b/src/devices/ascend/ascend_handle.cc new file mode 100644 index 00000000..84b31fd5 --- /dev/null +++ b/src/devices/ascend/ascend_handle.cc @@ -0,0 +1,23 @@ +#include "ascend_handle.h" + +infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id) { + uint32_t device_count; + aclrtGetDeviceCount(&device_count); + if (device_id >= static_cast(device_count)) { + return STATUS_BAD_DEVICE; + } + + auto ret = aclrtSetDevice(device_id); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret)); + + *handle_ptr = new AscendContext{DevAscendNpu, device_id}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr) { + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/ascend/ascend_handle.h b/src/devices/ascend/ascend_handle.h new file mode 100644 index 00000000..fbbeb824 --- /dev/null +++ b/src/devices/ascend/ascend_handle.h @@ -0,0 +1,23 @@ +#ifndef ASCEND_HANDLE_H +#define ASCEND_HANDLE_H + +#include "common_ascend.h" +#include "device.h" +#include "status.h" +#include +#include +#include +#include +#include + +struct AscendContext { + Device device; + int device_id; +}; +typedef struct AscendContext *AscendHandle_t; + +infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr); + +#endif diff --git a/src/devices/ascend/common_ascend.cc b/src/devices/ascend/common_ascend.cc new file mode 100644 index 00000000..fe988e5d --- /dev/null +++ b/src/devices/ascend/common_ascend.cc @@ -0,0 +1,145 @@ +#include "common_ascend.h" + +int64_t numElements(const int64_t *shape, int64_t num) { + int64_t numEle = 1; + for (int i = 0; i < num; i++) { + numEle *= shape[i]; + } + return numEle; +} + +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) { + *workspaceAddr = nullptr; + if (workspaceSize > 0) { + auto ret = aclrtMalloc(workspaceAddr, workspaceSize, + ACL_MEM_MALLOC_HUGE_FIRST); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtMalloc failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t freeWorkspace(void *workspaceAddr) { + if (workspaceAddr != nullptr) { + auto ret = aclrtFree(workspaceAddr); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + return STATUS_SUCCESS; +} + +aclDataType toAclDataType(DT dt) { + if (dt == I8) + return aclDataType::ACL_INT8; + else if (dt == I16) + return aclDataType::ACL_INT16; + else if (dt == I32) + return aclDataType::ACL_INT32; + else if (dt == I64) + return aclDataType::ACL_INT64; + else if (dt == U8) + return aclDataType::ACL_UINT8; + else if (dt == U16) + return aclDataType::ACL_UINT16; + else if (dt == U32) + return aclDataType::ACL_UINT32; + else if (dt == U64) + return aclDataType::ACL_UINT64; + else if (dt == F16) + return aclDataType::ACL_FLOAT16; + else if (dt == BF16) + return aclDataType::ACL_BF16; + else if (dt == F32) + return aclDataType::ACL_FLOAT; + else if (dt == F64) + return aclDataType::ACL_DOUBLE; + else + return aclDataType::ACL_DT_UNDEFINED; +} + + +const char *dataTypeToString(aclDataType dtype) { + switch (dtype) { + case ACL_DT_UNDEFINED: + return "ACL_DT_UNDEFINED"; + case ACL_FLOAT: + return "ACL_FLOAT"; + case ACL_FLOAT16: + return "ACL_FLOAT16"; + case ACL_INT8: + return "ACL_INT8"; + case ACL_INT32: + return "ACL_INT32"; + case ACL_UINT8: + return "ACL_UINT8"; + case ACL_INT16: + return "ACL_INT16"; + case ACL_UINT16: + return "ACL_UINT16"; + case ACL_UINT32: + return "ACL_UINT32"; + case ACL_INT64: + return "ACL_INT64"; + case ACL_UINT64: + return "ACL_UINT64"; + case ACL_DOUBLE: + return "ACL_DOUBLE"; + case ACL_BOOL: + return "ACL_BOOL"; + case ACL_STRING: + return "ACL_STRING"; + case ACL_COMPLEX64: + return "ACL_COMPLEX64"; + case ACL_COMPLEX128: + return "ACL_COMPLEX128"; + case ACL_BF16: + return "ACL_BF16"; + case ACL_INT4: + return "ACL_INT4"; + case ACL_UINT1: + return "ACL_UINT1"; + case ACL_COMPLEX32: + return "ACL_COMPLEX32"; + default: + return "UNKNOWN"; + } +} + +const char *formatToString(aclFormat format) { + switch (format) { + case ACL_FORMAT_UNDEFINED: + return "ACL_FORMAT_UNDEFINED"; + case ACL_FORMAT_NCHW: + return "ACL_FORMAT_NCHW"; + case ACL_FORMAT_NHWC: + return "ACL_FORMAT_NHWC"; + case ACL_FORMAT_ND: + return "ACL_FORMAT_ND"; + case ACL_FORMAT_NC1HWC0: + return "ACL_FORMAT_NC1HWC0"; + case ACL_FORMAT_FRACTAL_Z: + return "ACL_FORMAT_FRACTAL_Z"; + case ACL_FORMAT_NC1HWC0_C04: + return "ACL_FORMAT_NC1HWC0_C04"; + case ACL_FORMAT_HWCN: + return "ACL_FORMAT_HWCN"; + case ACL_FORMAT_NDHWC: + return "ACL_FORMAT_NDHWC"; + case ACL_FORMAT_FRACTAL_NZ: + return "ACL_FORMAT_FRACTAL_NZ"; + case ACL_FORMAT_NCDHW: + return "ACL_FORMAT_NCDHW"; + case ACL_FORMAT_NDC1HWC0: + return "ACL_FORMAT_NDC1HWC0"; + case ACL_FRACTAL_Z_3D: + return "ACL_FRACTAL_Z_3D"; + case ACL_FORMAT_NC: + return "ACL_FORMAT_NC"; + case ACL_FORMAT_NCL: + return "ACL_FORMAT_NCL"; + default: + return "UNKNOWN"; + } +} diff --git a/src/devices/ascend/common_ascend.h b/src/devices/ascend/common_ascend.h new file mode 100644 index 00000000..9b23fd35 --- /dev/null +++ b/src/devices/ascend/common_ascend.h @@ -0,0 +1,41 @@ +#ifndef __COMMON_ASCEND_H__ +#define __COMMON_ASCEND_H__ + +#include "operators.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define CHECK_RET(cond, return_expr) \ + do { \ + if (!(cond)) { \ + return_expr; \ + } \ + } while (0) + +#define 
LOG_PRINT(message, ...) \ + do { \ + printf(message, ##__VA_ARGS__); \ + } while (0) + +#ifdef __cplusplus +}; +#endif + +int64_t numElements(const int64_t *shape, int64_t num); +const char *dataTypeToString(aclDataType dtype); +const char *formatToString(aclFormat format); +infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize); +infiniopStatus_t freeWorkspace(void *workspaceAddr); +aclDataType toAclDataType(DT dt); + +#endif diff --git a/src/devices/ascend/tensor_aclnn.cc b/src/devices/ascend/tensor_aclnn.cc new file mode 100644 index 00000000..0a0fad74 --- /dev/null +++ b/src/devices/ascend/tensor_aclnn.cc @@ -0,0 +1,137 @@ +#include "tensor_aclnn.h" +#include "../../ops/utils.h" +#include + +infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides) { + if (shape.size() != strides.size()) { + return STATUS_BAD_PARAM; + } + this->ndim = shape.size(); + this->shape = std::vector(shape); + this->strides = std::vector(strides); + this->dataType = dtype; + + // Set format + // TODO: Support other format + aclFormat format = aclFormat::ACL_FORMAT_ND; + this->format = format; + + CHECK_STATUS(this->inferStorageShape(), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + + +/// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size. +/// We don't see why higher dimensional storage shape is ever needed. To change if necesary. +infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() { + auto index = std::max_element(this->strides.begin(), this->strides.end()); + uint64_t max_stride_index = std::distance(this->strides.begin(), index); + this->storageNdim = 1; + this->storageShape = std::vector({this->shape[max_stride_index] * this->strides[max_stride_index]}); + + return STATUS_SUCCESS; +} + +/// @brief Set aclnnTensorDescriptor from infiniopTensorDescriptor +/// @param y infiniopTensorDescriptor +/// @return infiniopStatus_t +infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // Cast shape type + auto shape = std::vector(ndim); + auto strides = std::vector(ndim); + for (uint64_t i = 0; i < ndim; ++i) { + shape[i] = static_cast(y->shape[i]); + strides[i] = y->strides[i]; + } + return setDescriptor(toAclDataType(y->dt), shape, strides); +} + +/// @brief Wrapper of aclCreateTensor. Create aclTensor. +/// See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha001/apiref/appdevgapi/aclcppdevg_03_0168.html +/// @param desc Alias of aclnnTensorDescriptor*. +/// @param data Data ptr on device global mem. +/// @param tensor Pointer of pointer of aclTensor. +/// @return +infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) { + if (this->t) { + return STATUS_SUCCESS; + } + this->t = aclCreateTensor(this->shape.data(), + this->ndim, + this->dataType, + this->strides.data(), + this->offset, + this->format, + this->storageShape.data(), + this->storageNdim, + data); + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnTensorDescriptor::destroyTensor() { + auto ret = aclDestroyTensor(this->t); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclDesctroyTensor failed, ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + t = nullptr; + + return STATUS_SUCCESS; +} + +aclnnTensorDescriptor::~aclnnTensorDescriptor() { + if (this->t) { + destroyTensor(); + } +} + +/// @brief TensorDescriptor's string info +/// @param desc Alias of aclnnTensorDescriptor*. 
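+/// @note The returned buffer is allocated with malloc(); the caller is responsible for free()ing it.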
+/// @return String of aclnnTensorDescriptor. +char *aclnnTensorDescriptor::toString() { + + // Assume bufferSize + size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40; + char *buffer = (char *) malloc(bufferSize); + if (!buffer) return NULL; + + // Write info into buffer + char *ptr = buffer; + ptr += sprintf(ptr, "ndim: %" PRId64 "\n", this->ndim); + + ptr += sprintf(ptr, "shape: ["); + for (uint64_t i = 0; i < this->ndim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->shape[i]); + if (i < this->ndim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "stride: ["); + for (uint64_t i = 0; i < this->ndim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->strides[i]); + if (i < this->ndim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "offset: %" PRId64 "\n", this->offset); + ptr += sprintf(ptr, "dataType: %s\n", dataTypeToString(this->dataType)); + ptr += sprintf(ptr, "format: %s\n", formatToString(this->format)); + + ptr += sprintf(ptr, "storageShape: ["); + for (int64_t i = 0; i < this->storageNdim; ++i) { + ptr += sprintf(ptr, "%" PRId64, this->storageShape[i]); + if (i < this->storageNdim - 1) { + ptr += sprintf(ptr, ", "); + } + } + ptr += sprintf(ptr, "]\n"); + + ptr += sprintf(ptr, "storageNdim: %" PRId64 "\n", this->storageNdim); + + return buffer; +} diff --git a/src/devices/ascend/tensor_aclnn.h b/src/devices/ascend/tensor_aclnn.h new file mode 100644 index 00000000..4aa25074 --- /dev/null +++ b/src/devices/ascend/tensor_aclnn.h @@ -0,0 +1,41 @@ +#ifndef __ACLNN_TENSOR__ +#define __ACLNN_TENSOR__ + +#include "./common_ascend.h" +#include "operators.h" +#include "tensor.h" +#include "tensor/tensor_descriptor.h" +#include +#include +#include +#include + +// Aclnn tensor descriptor, +// used to build aclTensor +struct aclnnTensorDescriptor { + uint64_t ndim; + std::vector shape; + std::vector strides; + int64_t offset; + aclDataType dataType; + aclFormat format; + std::vector storageShape; + int64_t storageNdim; + + aclTensor *t; + + // Transfer from infiniOp DT to aclDataType + infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector &shape, const std::vector &strides); + infiniopStatus_t inferStorageShape(); + // Convert form InfiniOpTensorDescriptor + infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc); + infiniopStatus_t createTensor(void *data = nullptr); + infiniopStatus_t destroyTensor(); + ~aclnnTensorDescriptor(); + + char *toString(); +}; + +typedef aclnnTensorDescriptor *aclnnTensorDescriptor_t; + +#endif diff --git a/src/devices/bang/bang_handle.cc b/src/devices/bang/bang_handle.cc new file mode 100644 index 00000000..1625181e --- /dev/null +++ b/src/devices/bang/bang_handle.cc @@ -0,0 +1,21 @@ +#include "bang_handle.h" + +infiniopStatus_t createBangHandle(BangHandle_t *handle_ptr, int device_id) { + unsigned int device_count; + cnrtGetDeviceCount(&device_count); + if (device_id >= static_cast(device_count)) { + return STATUS_BAD_DEVICE; + } + + auto pool = std::make_shared>(); + if (cnrtSetDevice(device_id) != cnrtSuccess){ + return STATUS_BAD_DEVICE; + } + cnnlHandle_t handle; + cnnlCreate(&handle); + pool->push(std::move(handle)); + + *handle_ptr = new BangContext{DevCambriconMlu, device_id, std::move(pool)}; + + return STATUS_SUCCESS; +} diff --git a/src/devices/bang/bang_handle.h b/src/devices/bang/bang_handle.h new file mode 100644 index 00000000..cc149678 --- /dev/null +++ b/src/devices/bang/bang_handle.h @@ -0,0 
+1,32 @@ +#ifndef BANG_HANDLE_H +#define BANG_HANDLE_H + +#include "../pool.h" +#include "cnnl.h" +#include "cnrt.h" +#include "device.h" +#include "status.h" +#include + +struct BangContext { + Device device; + int device_id; + std::shared_ptr> cnnl_handles; +}; +typedef struct BangContext *BangHandle_t; + +infiniopStatus_t createBangHandle(BangHandle_t *handle_ptr, int device_id); + +template +void use_cnnl(std::shared_ptr> &pool, int device_id, cnrtQueue_t queue, T const &f) { + auto handle = pool->pop(); + if (!handle) { + cnrtSetDevice(device_id); + cnnlCreate(&(*handle)); + } + cnnlSetQueue(*handle, (cnrtQueue_t) queue); + f(*handle); + pool->push(std::move(*handle)); +} + +#endif diff --git a/src/devices/bang/common_bang.h b/src/devices/bang/common_bang.h index 6be9bfc3..b855a41f 100644 --- a/src/devices/bang/common_bang.h +++ b/src/devices/bang/common_bang.h @@ -3,13 +3,14 @@ #include "cnnl.h" #include "tensor.h" +#include "../../ops/utils.h" #include const int NRAM_MAX_SIZE = 1024 * 256;//the maximum NRAM memory is 1024 * 768 const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024; // set cnnl tensor descriptor without strides11 -inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorLayout* layout) { +inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) { std::vector dims(layout->ndim); for (uint64_t i = 0; i < layout->ndim; i++) { dims[i] = static_cast(layout->shape[i]); @@ -19,7 +20,7 @@ inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorLayout* layou } // set cnnl tensor descriptor with strides -inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorLayout *layout) { +inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) { std::vector dim_size(layout->ndim), dim_stride(layout->ndim); for (uint64_t i = 0; i < layout->ndim; i++) { dim_size[i] = static_cast(layout->shape[i]); @@ -29,4 +30,26 @@ inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorLayout *lay dim_size.size(), dim_size.data(), dim_stride.data()); } -#endif // __COMMON_BANG_H__ +inline cnnlDataType_t cnnlDataTypeConvert(DT dataType) { + if (dtype_eq(dataType, F32)) { + return CNNL_DTYPE_FLOAT; + } else if (dtype_eq(dataType, F64)) { + return CNNL_DTYPE_DOUBLE; + } else if (dtype_eq(dataType, F16)) { + return CNNL_DTYPE_HALF; + } else if (dtype_eq(dataType, I8)) { + return CNNL_DTYPE_INT8; + } else if (dtype_eq(dataType, I32)) { + return CNNL_DTYPE_INT32; + } else if (dtype_eq(dataType, U8)) { + return CNNL_DTYPE_UINT8; + } else if (dtype_eq(dataType, BF16)) { + return CNNL_DTYPE_BFLOAT16; + } else if (dtype_eq(dataType, I64)) { + return CNNL_DTYPE_INT64; + } else { + return CNNL_DTYPE_INVALID; + } +} + +#endif// __COMMON_BANG_H__ diff --git a/src/devices/bang/handle_pool.cc b/src/devices/bang/handle_pool.cc deleted file mode 100644 index 4b712c1f..00000000 --- a/src/devices/bang/handle_pool.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include -#include "handle_pool.h" - -const Pool &get_cnnl_pool() { - int device_id; - cnrtGetDevice(&device_id); - static std::once_flag flag; - static std::vector> cnnl_pool; - std::call_once(flag, [&]() { - unsigned int device_count; - cnrtGetDeviceCount(&device_count); - for (auto i = 0; i < static_cast(device_count); i++) { - auto pool = Pool(); - cnnlHandle_t handle; - cnnlCreate(&handle); - pool.push(std::move(handle)); - cnnl_pool.emplace_back(std::move(pool)); - } - }); - return cnnl_pool[device_id]; -} diff --git a/src/devices/bang/handle_pool.h 
b/src/devices/bang/handle_pool.h deleted file mode 100644 index e30d8768..00000000 --- a/src/devices/bang/handle_pool.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __BANG_HANDLE_POOL_H__ -#define __BANG_HANDLE_POOL_H__ - -#include "cnnl.h" -#include "cnrt.h" -#include "../pool.h" - -const Pool &get_cnnl_pool(); - -template -void use_cnnl(cnrtQueue_t queue, T const &f) { - auto &pool = get_cnnl_pool(); - auto handle = pool.pop(); - if (!handle) { - cnnlCreate(&(*handle)); - } - cnnlSetQueue(*handle, (cnrtQueue_t) queue); - f(*handle); - pool.push(std::move(*handle)); -} - -#endif // __BANG_HANDLE_POOL_H__ diff --git a/src/devices/cpu/common_cpu.cc b/src/devices/cpu/common_cpu.cc index 13228dd4..7fb9e5d8 100644 --- a/src/devices/cpu/common_cpu.cc +++ b/src/devices/cpu/common_cpu.cc @@ -1,22 +1,97 @@ #include "common_cpu.h" -float f16_to_f32(uint16_t code) { - union { - uint32_t u32; - float f32; - } ans{0}; - ans.u32 = ((static_cast(code) << 16) & (1 << 31)) | - ((((code >> 10) & mask_low(5)) - 15 + 127) << 23) | - ((code & mask_low(10)) << 13); - return ans.f32; +float f16_to_f32(uint16_t h) { + uint32_t sign = (h & 0x8000) << 16; // Extract the sign bit + int32_t exponent = (h >> 10) & 0x1F;// Extract the exponent + uint32_t mantissa = h & 0x3FF; // Extract the mantissa (fraction part) + + if (exponent == 31) {// Special case for Inf and NaN + if (mantissa != 0) { + // NaN: Set float32 NaN + uint32_t f32 = sign | 0x7F800000 | (mantissa << 13); + return *(float *) &f32; + } else { + // Infinity + uint32_t f32 = sign | 0x7F800000; + return *(float *) &f32; + } + } else if (exponent == 0) {// Subnormal float16 or zero + if (mantissa == 0) { + // Zero (positive or negative) + uint32_t f32 = sign;// Just return signed zero + return *(float *) &f32; + } else { + // Subnormal: Convert to normalized float32 + exponent = -14; // Set exponent for subnormal numbers + while ((mantissa & 0x400) == 0) {// Normalize mantissa + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF;// Clear the leading 1 bit + uint32_t f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + return *(float *) &f32; + } + } else { + // Normalized float16 + uint32_t f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + return *(float *) &f32; + } } uint16_t f32_to_f16(float val) { - union { - float f32; - uint32_t u32; - } x{val}; - return (static_cast(x.u32 >> 16) & (1 << 15)) | - (((static_cast(x.u32 >> 23) - 127 + 15) & mask_low(5)) << 10) | - (static_cast(x.u32 >> 13) & mask_low(10)); + uint32_t f32 = *(uint32_t *) &val; // Read the bits of the float32 + uint16_t sign = (f32 >> 16) & 0x8000; // Extract the sign bit + int32_t exponent = ((f32 >> 23) & 0xFF) - 127;// Extract and de-bias the exponent + uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part) + + if (exponent >= 31) {// Special cases for Inf and NaN + // NaN + if (exponent == 128 && mantissa != 0) { + return sign | 0x7E00; + } + // Infinity + return sign | 0x7C00; + } else if (exponent >= -14) {// Normalized case + return sign | ((exponent + 15) << 10) | (mantissa >> 13); + } else if (exponent >= -24) { + mantissa |= 0x800000;// Add implicit leading 1 + mantissa >>= (-14 - exponent); + return sign | (mantissa >> 13); + } else { + // Too small for subnormal: return signed zero + return sign; + } +} + +uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + 
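+        // src_strides is expected to describe a contiguous (row-major) layout, so the
+        // division above recovers the coordinate along dimension i; the remainder kept
+        // below is the part of the flat index belonging to the lower dimensions.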
flat_index %= src_strides[i]; + } + return res; +} + +uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +uint64_t getPaddedSize(uint64_t ndim, uint64_t *shape, uint64_t const *pads) { + uint64_t total_size = 1; + for (size_t i = 0; i < ndim; ++i) { + total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]); + } + return total_size; +} + +void getPaddedShape(uint64_t ndim, uint64_t const *shape, uint64_t const *pads, uint64_t *padded_shape) { + memcpy(padded_shape, shape, ndim * sizeof(uint64_t)); + for (size_t i = 2; i < ndim; ++i) { + padded_shape[i] += 2 * pads[i - 2]; + } } diff --git a/src/devices/cpu/common_cpu.h b/src/devices/cpu/common_cpu.h index 20f1a2d8..c3139d69 100644 --- a/src/devices/cpu/common_cpu.h +++ b/src/devices/cpu/common_cpu.h @@ -3,6 +3,7 @@ #include #include +#include // return a mask with the specified number of low bits set to 1 constexpr static uint16_t mask_low(int bits) noexcept { @@ -15,4 +16,19 @@ float f16_to_f32(uint16_t code); // convert single-precision float to half-precision float uint16_t f32_to_f16(float val); -#endif // __COMMON_CPU_H__ +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides); + +// get the memory offset of the given element in a tensor given its flat index +uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides); + +/** + * get the total array size (element count) after applying padding for a + * ndim-ary tensor with the given shape + */ +uint64_t getPaddedSize(uint64_t ndim, uint64_t *shape, uint64_t const *pads); + +// calculate the padded shape and store the result in padded_shape +void getPaddedShape(uint64_t ndim, uint64_t const *shape, uint64_t const *pads, uint64_t *padded_shape); + +#endif// __COMMON_CPU_H__ diff --git a/src/devices/cpu/cpu_handle.cc b/src/devices/cpu/cpu_handle.cc new file mode 100644 index 00000000..fbbf09b7 --- /dev/null +++ b/src/devices/cpu/cpu_handle.cc @@ -0,0 +1,7 @@ +#include "device.h" +#include "cpu_handle.h" + +infiniopStatus_t createCpuHandle(CpuHandle_t* handle_ptr){ + *handle_ptr = new CpuContext{DevCpu}; + return STATUS_SUCCESS; +} diff --git a/src/devices/cpu/cpu_handle.h b/src/devices/cpu/cpu_handle.h new file mode 100644 index 00000000..1be72724 --- /dev/null +++ b/src/devices/cpu/cpu_handle.h @@ -0,0 +1,14 @@ +#ifndef CPU_HANDLE_H +#define CPU_HANDLE_H + +#include "device.h" +#include "status.h" + +struct CpuContext { + Device device; +}; +typedef struct CpuContext *CpuHandle_t; + +infiniopStatus_t createCpuHandle(CpuHandle_t *handle_ptr); + +#endif diff --git a/src/devices/cuda/common_cuda.h b/src/devices/cuda/common_cuda.h index a85e7994..d46d45c4 100644 --- a/src/devices/cuda/common_cuda.h +++ b/src/devices/cuda/common_cuda.h @@ -1,8 +1,92 @@ #ifndef __COMMON_CUDA_H__ #define __COMMON_CUDA_H__ +#ifdef ENABLE_SUGON_DCU +#define MAX_THREADS_PER_BLOCK 512 +#else #define MAX_THREADS_PER_BLOCK 1024 +#endif + #define MAX_WARP_PER_BLOCK 32 #define WARP_SIZE 32 -#endif // __COMMON_CUDA_H__ +#include + +#define checkCudaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != cudaSuccess) { \ + std::cerr << "CUDA error: " << 
cudaGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkCudaError(call) checkCudaErrorWithCode(call, STATUS_BAD_DEVICE) + +#define checkCudnnError(call) \ + do { \ + if (auto status = call; status != CUDNN_STATUS_SUCCESS) { \ + std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return STATUS_EXECUTION_FAILED; \ + } \ + } while (0) + +#include "data_type.h" +#include + +typedef struct DTCudnnMapping { + DT layout; + cudnnDataType_t cudnn_type; +} DTCudnnMapping; + +// DT cudnnDataType_t mapping table +const DTCudnnMapping dtMappings[] = { + {F16, CUDNN_DATA_HALF}, + {F32, CUDNN_DATA_FLOAT}, + {F64, CUDNN_DATA_DOUBLE}, + {BF16, CUDNN_DATA_BFLOAT16}, + {I8, CUDNN_DATA_INT8}, + {I32, CUDNN_DATA_INT32}, + {I64, CUDNN_DATA_INT64}, + {U8, CUDNN_DATA_UINT8}, +}; + +typedef struct DataLayoutMap { + int operator[](const DataLayout &layout) const { + for (const auto &mapping : dtMappings) { + if (mapping.layout == layout) { + return mapping.cudnn_type; + } + } + return -1; + } +} DTMap; + +constexpr DTMap dataTypeMap; + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif// __COMMON_CUDA_H__ diff --git a/src/devices/cuda/cuda_handle.cc b/src/devices/cuda/cuda_handle.cc new file mode 100644 index 00000000..7d7db662 --- /dev/null +++ b/src/devices/cuda/cuda_handle.cc @@ -0,0 +1,55 @@ +#include "cuda_handle.h" + +infiniopStatus_t createCudaHandle(CudaHandle_t *handle_ptr, int device_id) { + // Check if device_id is valid + int device_count; + cudaGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + // Create a new cublas handle pool + auto pool = std::make_shared>(); + if (cudaSetDevice(device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + cublasHandle_t handle; + cublasCreate(&handle); + pool->push(std::move(handle)); + + // create a cudnn handle pool + auto cudnn_pool = std::make_shared>(); + cudnnHandle_t cudnn_handle; + checkCudnnError(cudnnCreate(&cudnn_handle)); + cudnn_pool->push(std::move(cudnn_handle)); + + // set CUDA device property + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + + // set device compute capability numbers + int capability_major; + int capability_minor; + cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id); + cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id); + + *handle_ptr = new CudaContext{ + DevNvGpu, + device_id, + std::move(pool), + std::move(cudnn_pool), + std::move(prop), + capability_major, + capability_minor, + }; + + return 
STATUS_SUCCESS; +} + +infiniopStatus_t deleteCudaHandle(CudaHandle_t handle_ptr) { + handle_ptr->cublas_handles_t = nullptr; + handle_ptr->cudnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/cuda/cuda_handle.h b/src/devices/cuda/cuda_handle.h new file mode 100644 index 00000000..f935ed5f --- /dev/null +++ b/src/devices/cuda/cuda_handle.h @@ -0,0 +1,52 @@ +#ifndef CUDA_HANDLE_H +#define CUDA_HANDLE_H + +#include "../pool.h" +#include "common_cuda.h" +#include "device.h" +#include "status.h" +#include +#include +#include + +struct CudaContext { + Device device; + int device_id; + std::shared_ptr> cublas_handles_t; + std::shared_ptr> cudnn_handles_t; + cudaDeviceProp prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct CudaContext *CudaHandle_t; + +infiniopStatus_t createCudaHandle(CudaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteCudaHandle(CudaHandle_t handle_ptr); + +template +void use_cublas(std::shared_ptr> cublas_handles_t, int device_id, cudaStream_t stream, T const &f) { + auto handle = cublas_handles_t->pop(); + if (!handle) { + cudaSetDevice(device_id); + cublasCreate(&(*handle)); + } + cublasSetStream(*handle, (cudaStream_t) stream); + f(*handle); + cublas_handles_t->push(std::move(*handle)); +} + +template +cudnnStatus_t use_cudnn(std::shared_ptr> cudnn_handles_t, int device_id, cudaStream_t stream, T const &f) { + auto handle = cudnn_handles_t->pop(); + if (!handle) { + cudaSetDevice(device_id); + cudnnCreate(&(*handle)); + } + cudnnSetStream(*handle, stream); + cudnnStatus_t status = f(*handle); + cudnn_handles_t->push(std::move(*handle)); + return status; +} + +#endif diff --git a/src/devices/cuda/handle_pool.cc b/src/devices/cuda/handle_pool.cc deleted file mode 100644 index fe89340c..00000000 --- a/src/devices/cuda/handle_pool.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "handle_pool.h" -#include -#include - -const Pool &get_cublas_pool() { - int device_id; - cudaGetDevice(&device_id); - static std::once_flag flag; - static std::vector> cublas_pool; - std::call_once(flag, [&]() { - int device_count; - cudaGetDeviceCount(&device_count); - for (int i = 0; i < device_count; i++) { - auto pool = Pool(); - cublasHandle_t handle; - cublasCreate(&handle); - pool.push(std::move(handle)); - cublas_pool.emplace_back(std::move(pool)); - } - }); - return cublas_pool[device_id]; -} diff --git a/src/devices/cuda/handle_pool.h b/src/devices/cuda/handle_pool.h deleted file mode 100644 index 4165902b..00000000 --- a/src/devices/cuda/handle_pool.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __CUDA_HANDLE_POOL_H__ -#define __CUDA_HANDLE_POOL_H__ - -#include -#include "../pool.h" - -const Pool &get_cublas_pool(); - -template -void use_cublas(cudaStream_t stream, T const &f) { - auto &pool = get_cublas_pool(); - auto handle = pool.pop(); - if (!handle) { - cublasCreate(&(*handle)); - } - cublasSetStream(*handle, (cudaStream_t) stream); - f(*handle); - pool.push(std::move(*handle)); -} - -#endif // __CUDA_HANDLE_POOL_H__ diff --git a/src/devices/handle.cc b/src/devices/handle.cc new file mode 100644 index 00000000..6b7f54a8 --- /dev/null +++ b/src/devices/handle.cc @@ -0,0 +1,101 @@ +#include "handle/handle_export.h" +#ifdef ENABLE_CPU +#include "./cpu/cpu_handle.h" +#endif +#ifdef ENABLE_NV_GPU +#include "./cuda/cuda_handle.h" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "./bang/bang_handle.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "./ascend/ascend_handle.h" +#endif +#ifdef ENABLE_METAX_GPU 
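+/*
+ * Illustrative usage of the handle API defined below (a sketch only; it assumes a
+ * CPU-enabled build and abbreviates error handling):
+ *
+ *     infiniopHandle_t handle = NULL;
+ *     if (infiniopCreateHandle(&handle, DevCpu, 0) == STATUS_SUCCESS) {
+ *         // ... create operator descriptors and run computations with `handle` ...
+ *         infiniopDestroyHandle(handle);
+ *     }
+ */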
+#include "./maca/maca_handle.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "./musa/musa_handle.h" +#endif + + +__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id) { + if (handle_ptr == nullptr) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + if (device_id < 0) { + return STATUS_BAD_PARAM; + } + + switch (device) { +#ifdef ENABLE_CPU + case DevCpu: + return createCpuHandle((CpuHandle_t *) handle_ptr); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return createCudaHandle((CudaHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return createBangHandle((BangHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return createAscendHandle((AscendHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return createMacaHandle((MacaHandle_t *) handle_ptr, device_id); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return createMusaHandle((MusaHandle_t *) handle_ptr, device_id); + } +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + delete handle; + return STATUS_SUCCESS; +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return deleteCudaHandle((CudaHandle_t) handle); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + delete (BangHandle_t) handle; + return STATUS_SUCCESS; + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return deleteAscendHandle((AscendHandle_t) handle); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return deleteMacaHandle((MacaHandle_t) handle); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + deleteMusaHandle((MusaHandle_t) handle); + return STATUS_SUCCESS; + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/devices/maca/common_maca.h b/src/devices/maca/common_maca.h new file mode 100644 index 00000000..9fa82e78 --- /dev/null +++ b/src/devices/maca/common_maca.h @@ -0,0 +1,87 @@ +#ifndef __COMMON_MACA_H__ +#define __COMMON_MACA_H__ + +#define MAX_THREADS_PER_BLOCK 1024 +#define MAX_WARP_PER_BLOCK 32 +#define WARP_SIZE 32 + +#include + +#define checkMacaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != hcSuccess) { \ + std::cerr << "MACA error: " << hcGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkMacaError(call) checkMacaErrorWithCode(call, STATUS_BAD_DEVICE) + +#define checkMcdnnError(call) \ + do { \ + if (auto status = call; status != HCDNN_STATUS_SUCCESS) { \ + std::cerr << "MCDNN error: " << hcdnnGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return STATUS_EXECUTION_FAILED; \ + } \ + } while (0) + +#include "data_type.h" +#include + +typedef struct DTMcdnnMapping { + DT layout; + hcdnnDataType_t hcdnn_type; +} DTMcdnnMapping; + +// DT mcdnnDataType_t mapping table +const DTMcdnnMapping dtMappings[] = { + {F16, HCDNN_DATA_HALF}, + {F32, HCDNN_DATA_FLOAT}, + {F64, HCDNN_DATA_DOUBLE}, + {BF16, HCDNN_DATA_BFLOAT16}, + {I8, HCDNN_DATA_INT8}, + {I32, HCDNN_DATA_INT32}, + {I64, HCDNN_DATA_INT64}, + {U8, HCDNN_DATA_UINT8}, +}; + +typedef struct DataLayoutMap { + int operator[](const DataLayout &layout) const { + for 
(const auto &mapping : dtMappings) { + if (mapping.layout == layout) { + return mapping.hcdnn_type; + } + } + return -1; + } +} DTMap; + +constexpr DTMap dataTypeMap; + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif// __COMMON_MACA_H__ diff --git a/src/devices/maca/maca_handle.cc b/src/devices/maca/maca_handle.cc new file mode 100644 index 00000000..9b1b52b8 --- /dev/null +++ b/src/devices/maca/maca_handle.cc @@ -0,0 +1,55 @@ +#include "maca_handle.h" + +infiniopStatus_t createMacaHandle(MacaHandle_t *handle_ptr, int device_id) { + // Check if device_id is valid + int device_count; + hcGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + // Create a new mcblas handle pool + auto pool = std::make_shared>(); + if (hcSetDevice(device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + hcblasHandle_t handle; + hcblasCreate(&handle); + pool->push(std::move(handle)); + + // create a mcdnn handle pool + auto mcdnn_pool = std::make_shared>(); + hcdnnHandle_t mcdnn_handle; + checkMcdnnError(hcdnnCreate(&mcdnn_handle)); + mcdnn_pool->push(std::move(mcdnn_handle)); + + // set MACA device property + hcDeviceProp_t prop; + hcGetDeviceProperties(&prop, device_id); + + // set device compute capability numbers + int capability_major; + int capability_minor; + hcDeviceGetAttribute(&capability_major, hcDeviceAttributeComputeCapabilityMajor, device_id); + hcDeviceGetAttribute(&capability_minor, hcDeviceAttributeComputeCapabilityMinor, device_id); + + *handle_ptr = new MacaContext{ + DevMetaxGpu, + device_id, + std::move(pool), + std::move(mcdnn_pool), + std::move(prop), + capability_major, + capability_minor, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteMacaHandle(MacaHandle_t handle_ptr) { + handle_ptr->mcblas_handles_t = nullptr; + handle_ptr->mcdnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/maca/maca_handle.h b/src/devices/maca/maca_handle.h new file mode 100644 index 00000000..41485099 --- /dev/null +++ b/src/devices/maca/maca_handle.h @@ -0,0 +1,52 @@ +#ifndef MACA_HANDLE_H +#define MACA_HANDLE_H + +#include "../pool.h" +#include "common_maca.h" +#include "device.h" +#include "status.h" +#include +#include +#include + +struct MacaContext { + Device device; + int device_id; + std::shared_ptr> mcblas_handles_t; + std::shared_ptr> mcdnn_handles_t; + hcDeviceProp_t prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct MacaContext *MacaHandle_t; + +infiniopStatus_t createMacaHandle(MacaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteMacaHandle(MacaHandle_t handle_ptr); + +template +void use_mcblas(std::shared_ptr> mcblas_handles_t, int device_id, hcStream_t stream, T const &f) { + auto handle = 
mcblas_handles_t->pop(); + if (!handle) { + hcSetDevice(device_id); + hcblasCreate(&(*handle)); + } + hcblasSetStream(*handle, (hcStream_t) stream); + f(*handle); + mcblas_handles_t->push(std::move(*handle)); +} + +template +hcdnnStatus_t use_mcdnn(std::shared_ptr> mcdnn_handles_t, int device_id, hcStream_t stream, T const &f) { + auto handle = mcdnn_handles_t->pop(); + if (!handle) { + hcSetDevice(device_id); + hcdnnCreate(&(*handle)); + } + hcdnnSetStream(*handle, stream); + hcdnnStatus_t status = f(*handle); + mcdnn_handles_t->push(std::move(*handle)); + return status; +} + +#endif diff --git a/src/devices/musa/common_musa.h b/src/devices/musa/common_musa.h new file mode 100644 index 00000000..c42b5197 --- /dev/null +++ b/src/devices/musa/common_musa.h @@ -0,0 +1,77 @@ +#ifndef __COMMON_MUSA_H__ +#define __COMMON_MUSA_H__ + +#define MAX_THREADS_PER_BLOCK 1024 +#define MAX_WARP_PER_BLOCK 32 +#define WARP_SIZE 32 + +#include +#include "data_type.h" +#include +#include +#include + +enum class Type { + QINT4, + QINT8, + INT8, + INT16, + INT32, + INT64, + UINT8, + UINT16, + UINT32, + UINT64, + HALF, + BFLOAT16, + FLOAT, + DOUBLE, + BOOL, +}; + +enum class Format { + UNKNOWN, + NCW, + NWC, + NCHW, + NHWC, + HWCN, + NCDHW, + NDHWC, + DHWCN, +}; + +#define checkMusaErrorWithCode(call, errorCode) \ + do { \ + if (auto status = call; status != musaSuccess) { \ + std::cerr << "MUSA error: " << musaGetErrorString(status) \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return errorCode; \ + } \ + } while (0) + +#define checkMusaError(call) checkMusaErrorWithCode(call, STATUS_BAD_DEVICE) + +// get the corresponding offset in the destination given the flat index of the source (for element mapping in shape broadcast) +inline __device__ uint64_t getDstOffset(uint64_t flat_index, uint64_t ndim, int64_t const *src_strides, int64_t const *dst_strides) { + uint64_t res = 0; + for (uint64_t i = 0; i < ndim; ++i) { + res += flat_index / src_strides[i] * dst_strides[i]; + flat_index %= src_strides[i]; + } + return res; +} + +// get the memory offset of the given element in a tensor given its flat index +inline __device__ uint64_t getOffset(uint64_t flat_index, uint64_t ndim, uint64_t const *shape, int64_t const *strides) { + uint64_t res = 0; + for (long i = ndim - 1; i >= 0; --i) { + res += (flat_index % shape[i]) * strides[i]; + flat_index /= shape[i]; + } + return res; +} + +#endif // __COMMON_MUSA_H__ diff --git a/src/devices/musa/musa_handle.cc b/src/devices/musa/musa_handle.cc new file mode 100644 index 00000000..3a7f8174 --- /dev/null +++ b/src/devices/musa/musa_handle.cc @@ -0,0 +1,57 @@ +#include "musa_handle.h" +#include + +infiniopStatus_t createMusaHandle(MusaHandle_t* handle_ptr, int device_id) { + int device_count; + musaGetDeviceCount(&device_count); + if (device_id >= device_count) { + return STATUS_BAD_DEVICE; + } + + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != device_id && musaSetDevice(device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + + // set MUSA device property + musaDeviceProp prop; + musaGetDeviceProperties(&prop, device_id); + + // create a mublas handle pool + auto mublas_pool = std::make_shared>(); + mublasHandle_t *mublas_handle = new mublasHandle_t; + mublasCreate(mublas_handle); + mublas_pool->push(mublas_handle); + + // create a mudnn handle pool + auto mudnn_pool = std::make_shared>(); + musa::dnn::Handle *mudnn_handle = new 
musa::dnn::Handle; + mudnn_pool->push(mudnn_handle); + + int capability_major; + int capability_minor; + musaDeviceGetAttribute(&capability_major, musaDevAttrComputeCapabilityMajor, device_id); + musaDeviceGetAttribute(&capability_minor, musaDevAttrComputeCapabilityMinor, device_id); + + *handle_ptr = new MusaContext{ + DevMthreadsGpu, + device_id, + std::move(mublas_pool), + std::move(mudnn_pool), + std::move(prop), + capability_major, + capability_minor,}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t deleteMusaHandle(MusaHandle_t handle_ptr) { + handle_ptr->mublas_handles_t = nullptr; + handle_ptr->mudnn_handles_t = nullptr; + delete handle_ptr; + + return STATUS_SUCCESS; +} diff --git a/src/devices/musa/musa_handle.h b/src/devices/musa/musa_handle.h new file mode 100644 index 00000000..6de2c2d3 --- /dev/null +++ b/src/devices/musa/musa_handle.h @@ -0,0 +1,64 @@ +#ifndef __MUSA_HANDLE_H__ +#define __MUSA_HANDLE_H__ + +#include "pool.h" +#include "device.h" +#include "status.h" +#include "ops/matmul/matmul.h" +#include +#include +#include +#include +#include + +struct MusaContext { + Device device; + int device_id; + std::shared_ptr> mublas_handles_t; + std::shared_ptr> mudnn_handles_t; + musaDeviceProp prop; + int compute_capability_major; + int compute_capability_minor; +}; +typedef struct MusaContext *MusaHandle_t; + +infiniopStatus_t createMusaHandle(MusaHandle_t *handle_ptr, int device_id); + +infiniopStatus_t deleteMusaHandle(MusaHandle_t handle_ptr); + +template +void use_mublas(std::shared_ptr> mublas_handles_t, int device_id, MUstream stream, T const &f) { + mublasHandle_t *handle = mublas_handles_t->pop(); + if (!handle) { + int current_device; + musaGetDevice(¤t_device); + if (current_device != device_id) { + musaSetDevice(device_id); + } + mublasHandle_t *handle = new mublasHandle_t; + mublasCreate(handle); + } + mublasSetStream(*handle, (MUstream) stream); + f(*handle); + mublas_handles_t->push(handle); +} + +template +void use_mudnn(std::shared_ptr> mudnn_handles_t, int device_id, musaStream_t stream, T const &f) { + musa::dnn::Handle* handle = mudnn_handles_t->pop(); + if (!handle) { + int current_device; + musaGetDevice(¤t_device); + if (current_device != device_id) { + musaSetDevice(device_id); + } + handle = new musa::dnn::Handle(device_id); + // mudnnCreate(handle); + } + // mudnnSetStream(*handle, (MUstream) stream); + handle->SetStream(stream); + f(handle); + mudnn_handles_t->push(handle); +} + +#endif // __MUSA_HANDLE_H__ diff --git a/src/devices/musa/pool.h b/src/devices/musa/pool.h new file mode 100644 index 00000000..2cfb5e32 --- /dev/null +++ b/src/devices/musa/pool.h @@ -0,0 +1,50 @@ +#ifndef __POOL_MUSA_H__ +#define __POOL_MUSA_H__ + +#include +#include +#include + +template +class Pool { +public: + Pool() : _head(nullptr) {} + + Pool(const Pool &) = delete; + + Pool(Pool &&pool) noexcept : _head(pool._head.exchange(nullptr)) {} + + ~Pool() { + while (this->pop()) {} + } + + void push(T *val) const { + Node *new_node = new Node(val); + new_node->next = _head.load(); + while (!_head.compare_exchange_weak(new_node->next, new_node)); + } + + T* pop() const { + Node *top = _head.load(); + Node *new_head = nullptr; + do { + if (!top) { + return nullptr; + } + new_head = top->next; + } while (!_head.compare_exchange_weak(top, new_head)); + return top->data; + } + +private: + template + struct Node { + U *data; + Node *next; + Node(U *data) : data(data), next(nullptr) {} + }; + + mutable std::atomic *> _head; +}; + +#endif // __POOL_MUSA_H__ diff --git 
a/src/main.c b/src/main.c deleted file mode 100644 index 721159e4..00000000 --- a/src/main.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "ops/rotary_embedding/rotary_embedding.h" -#include "tensor.h" -#include - -void test_rms_norm() { - void *descriptor = createRotaryEmbeddingDescriptor(DevNvGpu, NULL); - struct TensorLayout l; - Tensor t = {&l, NULL}; - Tensor t2 = {&l, NULL}; - rotaryEmbedding(descriptor, t, t2, 10000.0, NULL); - destroyRotaryEmbeddingDescriptor(descriptor); -} - -int main(int argc, char **argv) { - test_rms_norm(); - return 0; -} diff --git a/src/ops/add/cpu/add_cpu.cc b/src/ops/add/cpu/add_cpu.cc new file mode 100644 index 00000000..ce859b1a --- /dev/null +++ b/src/ops/add/cpu/add_cpu.cc @@ -0,0 +1,104 @@ +#include "add_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, + AddCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + uint64_t *a_strides = new uint64_t[ndim]; + uint64_t *b_strides = new uint64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; + } + + uint64_t *c_indices = new uint64_t[ndim]; + std::fill(c_indices, c_indices + ndim, 0); + uint64_t *c_shape = new uint64_t[ndim]; + std::copy(c->shape, c->shape + ndim, c_shape); + + *desc_ptr = new AddCpuDescriptor{ + DevCpu, + c->dt, + ndim, + c_data_size, + c_shape, + a_strides, + b_strides, + c_indices, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc) { + delete[] desc->c_shape; + delete[] desc->a_strides; + delete[] desc->b_strides; + delete[] desc->c_indices; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t add_cpu(AddCpuDescriptor_t desc, void *c, void const *a, void const *b) { + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + const auto &indices = desc->c_indices; + + for (uint64_t i = 0; i < desc->c_data_size; ++i, incrementOne(indices, desc->c_shape, desc->ndim)) { + auto a_index = compactToFlat(indices, desc->a_strides, desc->ndim); + auto b_index = compactToFlat(indices, desc->b_strides, desc->ndim); + if constexpr (std::is_same::value) { + c_[i] = f32_to_f16(f16_to_f32(a_[a_index]) + f16_to_f32(b_[b_index])); + } else { + c_[i] = a_[a_index] + b_[b_index]; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + if (desc->dtype == F16) { + return add_cpu(desc, c, a, b); + } + if (desc->dtype == F32) { + return add_cpu(desc, c, a, b); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/cpu/add_cpu.h b/src/ops/add/cpu/add_cpu.h new file mode 100644 index 00000000..42e62435 --- /dev/null +++ b/src/ops/add/cpu/add_cpu.h @@ -0,0 +1,33 @@ +#ifndef __CPU_ADD_H__ +#define __CPU_ADD_H__ + +#include "operators.h" +#include +#include + +struct AddCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t c_data_size; + uint64_t const *c_shape; + uint64_t const *a_strides; + uint64_t const *b_strides; + uint64_t *c_indices; +}; + +typedef struct AddCpuDescriptor *AddCpuDescriptor_t; + +infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, + AddCpuDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc); + +#endif diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc new file mode 100644 index 00000000..eebcf4be --- /dev/null +++ b/src/ops/add/cuda/add.cc @@ -0,0 +1,81 @@ +#include "add.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle, + AddCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; 
+ } + } + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + int64_t *a_strides = new int64_t[ndim]; + int64_t *b_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + } + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, handle->device_id); + + int64_t *a_strides_d, *b_strides_d, *c_strides_d; + checkCudaErrorWithCode(cudaMalloc((void **) &a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void **) &c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new AddCudaDescriptor{ + DevNvGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(prop.maxGridSize[0]), + a_strides_d, + b_strides_d, + c_strides_d, + broadcasted, + }; + + delete[] a_strides; + delete[] b_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/add/cuda/add.cu b/src/ops/add/cuda/add.cu new file mode 100644 index 00000000..9d9aefcb --- /dev/null +++ b/src/ops/add/cuda/add.cu @@ -0,0 +1,116 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "add.cuh" + +/** + * @brief A templated vector struct that supports element-wise addition on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. 
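+ *
+ * When T and TComp are the same type the sum is a plain per-element addition; otherwise
+ * each T is reinterpreted as a nested vecN of sizeof(T) / sizeof(TComp) TComp values,
+ * so one aligned access of T carries several TComp elements.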
+ */ +template +struct vecN { + T data[N]; + + __device__ __forceinline__ vecN operator+(const vecN &other) const { + vecN result; + + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result.data[i] = data[i] + other.data[i]; + } else { + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + auto data_ = reinterpret_cast *>(result.data); + data_[i] = std::move(reinterpret_cast const *>(data)[i] + + reinterpret_cast const *>(other.data)[i]); + } + } + + return result; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void add( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *a_strides, + const int64_t *b_strides, + const int64_t *c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = a_[a_idx] + b_[b_idx]; + } + return; + } + c[idx] = a[idx] + b[idx]; + } +} + +template +void _add_nv_gpu(AddCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + add<<>>( + c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); + } +} + +template +infiniopStatus_t add_nv_gpu(AddCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return add_nv_gpu, half>(desc, c, a, b, stream, 8); + } + if (desc->dtype == F32) { + return add_nv_gpu, float>(desc, c, a, b, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/cuda/add.cuh b/src/ops/add/cuda/add.cuh new file mode 100644 index 00000000..03a181eb --- /dev/null +++ b/src/ops/add/cuda/add.cuh @@ -0,0 +1,37 @@ +#ifndef __CUDA_ADD_H__ +#define __CUDA_ADD_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct AddCudaDescriptor { + Device device; + DT dtype; + 
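+    // a_strides, b_strides and c_strides are device pointers allocated in
+    // cudaCreateAddDescriptor and released by cudaDestroyAddDescriptor;
+    // `broadcasted` records whether a or b must be broadcast to c's shape.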
int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; +}; + +typedef struct AddCudaDescriptor *AddCudaDescriptor_t; + +infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t, + AddCudaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc); + +#endif diff --git a/src/ops/add/musa/add_musa.cc b/src/ops/add/musa/add_musa.cc new file mode 100644 index 00000000..8c4475fe --- /dev/null +++ b/src/ops/add/musa/add_musa.cc @@ -0,0 +1,81 @@ +#include "add_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateAddDescriptor(MusaHandle_t handle, + AddMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (c->dt != F16 && c->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + bool broadcasted = false; + if (ndim != a->ndim || ndim != b->ndim) { + broadcasted = true; + } else { + for (uint64_t i = 0; i < ndim; ++i) { + if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) { + broadcasted = true; + break; + } + } + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + int64_t *a_strides = new int64_t[ndim]; + int64_t *b_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 
0 : b->strides[i + b->ndim - ndim]; + } + + musaDeviceProp prop; + musaGetDeviceProperties(&prop, handle->device_id); + + int64_t *a_strides_d, *b_strides_d, *c_strides_d; + checkMusaErrorWithCode(musaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new AddMusaDescriptor{ + DevMthreadsGpu, + c->dt, + handle->device_id, + ndim, + c_data_size, + static_cast(prop.maxGridSize[0]), + a_strides_d, + b_strides_d, + c_strides_d, + broadcasted, + }; + + delete[] a_strides; + delete[] b_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyAddDescriptor(AddMusaDescriptor_t desc) { + checkMusaErrorWithCode(musaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/add/musa/add_musa.h b/src/ops/add/musa/add_musa.h new file mode 100644 index 00000000..c492c45c --- /dev/null +++ b/src/ops/add/musa/add_musa.h @@ -0,0 +1,37 @@ +#ifndef __MUSA_ADD_H__ +#define __MUSA_ADD_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct AddMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t c_data_size; + uint64_t max_grid_size; + int64_t const *a_strides; + int64_t const *b_strides; + int64_t const *c_strides; + bool broadcasted; +}; + +typedef struct AddMusaDescriptor *AddMusaDescriptor_t; + +infiniopStatus_t musaCreateAddDescriptor(MusaHandle_t, + AddMusaDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t musaAdd(AddMusaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t musaDestroyAddDescriptor(AddMusaDescriptor_t desc); + +#endif diff --git a/src/ops/add/musa/add_musa.mu b/src/ops/add/musa/add_musa.mu new file mode 100644 index 00000000..0766aa7c --- /dev/null +++ b/src/ops/add/musa/add_musa.mu @@ -0,0 +1,116 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "add_musa.h" + +/** + * @brief A templated vector struct that supports element-wise addition on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + + __device__ __forceinline__ vecN operator+(const vecN &other) const { + vecN result; + + for (int i = 0; i < N; ++i) { + if constexpr (std::is_same::value) { + result.data[i] = data[i] + other.data[i]; + } else { + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + auto data_ = reinterpret_cast *>(result.data); + data_[i] = std::move(reinterpret_cast const *>(data)[i] + + reinterpret_cast const *>(other.data)[i]); + } + } + + return result; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void add( + Tdata *c, + const Tdata *a, + const Tdata *b, + const int64_t *a_strides, + const int64_t *b_strides, + const int64_t *c_strides, + uint64_t data_size, + uint64_t ndim, + uint64_t offset, + bool broadcasted, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if (broadcasted) { + idx *= pack_size; + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); +#pragma unroll + for (size_t i = 0; i < pack_size; ++i) { + auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides); + auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides); + c_[idx + i] = a_[a_idx] + b_[b_idx]; + } + return; + } + c[idx] = a[idx] + b[idx]; + } +} + +template +void _add_nv_gpu(AddMusaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + add<<>>( + c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size); + } +} + +template +infiniopStatus_t add_mt_gpu(AddMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) { + const auto data_size = desc->c_data_size / pack_size; + const auto a_vec = reinterpret_cast(a); + const auto b_vec = reinterpret_cast(b); + const auto c_vec = reinterpret_cast(c); + _add_nv_gpu(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream); + + const auto remainder = desc->c_data_size % pack_size; + const auto a_ = reinterpret_cast(a); + const auto b_ = reinterpret_cast(b); + const auto c_ = reinterpret_cast(c); + _add_nv_gpu(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaAdd(AddMusaDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return add_mt_gpu, half>(desc, c, a, b, stream, 8); + } + if (desc->dtype == F32) { + return add_mt_gpu, float>(desc, c, a, b, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc new file mode 100644 index 00000000..de97dc94 --- /dev/null +++ b/src/ops/add/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/add/add.h" + +#ifdef ENABLE_CPU +#include "cpu/add_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/add.cuh" +#endif +#ifdef 
ENABLE_MTHREADS_GPU +#include "musa/add_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateAddDescriptor( + infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateAddDescriptor(handle, (AddCpuDescriptor_t *) desc_ptr, c, a, b); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateAddDescriptor((CudaHandle_t) handle, (AddCudaDescriptor_t *) desc_ptr, c, a, b); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateAddDescriptor((MusaHandle_t) handle, (AddMusaDescriptor_t *) desc_ptr, c, a, b); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuAdd((AddCpuDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaAdd((AddCudaDescriptor_t) desc, c, a, b, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaAdd((AddMusaDescriptor_t) desc, c, a, b, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyAddDescriptor((AddCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyAddDescriptor((AddCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyAddDescriptor((AddMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/attention/operator.cc b/src/ops/attention/operator.cc new file mode 100644 index 00000000..fc3ee9b3 --- /dev/null +++ b/src/ops/attention/operator.cc @@ -0,0 +1,320 @@ +#include "../utils.h" +#include "ops/attention/attention.h" +#include "ops/causal_softmax/causal_softmax.h" +#include "ops/matmul/matmul.h" +#include "ops/rearrange/rearrange.h" +#include "tensor/tensor_descriptor.h" +#include + +struct _AttentionDescriptor { + Device device; + infiniopRearrangeDescriptor_t rearrange_desc_k; + infiniopRearrangeDescriptor_t rearrange_desc_v; + infiniopRearrangeDescriptor_t rearrange_desc_q; + infiniopRearrangeDescriptor_t rearrange_desc_out; + infiniopMatmulDescriptor_t matmul_desc1; + infiniopMatmulDescriptor_t matmul_desc2; + infiniopCausalSoftmaxDescriptor_t softmax_desc; + uint64_t workspace_size; + uint64_t rearranged_q_size; + uint64_t matmul1_workspace_size; + uint64_t matmul1_tensor_size; + uint64_t matmul2_workspace_size; + uint64_t matmul2_tensor_size; + uint64_t softmax_workspace_size; + uint64_t k_cache_offset; + uint64_t v_cache_offset; +}; + +typedef struct _AttentionDescriptor *_AttentionDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle, + infiniopAttentionDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t q_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, + infiniopTensorDescriptor_t k_cache_desc, + infiniopTensorDescriptor_t v_cache_desc, + uint64_t pos) { + if (out_desc->ndim != 3 || q_desc->ndim != 3 || k_desc->ndim != 
3 || + v_desc->ndim != 3 || k_cache_desc->ndim != 3 || v_cache_desc->ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (!is_contiguous(out_desc, 0, 2)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (q_desc->strides[2] != 1 || k_desc->strides[2] != 1 || v_desc->strides[2] != 1 || + k_cache_desc->strides[2] != 1 || v_cache_desc->strides[2] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t n_q_head = q_desc->shape[0]; + uint64_t seq_len = q_desc->shape[1]; + uint64_t head_dim = q_desc->shape[2]; + uint64_t hidden_size = n_q_head * head_dim; + uint64_t n_kv_head = k_desc->shape[0]; + uint64_t total_seq_len = seq_len + pos; + uint64_t n_group = n_q_head / n_kv_head; + + // out: [seq_len, n_q_head, head_dim] + if (out_desc->shape[0] != seq_len || out_desc->shape[1] != n_q_head || out_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // k: [n_kv_head, seq_len, head_dim] + if (k_desc->shape[0] != n_kv_head || k_desc->shape[1] != seq_len || k_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // v: [n_kv_head, seq_len, head_dim] + if (v_desc->shape[0] != n_kv_head || v_desc->shape[1] != seq_len || v_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // k_cache: [n_kv_head, _, head_dim] + if (k_cache_desc->shape[0] != n_kv_head || k_cache_desc->shape[1] < total_seq_len || k_cache_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // v_cache: [n_kv_head, _, head_dim] + if (v_cache_desc->shape[0] != n_kv_head || v_cache_desc->shape[1] < total_seq_len || v_cache_desc->shape[2] != head_dim) { + return STATUS_BAD_PARAM; + } + + // Rearrange k into k_cache + infiniopTensorDescriptor_t dst_k_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&dst_k_desc, 3, k_desc->shape, k_cache_desc->strides, k_cache_desc->dt), STATUS_SUCCESS); + infiniopRearrangeDescriptor_t rearrange_desc_k; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_k, dst_k_desc, k_desc), STATUS_SUCCESS); + + // Rearrange v into v_cache + infiniopTensorDescriptor_t dst_v_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&dst_v_desc, 3, v_desc->shape, v_cache_desc->strides, v_cache_desc->dt), STATUS_SUCCESS); + infiniopRearrangeDescriptor_t rearrange_desc_v; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_v, dst_v_desc, v_desc), STATUS_SUCCESS); + + // Rearrange q into contiguous + infiniopRearrangeDescriptor_t rearrange_desc_q = nullptr; + uint64_t rearranged_q_size = 0; + if (!is_contiguous(q_desc, 0, 1)) { + infiniopTensorDescriptor_t rearranged_q_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&rearranged_q_desc, 3, q_desc->shape, nullptr, q_desc->dt), STATUS_SUCCESS); + rearranged_q_size = get_byte_size(rearranged_q_desc); + rearrange_desc_q = new RearrangeDescriptor; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_q, rearranged_q_desc, q_desc), STATUS_SUCCESS); + } + + // Matmul1: q * full_k + // q: [n_q_head, seq_len, head_dim] -> [n_kv_head, n_group *seq_len, head_dim] + infiniopTensorDescriptor_t reshaped_q_desc; + CHECK_STATUS(infiniopCreateTensorDescriptor(&reshaped_q_desc, 3, q_desc->shape, nullptr, q_desc->dt), STATUS_SUCCESS); + reshaped_q_desc = dim_split(reshaped_q_desc, 0, {n_kv_head, n_group}); + if (!reshaped_q_desc) { + return STATUS_BAD_PARAM; + } + reshaped_q_desc = dim_merge(reshaped_q_desc, 1, 2); + if (!reshaped_q_desc) { + return STATUS_BAD_PARAM; + } + // full_k: [n_kv_head, head_dim, total_seq_len] + infiniopTensorDescriptor_t full_k_desc; + uint64_t 
full_k_shape[3] = {n_kv_head, total_seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&full_k_desc, 3, full_k_shape, k_cache_desc->strides, k_cache_desc->dt), STATUS_SUCCESS); + full_k_desc = permute(full_k_desc, {0, 2, 1}); + if (!full_k_desc) { + return STATUS_BAD_PARAM; + } + // qk: [n_kv_head, n_group * seq_len, total_seq_len] + infiniopTensorDescriptor_t qk_desc; + uint64_t qk_shape[3] = {n_kv_head, n_group * seq_len, total_seq_len}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&qk_desc, 3, qk_shape, nullptr, q_desc->dt), STATUS_SUCCESS); + // matmul1_desc + // qk_alpha + float qk_alpha = 1 / sqrt(head_dim); + infiniopMatmulDescriptor_t matmul1_desc; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul1_desc, qk_desc, qk_alpha, reshaped_q_desc, full_k_desc, 0.0), STATUS_SUCCESS); + // matmul1 workspace size + uint64_t matmul1_workspace_size; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul1_desc, &matmul1_workspace_size), STATUS_SUCCESS); + // matmul1 tensor size + uint64_t matmul1_tensor_size = get_byte_size(qk_desc); + + // CausalSoftmax: softmax(qk) + // qk: [n_kv_head, n_group * seq_len, total_seq_len] -> [n_q_head, seq_len, total_seq_len] + qk_desc = dim_split(qk_desc, 1, {n_group, seq_len}); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + qk_desc = dim_merge(qk_desc, 0, 1); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + infiniopCausalSoftmaxDescriptor_t softmax_desc; + CHECK_STATUS(infiniopCreateCausalSoftmaxDescriptor(handle, &softmax_desc, qk_desc), STATUS_SUCCESS); + // softmax workspace size + uint64_t softmax_workspace_size; + CHECK_STATUS(infiniopGetCausalSoftmaxWorkspaceSize(softmax_desc, &softmax_workspace_size), STATUS_SUCCESS); + + // Matmul2: softmax(qk) * full_v + // softmax(qk): [n_q_head, seq_len, total_seq_len] -> [n_kv_head, n_group * seq_len, total_seq_len] + // full_v: [n_kv_head, total_seq_len, head_dim] + qk_desc = dim_split(qk_desc, 0, {n_kv_head, n_group}); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + qk_desc = dim_merge(qk_desc, 1, 2); + if (!qk_desc) { + return STATUS_BAD_PARAM; + } + infiniopTensorDescriptor_t full_v_desc; + uint64_t full_v_shape[3] = {n_kv_head, total_seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&full_v_desc, 3, full_v_shape, v_cache_desc->strides, v_cache_desc->dt), STATUS_SUCCESS); + // temp_out: [n_kv_head, n_group * seq_len, head_dim] + infiniopTensorDescriptor_t temp_out_desc; + uint64_t temp_out_shape[3] = {n_kv_head, n_group * seq_len, head_dim}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&temp_out_desc, 3, temp_out_shape, nullptr, q_desc->dt), STATUS_SUCCESS); + // matmul2_desc + infiniopMatmulDescriptor_t matmul2_desc; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul2_desc, temp_out_desc, 1.0, qk_desc, full_v_desc, 0.0), STATUS_SUCCESS); + // matmul2 workspace size + uint64_t matmul2_workspace_size; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul2_desc, &matmul2_workspace_size), STATUS_SUCCESS); + // matmul2 tensor size + uint64_t matmul2_tensor_size = get_byte_size(temp_out_desc); + + // Rearrange temp_out into out + // out: [seq_len, n_q_head, head_dim] + // temp_out: [n_kv_head, n_group * seq_len, head_dim] -> [n_q_head, seq_len, head_dim] -> [seq_len, n_q_head, head_dim] + temp_out_desc = dim_split(temp_out_desc, 1, {n_group, seq_len}); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + temp_out_desc = dim_merge(temp_out_desc, 0, 1); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + temp_out_desc = permute(temp_out_desc, {1, 0, 
2}); + if (!temp_out_desc) { + return STATUS_BAD_PARAM; + } + infiniopRearrangeDescriptor_t rearrange_desc_out; + CHECK_STATUS(infiniopCreateRearrangeDescriptor(handle, &rearrange_desc_out, out_desc, temp_out_desc), STATUS_SUCCESS); + + // workspace size + uint64_t workspace_size = rearranged_q_size + std::max(std::max(matmul1_workspace_size + matmul1_tensor_size, + matmul1_tensor_size + softmax_workspace_size), + matmul1_tensor_size + matmul2_workspace_size + matmul2_tensor_size); + + // k_cache_offset + uint64_t k_cache_offset = 0; + if (pos > 0) { + k_cache_offset = pos * get_byte_strides(k_cache_desc)[1]; + } + + // v_cache_offset + uint64_t v_cache_offset = 0; + if (pos > 0) { + v_cache_offset = pos * get_byte_strides(v_cache_desc)[1]; + } + + // create attention descriptor + *(_AttentionDescriptor_t *) desc_ptr = new _AttentionDescriptor{ + handle->device, + rearrange_desc_k, + rearrange_desc_v, + rearrange_desc_q, + rearrange_desc_out, + matmul1_desc, + matmul2_desc, + softmax_desc, + workspace_size, + rearranged_q_size, + matmul1_workspace_size, + matmul1_tensor_size, + matmul2_workspace_size, + matmul2_tensor_size, + softmax_workspace_size, + k_cache_offset, + v_cache_offset, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, uint64_t *size) { + *size = ((_AttentionDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *out, + void const *q, + void const *k, + void const *v, + void *k_cache, + void *v_cache, + void *stream) { + auto _desc = (_AttentionDescriptor_t) desc; + void *_workspace = workspace; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + // concat k and v to k_cache and v_cache + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_k, + (char *) k_cache + _desc->k_cache_offset, k, stream), + STATUS_SUCCESS); + + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_v, + (char *) v_cache + _desc->v_cache_offset, v, stream), + STATUS_SUCCESS); + + // rearrange q into contiguous + void const *_q = q; + if (_desc->rearrange_desc_q) { + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_q, (char *) _workspace, q, stream), STATUS_SUCCESS); + _q = _workspace; + _workspace = (char *) _workspace + _desc->rearranged_q_size; + } + + // matmul1: q * full_k + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc1, + (char *) _workspace + _desc->matmul1_tensor_size, _desc->workspace_size - _desc->matmul1_tensor_size, + _workspace, _q, k_cache, stream), + STATUS_SUCCESS); + // softmax(qk) + CHECK_STATUS(infiniopCausalSoftmax(_desc->softmax_desc, + (char *) _workspace + _desc->matmul1_tensor_size, _desc->workspace_size - _desc->matmul1_tensor_size, + _workspace, stream), + STATUS_SUCCESS); + // matmul2: softmax(qk) * full_v + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc2, + (char *) _workspace + _desc->matmul1_tensor_size + _desc->matmul2_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size - _desc->matmul2_tensor_size, + (char *) _workspace + _desc->matmul1_tensor_size, _workspace, v_cache, stream), + STATUS_SUCCESS); + // rearrange out + CHECK_STATUS(infiniopRearrange(_desc->rearrange_desc_out, out, (char *) _workspace + _desc->matmul1_tensor_size, stream), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t 
infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc) { + if (((_AttentionDescriptor_t) desc)->rearrange_desc_q) { + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_q), STATUS_SUCCESS); + } + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_k), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_v), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyRearrangeDescriptor(((_AttentionDescriptor_t) desc)->rearrange_desc_out), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_AttentionDescriptor_t) desc)->matmul_desc1), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_AttentionDescriptor_t) desc)->matmul_desc2), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyCausalSoftmaxDescriptor(((_AttentionDescriptor_t) desc)->softmax_desc), STATUS_SUCCESS); + delete (_AttentionDescriptor_t) desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/avg_pool/operator.cc b/src/ops/avg_pool/operator.cc new file mode 100644 index 00000000..29c1a332 --- /dev/null +++ b/src/ops/avg_pool/operator.cc @@ -0,0 +1,54 @@ +#include "../pooling/pooling.h" +#include "../utils.h" +#include "ops/avg_pool/avg_pool.h" + +struct _AvgPoolDescriptor { + Device device; + infiniopPoolingDescriptor_t pooling_desc; + uint64_t workspace_size; +}; + +typedef struct _AvgPoolDescriptor *_AvgPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n) { + infiniopPoolingDescriptor_t pooling_desc; + CHECK_STATUS(infiniopCreatePoolingDescriptor(handle, &pooling_desc, y, x, kernel_shape, pads, strides, n, 1), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetPoolingWorkspaceSize(pooling_desc, &workspace_size), STATUS_SUCCESS); + + *(_AvgPoolDescriptor_t *) desc_ptr = new _AvgPoolDescriptor{ + handle->device, + pooling_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, uint64_t *size) { + *size = ((_AvgPoolDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_AvgPoolDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopPooling(_desc->pooling_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyPoolingDescriptor(((_AvgPoolDescriptor_t) desc)->pooling_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc new file mode 100644 index 00000000..26ed34c1 --- /dev/null +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.cc @@ -0,0 +1,187 @@ +#include "causal_softmax_aclnn.h" +#include "../../utils.h" + +CausalSoftmaxAclnnDescriptor::CausalSoftmaxAclnnDescriptor(Device _device) { + device = _device; + 
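+    // The constructor only pre-allocates the aclnn tensor descriptors and zero-initializes
+    // the executor, workspace size and mask buffer; aclnnCreateCausalSoftmaxDescriptor
+    // fills them in afterwards.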
device_id = 0; + aDesc = new aclnnTensorDescriptor(); + maskDesc = new aclnnTensorDescriptor(); + outDesc = new aclnnTensorDescriptor(); + executor = nullptr; + workspaceSize = 0; + maskAddr = nullptr; +} + +infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, + CausalSoftmaxAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->ndim >= 4) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (!is_contiguous(y, 0, y->ndim - 1)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + // Construct CausalSoftmaxAclnnDescriptor + *desc_ptr = new CausalSoftmaxAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + // Set value from infiniopTensorDescriptor + auto &aDesc = (*desc_ptr)->aDesc; + auto &outDesc = (*desc_ptr)->outDesc; + + uint64_t ndim = y->ndim; + uint64_t *shape = y->shape; + int64_t *strides = y->strides; + int64_t total_seq_len = static_cast(shape[ndim - 1]); + int64_t seq_len = static_cast(shape[ndim - 2]); + + if (total_seq_len < seq_len) { + return STATUS_BAD_TENSOR_SHAPE; + } + + // Change input shape and stride + auto aclnn_shape = std::vector(4); + auto aclnn_strides = std::vector(4); + for (uint64_t i = 0; i < ndim; ++i) { + aclnn_shape[4 - i - 1] = shape[ndim - i - 1]; + aclnn_strides[4 - i - 1] = strides[ndim - i - 1]; + } + // Add padding to input shape and stride if ndim < 4 + for (uint64_t i = 0; i < 4 - ndim; ++i) { + aclnn_shape[i] = 1; + aclnn_strides[i] = aclnn_shape[i + 1] * aclnn_strides[i + 1]; + } + + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); + CHECK_STATUS(outDesc->setDescriptor(toAclDataType(y->dt), aclnn_shape, aclnn_strides), STATUS_SUCCESS); + + // Set mask Desc + auto &maskDesc = (*desc_ptr)->maskDesc; + auto mask_shape = std::vector(3); + + mask_shape[2] = total_seq_len; + mask_shape[1] = seq_len; + if (ndim == 2) { + mask_shape[0] = 1; + } else { + mask_shape[0] = static_cast(shape[0]); + } + auto mask_strides = std::vector{total_seq_len * seq_len, total_seq_len, 1}; + + CHECK_STATUS(maskDesc->setDescriptor(toAclDataType(y->dt), mask_shape, mask_strides), STATUS_SUCCESS); + + // Create aclTensor + CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(maskDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(outDesc->createTensor(), STATUS_SUCCESS); + + // Get Tensor + aclTensor *ta = aDesc->t; + aclTensor *tmask = maskDesc->t; + aclTensor *tout = outDesc->t; + + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + auto ret = aclnnMaskedSoftmaxWithRelPosBiasGetWorkspaceSize(ta, + nullptr, + tmask, + 1.0, 0, + tout, + &workspaceSize, + &executor); + aclSetAclOpExecutorRepeatable(executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnMaskedSoftmaxWithRelPosBiasGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + // Fill upgrade matrix + uint16_t mask_matrix[maskDesc->shape[0]][maskDesc->shape[1]][maskDesc->shape[2]]; + auto &dims = maskDesc->shape; + auto ele_size = aclDataTypeSize(maskDesc->dataType); + + // float neg_inf = -100000000; + for (int i = 0; i < dims[0]; ++i) { + for (int m = 0; m < dims[1]; ++m) { + for (int n = 0; n < dims[2]; ++n) { + if (n - m > dims[2] - dims[1]) { + // 0xF939 = -10240 half + mask_matrix[i][m][n] = 0xF880; + } else { + mask_matrix[i][m][n] = 0; + } + } + } + } + + // malloc mask space + auto &maskAddr = (*desc_ptr)->maskAddr; + auto mask_size = numElements(maskDesc->shape.data(), maskDesc->ndim) * ele_size; + CHECK_STATUS(mallocWorkspace(&maskAddr, mask_size), STATUS_SUCCESS); + + // copy mask matrix to device mem + ret = aclrtMemcpy(maskAddr, + mask_size, + mask_matrix, + mask_size, + ACL_MEMCPY_HOST_TO_DEVICE); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetCausalSoftmaxWorkspaceSize(CausalSoftmaxAclnnDescriptor_t desc, uint64_t *size) { + + *size = desc->workspaceSize; + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnCausalSoftmax(CausalSoftmaxAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + auto &aDesc = desc->aDesc; + auto &maskDesc = desc->maskDesc; + auto &outDesc = desc->outDesc; + + + // Get aclTensor pt + aclTensor *ta = aDesc->t; + aclTensor *tmask = maskDesc->t; + aclTensor *tout = outDesc->t; + + auto &executor = desc->executor; + auto &workspaceSize = desc->workspaceSize; + auto &maskAddr = desc->maskAddr; + + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + AclSetTensorAddr(executor, 0, ta, data); + AclSetTensorAddr(executor, 2, tmask, maskAddr); + AclSetTensorAddr(executor, 3, tout, data); + + auto ret = aclnnMaskedSoftmaxWithRelPosBias(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnMaskedSoftmaxWithRelPosBias failed. 
ERROR: %d\n", ret)); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescriptor_t desc) { + delete desc->aDesc; + delete desc->maskDesc; + delete desc->outDesc; + aclDestroyAclOpExecutor(desc->executor); + CHECK_STATUS(freeWorkspace(desc->maskAddr), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h new file mode 100644 index 00000000..f6b6d320 --- /dev/null +++ b/src/ops/causal_softmax/ascend/causal_softmax_aclnn.h @@ -0,0 +1,38 @@ +#ifndef __ACLNN_CAUSAL_SOFTMAX_H__ +#define __ACLNN_CAUSAL_SOFTMAX_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "operators.h" +#include +#include +#include + +struct CausalSoftmaxAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclnnTensorDescriptor_t aDesc, maskDesc, outDesc; + uint64_t workspaceSize; + void *maskAddr; + + CausalSoftmaxAclnnDescriptor(Device device); +}; + +typedef CausalSoftmaxAclnnDescriptor *CausalSoftmaxAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateCausalSoftmaxDescriptor(AscendHandle_t handle, + CausalSoftmaxAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t aclnnGetCausalSoftmaxWorkspaceSize(CausalSoftmaxAclnnDescriptor_t desc, uint64_t *size); + +infiniopStatus_t aclnnCausalSoftmax(CausalSoftmaxAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t aclnnDestroyCausalSoftmaxDescriptor(CausalSoftmaxAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.cc b/src/ops/causal_softmax/bang/causal_softmax_bang.cc new file mode 100644 index 00000000..cc9b6d37 --- /dev/null +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.cc @@ -0,0 +1,50 @@ +#include "causal_softmax_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->shape[y->ndim - 1] < y->shape[y->ndim - 2]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + int ndim = y->ndim; + int *stride = new int[ndim]; + int *shape = new int[ndim]; + + int n = 1; + for (int i = 0; i < ndim; i++) { + stride[i] = static_cast(y->strides[i]); + shape[i] = static_cast(y->shape[i]); + if (i < ndim - 1) { + n *= shape[i]; + } + } + + *desc_ptr = new CausalSoftmaxBangDescriptor{ + handle->device, + handle->device_id, + y->dt, + ndim, + stride, + shape, + n}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size) { + if (desc->ndim > 3) { + *size = desc->ndim * sizeof(int) * 2; + } else { + *size = 0; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyCausalSoftmaxDescriptor(CausalSoftmaxBangDescriptor_t desc) { + delete[] desc->stride; + delete[] desc->shape; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.h b/src/ops/causal_softmax/bang/causal_softmax_bang.h index e7a33a5f..c9e09921 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.h +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.h @@ -1,11 +1,35 @@ #ifndef __BANG_CAUSAL_SOFTMAX_H__ #define __BANG_CAUSAL_SOFTMAX_H__ +#include "../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" 
#include "operators.h" -void causal_softmax_bang_f16(Tensor y, void *stream); +struct CausalSoftmaxBangDescriptor { + Device device; + int device_id; + DT dtype; + int ndim; + int *stride; + int *shape; + int n; +}; -#endif// __BANG_CAUSAL_SOFTMAX_H__ +typedef struct CausalSoftmaxBangDescriptor *CausalSoftmaxBangDescriptor_t; +infiniopStatus_t bangCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t bangDestroyCausalSoftmaxDescriptor(CausalSoftmaxBangDescriptor_t desc); + + +#endif diff --git a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu index 10304324..12b3e610 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_bang.mlu +++ b/src/ops/causal_softmax/bang/causal_softmax_bang.mlu @@ -1,221 +1,212 @@ +#include "../../../devices/bang/common_bang.h" #include "bang.h" #include "bang_device_functions.h" -#include "cnrt.h" #include "causal_softmax_bang.h" -#include "../../../devices/bang/common_bang.h" +#include "cnrt.h" + const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 -__nram__ char nram_buffer[NRAM_MAX_SIZE]; -template -__mlu_device__ void causal_softmaxKernel(T *destination, T *source, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_device__ void causal_softmaxKernel(T *destination, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { int indd = 0; int indi = i; - int lastI = indi%shape[ndim - 2]; + int lastI = indi % shape[ndim - 2]; for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; + indd += (indi % shape[j]) * strideDest[j]; indi /= shape[j]; } - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); 
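+                    // Online-softmax rescaling: destSum was accumulated with the previous running
+                    // max (destOldMax), so it is multiplied by exp(destOldMax - destNewMax) below,
+                    // using exp(x - destNewMax) == exp(x - destOldMax) * exp(destOldMax - destNewMax).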
- - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } __bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - + for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; + indd += (indi % shape[j]) * strideDest[j]; indi /= shape[j]; } __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % shape[ndim - 2]; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, dimS); __bang_write_value(destSum, dimS, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); @@ -224,33 +215,31 @@ __mlu_device__ void causal_softmaxKernel(T *destination, T *source, int *strideS __bang_write_zero(src, dimS); __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSum[0]保存的就是当前maxNum长度数据的数值和 T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -__mlu_global__ void causal_softmaxUnion1(T *destination, T *source, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { +__mlu_global__ void causal_softmaxUnion1(T *destination, int *strideDest, int *shape, int othersize, int dimsize, int dimS, int mask, int ndim) { - causal_softmaxKernel(destination, source, strideSrc, strideDest, shape, othersize, dimsize, dimS, mask, ndim); + causal_softmaxKernel(destination, strideDest, shape, othersize, dimsize, dimS, mask, ndim); } + template -void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { +void causal_softmax(cnrtQueue_t queue, void *destination, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); + int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -261,7 +250,7 @@ void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *s if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -270,218 +259,205 @@ void causal_softmax(cnrtQueue_t queue, void *destination, int *strideSrc, int *s k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxUnion1<<>>(y_, x_, strideSrc, strideDest, shape, othersize, dimsize, dimS, mask, ndim); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxUnion1<<>>(y_, strideDest, shape, othersize, dimsize, dimS, mask, ndim); + cnrtQueueSync(queue); } -void causal_softmax_fp16(cnrtQueue_t queue, 
void *destination, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int mask, int ndim) { - causal_softmax(queue, destination, strideSrc, strideDest, shape, othersize, dimsize, mask, ndim); -} -template -__mlu_global__ void causal_softmaxDim_2(T *destination, T *source, int strideS_f, int strideD_f, int othersize, int dimsize, int dimS, int mask){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); + +template +__mlu_global__ void causal_softmaxDim_2(T *destination, int strideD_f, int othersize, int dimsize, int dimS, int mask) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - int lastI = indi%othersize; - inds += (indi % othersize) * strideS_f; + int lastI = indi % othersize; + indd += (indi % othersize) * strideD_f; - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, 
wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } 
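+                    // Positions beyond (mask + 1 + lastI) are causally masked: src is cleared and
+                    // zeros are written back for the remaining chunks of the row, so the masked
+                    // logits end up with probability 0.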
__bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - - inds += (indi % othersize) * strideS_f; + + indd += (indi % othersize) * strideD_f; __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % othersize; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, dimS); __bang_write_value(destSum, dimS, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); @@ -490,28 +466,24 @@ __mlu_global__ void causal_softmaxDim_2(T *destination, T *source, int strideS_f __bang_write_zero(src, dimS); __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSum[0]保存的就是当前maxNum长度数据的数值和 T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_f, int strideD_f, int othersize, int dimsize, int mask) { +void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideD_f, int othersize, int dimsize, int mask) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -522,7 +494,7 @@ void 
causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_ if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -531,250 +503,237 @@ void causal_softmaxUnionDim_2(cnrtQueue_t queue, void *destination, int strideS_ k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxDim_2<<>>(y_, x_, strideS_f, strideD_f, othersize, dimsize, dimS, mask); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxDim_2<<>>(y_, strideD_f, othersize, dimsize, dimS, mask); + cnrtQueueSync(queue); } -template -__mlu_global__ void causal_softmaxDim_3(T *destination, T *source, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, int mask){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); + +template +__mlu_global__ void causal_softmaxDim_3(T *destination, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, int mask) { + + const int maxNum = SRC_MAX_SIZE / sizeof(T); int wSize = 128 / sizeof(T); __nram__ T srcMax[2]; int startDim = othersize / middle; - if(dimsize > maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *destSum = src + maxNum;//[maxNum] + if (dimsize > maxNum) { + T *src = (T *) nram_buffer; //[maxNum] + T *destSum = src + maxNum; //[maxNum] T *destSumFinal = destSum + maxNum;//[wSize] - T *tmp = destSumFinal + wSize;//[maxNum] - + T *tmp = destSumFinal + wSize; //[maxNum] + T destOldMax; T destNewMax; - + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - int lastI = indi%middle; - inds += (indi % middle) * strideS_m; + int lastI = indi % middle; + indd += (indi % middle) * strideD_m; indi /= middle; - inds += (indi % startDim) * strideS_f; + indd += (indi % startDim) * strideD_f; - - if(mask + 1 + lastI < maxNum){ - __bang_write_value(src, maxNum, -INFINITY);//提前设置负无穷 - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从source读取对应数据 - __bang_argmax(srcMax, src, maxNum);//获取最大值 + + if (mask + 1 + lastI < maxNum) { + __bang_write_value(src, maxNum, -INFINITY); //提前设置负无穷 + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//从destination读取对应数据 + __bang_argmax(srcMax, src, maxNum); //获取最大值 __bang_write_value(destSum, maxNum, srcMax[0]); __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//destSum前面(mask + 1 + lastI)为src,后面部分为最大值 - __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum);//destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 - __bang_active_exp_less_0(destSum, destSum, maxNum);//destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 - __bang_write_zero(src, maxNum);//重新设置src全部为0 + __bang_sub_scalar(destSum, destSum, srcMax[0], maxNum); //destSum前面(mask + 1 + lastI)为(src - M),后面部分为0 + __bang_active_exp_less_0(destSum, destSum, maxNum); //destSum前面(mask + 1 + lastI)为exp(src - M),后面部分为1 + __bang_write_zero(src, maxNum); //重新设置src全部为0 __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//src前面(mask + 1 + lastI)为exp(src - M),后面部分为0 - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = 
segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ + + } else { __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } T globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd, src, maxNum * sizeof(T), NRAM2GDRAM); __bang_write_zero(src, maxNum); - for(int s = 1; s < repeat; s++){ + for (int s = 1; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - } - else{ + } else { int newRemain = (mask + 1 + lastI) % maxNum; int nR = (mask + 1 + lastI - newRemain) / maxNum; - + __bang_write_zero(destSum, maxNum); __bang_write_zero(destSumFinal, wSize); - + destOldMax = -INFINITY; destNewMax = -INFINITY; - for(int s = 0; s < nR; s++){ - - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for (int s = 0; s < nR; s++) { + + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - - if(s > 0){ + + if (s > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, src, maxNum); - + destOldMax = destNewMax; } - - if(newRemain){ + + if (newRemain) { //__bang_write_value(src, maxNum, -INFINITY); - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); - - if(destNewMax < srcMax[0]){ + + if (destNewMax < srcMax[0]) { destNewMax = srcMax[0]; } - + __bang_write_value(tmp, maxNum, destNewMax); __memcpy(tmp, src, newRemain * sizeof(T), NRAM2NRAM); - + __bang_sub_scalar(tmp, tmp, destNewMax, maxNum); __bang_active_exp_less_0(tmp, tmp, maxNum); - - if(nR > 0){ + + if (nR > 0) { __bang_mul_scalar(destSum, destSum, exp(destOldMax - destNewMax), maxNum); } __bang_add(destSum, destSum, tmp, maxNum); - + destOldMax = destNewMax; } - - if(maxNum >= wSize){ + + if (maxNum >= wSize) { int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } __bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - - } - else{ - + + } else { + __memcpy(destSumFinal, destSum, maxNum * sizeof(T), NRAM2NRAM); __bang_reduce_sum(destSumFinal, destSumFinal, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - } - + T globalSumInv; - if(newRemain){ + if (newRemain) { globalSumInv = 1.0 / (destSumFinal[0] - (maxNum - newRemain));//下面开始指数变换,写回GDRAM - - } - else{ + + } else { globalSumInv = 1.0 / 
destSumFinal[0];//下面开始指数变换,写回GDRAM - } - - for(int s = 0; s < nR; s++){ - __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + + for (int s = 0; s < nR; s++) { + __memcpy(src, destination + indd + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } __bang_write_zero(src, maxNum); - for(int s = nR; s < repeat; s++){ + for (int s = nR; s < repeat; s++) { __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } - if(remain){ + if (remain) { __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } - - if(newRemain){ - - __memcpy(src, source + inds + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); - + + if (newRemain) { + + __memcpy(src, destination + indd + nR * maxNum, newRemain * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, destNewMax, maxNum); __bang_active_exp_less_0(src, src, maxNum); - __bang_mul_scalar(src, src, globalSumInv, maxNum); - + __bang_mul_scalar(src, src, globalSumInv, maxNum); + __memcpy(destination + indd + nR * maxNum, src, newRemain * sizeof(T), NRAM2GDRAM); } - } } - } - else{ - T *src = (T *)nram_buffer;//[dimS] - T *destSum = src + dimS;//[dimS] + } else { + T *src = (T *) nram_buffer; //[dimS] + T *destSum = src + dimS; //[dimS] T *destSumFinal = destSum + dimS;//[wSize] - + int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; int step = (taskId < remainT ? stepHard : stepEasy); int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; + + for (int i = indStart; i < indStart + step; i++) { + int indd = 0; int indi = i; - - inds += (indi % middle) * strideS_m; + + indd += (indi % middle) * strideD_m; indi /= middle; - inds += (indi % startDim) * strideS_f; + indd += (indi % startDim) * strideD_f; __bang_write_value(src, dimS, -INFINITY); __bang_write_zero(destSumFinal, wSize); int lastI = i % middle; - __memcpy(src, source + inds, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM); + __memcpy(src, destination + indd, (mask + 1 + lastI) * sizeof(T), GDRAM2NRAM);//长度为dimsize的向量,只考虑前面mask + 1 + lastI部分的softmax __bang_argmax(srcMax, src, dimS); - __bang_write_value(destSum, dimS, srcMax[0]); - __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); - __bang_sub_scalar(destSum, destSum, srcMax[0], dimS); - __bang_active_exp_less_0(destSum, destSum, dimS); - __bang_write_zero(src, dimS); - __memcpy(src, destSum, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM); + __bang_write_zero(destSum, dimS); + __memcpy(destSum, src, (mask + 1 + lastI) * sizeof(T), NRAM2NRAM);//初始化destSum为0,前面mask + 1 + lastI部分元素和src保持一致 + __bang_sub_scalar(destSum, destSum, srcMax[0], mask + 1 + lastI);//前面mask + 1 + lastI元素减去最大值M,后面的元素还是0 + __bang_active_exp_less_0(destSum, destSum, mask + 1 + lastI);//前面mask + 1 + lastI元素做指数变换,后面的元素还是0 + __memcpy(src, destSum, dimS * sizeof(T), NRAM2NRAM); int segNum = dimS / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ + for (int strip = segNum / 2; strip > 0; strip = strip / 2) { + for (int j = 0; j < strip; j++) { __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - 
__bang_reduce_sum(destSumFinal, destSum, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - T globalSumInv = 1.0 / (destSumFinal[0] - (dimS - (mask + 1 + lastI)));//下面开始指数变换,写回GDRAM + __bang_reduce_sum(destSumFinal, destSum, wSize); //此时destSumFinal[0]存储的是前面mask + 1 + lastI的sum + T globalSumInv = 1.0 / destSumFinal[0]; __bang_mul_scalar(src, src, globalSumInv, dimS); - - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } } } + template -void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int mask) { +void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int mask) { int wSize = 128 / sizeof(T); auto y_ = reinterpret_cast(destination); - T *x_; - cnrtMalloc((void**)&x_, othersize * dimsize * sizeof(T)); - cnrtMemcpy(x_, y_, othersize * dimsize * sizeof(T), cnrtMemcpyDevToDev); + int dimS; float mi = log2(dimsize); if (floor(mi) == mi) { @@ -785,7 +744,7 @@ void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_ if (dimS < wSize) { dimS = wSize; } - + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -794,61 +753,48 @@ void causal_softmaxUnionDim_3(cnrtQueue_t queue, void *destination, int strideS_ k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - causal_softmaxDim_3<<>>(y_, x_, strideS_f, strideS_m, strideD_f, strideD_m, othersize, middle, dimsize, dimS, mask); - // cnrtQueueSync(queue); - cnrtFree(x_); + causal_softmaxDim_3<<>>(y_, strideD_f, strideD_m, othersize, middle, dimsize, dimS, mask); + cnrtQueueSync(queue); } -void causal_softmax_bang_f16(Tensor y, void *stream) { - - ASSERT(y.layout->ndim >= 2); - ASSERT(y.layout->shape[y.layout->ndim - 1] >= y.layout->shape[y.layout->ndim - 2]); - int n = 1; - - int ndim = y.layout->ndim; - - int x_stride[ndim], y_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - if(i < ndim - 1){ - n *= shape[i]; - } - } - int d = shape[ndim - 1]; - int mask = shape[ndim - 1] - shape[ndim - 2]; - + +void causal_softmax_bang_f16(CausalSoftmaxBangDescriptor_t desc, void *workspace, void *y, void *stream) { + int n = desc->n; + int d = desc->shape[desc->ndim - 1]; + int mask = desc->shape[desc->ndim - 1] - desc->shape[desc->ndim - 2]; auto queue = reinterpret_cast(stream); - if(ndim == 2){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - - causal_softmaxUnionDim_2(queue, y.data, strideS_f, strideD_f, n, d, mask); + + if (desc->ndim == 2) { + int strideD_f = desc->stride[0]; + causal_softmaxUnionDim_2(queue, y, strideD_f, n, d, mask); + + } else if (desc->ndim == 3) { + int strideD_f = desc->stride[0]; + int strideD_m = desc->stride[1]; + int middle = desc->shape[1]; + causal_softmaxUnionDim_3(queue, y, strideD_f, strideD_m, n, middle, d, mask); + + } else { + int *mlu_strideY = reinterpret_cast(workspace); + int *mlu_shape = mlu_strideY + desc->ndim; + + CNRT_CHECK(cnrtMemcpy(mlu_strideY, desc->stride, desc->ndim * sizeof(int), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_shape, desc->shape, desc->ndim * sizeof(int), cnrtMemcpyHostToDev)); + + causal_softmax(queue, y, mlu_strideY, mlu_shape, n, d, mask, desc->ndim); } - - else if(ndim == 3){ - 
int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - - causal_softmaxUnionDim_3(queue, y.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, mask); +} + +infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; } - - else{ - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - causal_softmax_fp16(queue, y.data, mlu_strideX, mlu_strideY, mlu_shape, n, d, mask, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_bang_f16(desc, workspace, data, stream); + return STATUS_SUCCESS; } - -} + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc index 54443e9a..c1ef405d 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.cc @@ -1,36 +1,71 @@ #include "causal_softmax_cnnl.h" +#include "../../../devices/bang/bang_handle.h" #include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" #include "../../utils.h" -#include "cnrt.h" +#include "cnnl_extra.h" -CausalSoftmaxBangDescriptor::CausalSoftmaxBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void causal_softmax_cnnl_f16(Tensor t, void *stream) { - ASSERT(t.layout->ndim >= 2); - ASSERT(t.layout->shape[t.layout->ndim - 1] >= t.layout->shape[t.layout->ndim - 2]); - cnnlTensorDescriptor_t tDesc, maskDesc; - cnnlCreateTensorDescriptor(&maskDesc); - cnnlCreateTensorDescriptor(&tDesc); +infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxCnnlDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + if (y->ndim < 2 || y->shape[y->ndim - 1] < y->shape[y->ndim - 2]) { + return STATUS_BAD_TENSOR_SHAPE; + } - int ndim_ = std::max(int(t.layout->ndim), 4); + // cnnlMaskedSoftmax only support 4D or 5D tensors + int ndim_ = std::max(static_cast(y->ndim), 4); std::vector dims(ndim_, 1); - for (uint64_t i = 0; i < t.layout->ndim; i++) { - dims[ndim_ - 1 - i] = static_cast(t.layout->shape[t.layout->ndim - i - 1]); + for (uint64_t i = 0; i < y->ndim; i++) { + dims[ndim_ - 1 - i] = static_cast(y->shape[y->ndim - i - 1]); } - // 创建 mask - bool mask_matrix[dims[0]][dims[1]][dims[2]][dims[3]]; + cnnlTensorDescriptor_t yDesc, maskDesc; + cnnlCreateTensorDescriptor(&yDesc); + cnnlCreateTensorDescriptor(&maskDesc); + cnnlSetTensorDescriptor(yDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(y->dt), + dims.size(), dims.data()); + cnnlSetTensorDescriptor(maskDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_BOOL, + dims.size(), dims.data()); + + *desc_ptr = new CausalSoftmaxCnnlDescriptor{ + handle->device, + handle->device_id, + handle->cnnl_handles, + y->dt, + std::move(yDesc), + std::move(maskDesc), + std::move(dims)}; 
+ + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size) { + *size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_t desc) { + cnnlDestroyTensorDescriptor(desc->yDesc); + cnnlDestroyTensorDescriptor(desc->maskDesc); + delete desc; + return STATUS_SUCCESS; +} + +infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + bool mask_matrix[desc->dims[0]][desc->dims[1]][desc->dims[2]][desc->dims[3]]; // 填充上三角矩阵(右上角为 false) - for (int i = 0; i < dims[0]; ++i) { - for (int j = 0; j < dims[1]; ++j) { - for (int m = 0; m < dims[2]; ++m) { - for (int n = 0; n < dims[3]; ++n) { - if (n - m > dims[3] - dims[2]) { + for (int i = 0; i < desc->dims[0]; ++i) { + for (int j = 0; j < desc->dims[1]; ++j) { + for (int m = 0; m < desc->dims[2]; ++m) { + for (int n = 0; n < desc->dims[3]; ++n) { + if (n - m > desc->dims[3] - desc->dims[2]) { mask_matrix[i][j][m][n] = true; } else { mask_matrix[i][j][m][n] = false; @@ -39,24 +74,16 @@ void causal_softmax_cnnl_f16(Tensor t, void *stream) { } } } + size_t mask_size = sizeof(bool) * desc->dims[0] * desc->dims[1] * desc->dims[2] * desc->dims[3]; + cnrtMemcpyAsync(workspace, mask_matrix, mask_size, (cnrtQueue_t) stream, cnrtMemcpyHostToDev); - void *mask; - cnrtMalloc((void **) &mask, sizeof(bool) * dims[0] * dims[1] * dims[2] * dims[3]); - cnrtMemcpy(mask, mask_matrix, sizeof(bool) * dims[0] * dims[1] * dims[2] * dims[3], cnrtMemcpyHostToDev); - - // 不支持 stride - cnnlSetTensorDescriptor(tDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, - dims.size(), dims.data()); - cnnlSetTensorDescriptor(maskDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_BOOL, - dims.size(), dims.data()); - - use_cnnl((cnrtQueue_t) stream, + use_cnnl(desc->pool, desc->device_id, (cnrtQueue_t) stream, [&](cnnlHandle_t handle) { cnnlMaskedSoftmax(handle, CNNL_MASKED_SOFTMAX_MASKED_FILL, - -1, 1.0, tDesc, t.data, maskDesc, mask, - tDesc, t.data); + -1, 1.0, desc->yDesc, data, desc->maskDesc, workspace, + desc->yDesc, data); }); + cnrtQueueSync((cnrtQueue_t)stream); - cnnlDestroyTensorDescriptor(tDesc); - cnnlDestroyTensorDescriptor(maskDesc); + return STATUS_SUCCESS; } diff --git a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h index 5f0b2adc..feaf274e 100644 --- a/src/ops/causal_softmax/bang/causal_softmax_cnnl.h +++ b/src/ops/causal_softmax/bang/causal_softmax_cnnl.h @@ -1,15 +1,35 @@ #ifndef __CNNL_CAUSAL_SOFTMAX_H__ #define __CNNL_CAUSAL_SOFTMAX_H__ +#include "../../../devices/bang/bang_handle.h" #include "cnnl.h" -#include "cnnl_extra.h" #include "operators.h" +#include -struct CausalSoftmaxBangDescriptor { +struct CausalSoftmaxCnnlDescriptor { Device device; - CausalSoftmaxBangDescriptor(Device device); + int device_id; + std::shared_ptr> pool; + DT dtype; + cnnlTensorDescriptor_t yDesc; + cnnlTensorDescriptor_t maskDesc; + std::vector dims; }; -void causal_softmax_cnnl_f16(Tensor t, void *stream); +typedef struct CausalSoftmaxCnnlDescriptor *CausalSoftmaxCnnlDescriptor_t; -#endif// __CNNL_CAUSAL_SOFTMAX_H__ +infiniopStatus_t cnnlCreateCausalSoftmaxDescriptor(BangHandle_t handle, + CausalSoftmaxCnnlDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + 
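For orientation, here is a minimal host-side sketch of how the cnnl entry points in this file are meant to be chained on the Cambricon path: create the descriptor, query the workspace (which holds the boolean causal mask that `cnnlCausalSoftmax` uploads for `cnnlMaskedSoftmax`), run the in-place softmax, and destroy the descriptor. The handle, the F16 tensor descriptor `y_desc`, and the `cnrtQueue_t` are assumed to exist already, and the helper name `run_cnnl_causal_softmax` is illustrative, not part of the API.

```C
#include "causal_softmax_cnnl.h"
#include "cnrt.h"
#include <stdint.h>

/* Illustrative helper chaining the cnnl causal-softmax entry points. */
infiniopStatus_t run_cnnl_causal_softmax(BangHandle_t handle,
                                         infiniopTensorDescriptor_t y_desc,
                                         void *y_data,
                                         cnrtQueue_t queue) {
    CausalSoftmaxCnnlDescriptor_t desc;
    infiniopStatus_t status = cnnlCreateCausalSoftmaxDescriptor(handle, &desc, y_desc);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    cnnlGetCausalSoftmaxWorkspaceSize(desc, &workspace_size);

    void *workspace = NULL;
    cnrtMalloc(&workspace, workspace_size);   /* device buffer for the causal mask */

    /* In-place masked softmax over y_data; the queue is passed as the stream. */
    status = cnnlCausalSoftmax(desc, workspace, workspace_size, y_data, (void *) queue);

    cnrtFree(workspace);
    cnnlDestroyCausalSoftmaxDescriptor(desc);
    return status;
}
```

The hand-written BANG kernel path (`bangCausalSoftmax`) follows the same four-step contract; its workspace is instead used to stage the stride and shape arrays for the `ndim > 3` case.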
+infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc b/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc index 0650601e..ed2a2a82 100644 --- a/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc +++ b/src/ops/causal_softmax/cpu/causal_softmax_cpu.cc @@ -3,21 +3,60 @@ #include "../../utils.h" #include -void causal_softmax_cpu_f16(Tensor y) { - uint64_t ndim = y.layout->ndim; - ASSERT(ndim == 2 || ndim == 3); - uint64_t total_seq_len = y.layout->shape[ndim - 1]; - uint64_t seq_len = y.layout->shape[ndim - 2]; +infiniopStatus_t cpuCreateCausalSoftmaxDescriptor(infiniopHandle_t, + CausalSoftmaxCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; uint64_t batch_size = 1; - uint64_t stride_j = y.layout->strides[ndim - 1] / 2; - uint64_t stride_i = y.layout->strides[ndim - 2] / 2; + uint64_t stride_j = y->strides[ndim - 1]; + uint64_t stride_i = y->strides[ndim - 2]; uint64_t stride_b = 0; if (ndim == 3) - stride_b = y.layout->strides[ndim - 3] / 2; + stride_b = y->strides[ndim - 3]; for (size_t i = 0; i < ndim - 2; i++) { - batch_size *= y.layout->shape[i]; + batch_size *= y->shape[i]; } - auto y_ptr = reinterpret_cast(y.data); + + *desc_ptr = new CausalSoftmaxCpuDescriptor{ + DevCpu, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyCausalSoftmaxDescriptor(CausalSoftmaxCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + + +void causal_softmax_cpu_f16(CausalSoftmaxCpuDescriptor_t desc, void* y) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_j = desc->stride_j; + uint64_t stride_i = desc->stride_i; + uint64_t stride_b = desc->stride_b; + auto y_ptr = reinterpret_cast(y); for (size_t b = 0; b < batch_size; b++) { for (size_t i = 0; i < seq_len; i++) { uint64_t offset = b * stride_b + i * stride_i; @@ -41,3 +80,16 @@ void causal_softmax_cpu_f16(Tensor y) { } } } + +infiniopStatus_t cpuCausalSoftmax(CausalSoftmaxCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if(dtype_eq(desc->dtype, F16)){ + causal_softmax_cpu_f16(desc, data); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/cpu/causal_softmax_cpu.h b/src/ops/causal_softmax/cpu/causal_softmax_cpu.h index e77a159f..e85bc598 100644 --- a/src/ops/causal_softmax/cpu/causal_softmax_cpu.h +++ b/src/ops/causal_softmax/cpu/causal_softmax_cpu.h @@ -2,10 +2,31 @@ #define __CPU_CAUSAL_SOFTMAX_H__ #include "operators.h" -typedef struct CausalSoftmaxCpuDescriptor { +struct CausalSoftmaxCpuDescriptor { Device device; -} CausalSoftmaxCpuDescriptor; + DT dtype; + uint64_t batch_size; + 
uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; +}; -void causal_softmax_cpu_f16(Tensor); +typedef struct CausalSoftmaxCpuDescriptor *CausalSoftmaxCpuDescriptor_t; + +infiniopStatus_t cpuCreateCausalSoftmaxDescriptor(infiniopHandle_t, + CausalSoftmaxCpuDescriptor_t *, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t cpuGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuCausalSoftmax(CausalSoftmaxCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cpuDestroyCausalSoftmaxDescriptor(CausalSoftmaxCpuDescriptor_t desc); #endif diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cc b/src/ops/causal_softmax/cuda/causal_softmax.cc new file mode 100644 index 00000000..c7f4d5ed --- /dev/null +++ b/src/ops/causal_softmax/cuda/causal_softmax.cc @@ -0,0 +1,55 @@ +#include "causal_softmax.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, + CausalSoftmaxCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + for (int i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxCudaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyCausalSoftmaxDescriptor(CausalSoftmaxCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cu b/src/ops/causal_softmax/cuda/causal_softmax.cu index dd65aef8..7f937edc 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cu +++ b/src/ops/causal_softmax/cuda/causal_softmax.cu @@ -16,6 +16,12 @@ struct AttentionCausualMask { } }; +struct MaxOp { + __device__ float operator()(const float a, const float b) const { + return a > b ? a: b; + } +}; + template static __device__ void block_padding( Tdata *__restrict__ att, @@ -33,7 +39,12 @@ static __device__ void block_padding( __shared__ float max; { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(thread_data, max_op, total_seq_len); +#else auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); +#endif if (threadIdx.x == 0) { max = acc; } } __syncthreads(); @@ -67,7 +78,12 @@ static __device__ void block_folding( thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) ? 
float(att[i]) : -__FLT_MAX__; +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + thread_max = max_op(thread_max, thread_data[i]); +#else thread_max = cub::Max()(thread_max, thread_data[i]); +#endif } using BlockOp = cub::BlockReduce; @@ -76,7 +92,12 @@ static __device__ void block_folding( __shared__ float max; { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(thread_max, max_op); +#else auto acc = block_op.Reduce(thread_max, cub::Max()); +#endif if (threadIdx.x == 0) { max = acc; } } __syncthreads(); @@ -130,7 +151,7 @@ static __forceinline__ __device__ void folding( } template -__global__ void fused_softmax_padding( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_padding( Tdata *__restrict__ att, unsigned int const stride_x, unsigned int const stride_y, @@ -140,7 +161,7 @@ __global__ void fused_softmax_padding( } template -__global__ void fused_softmax_folding( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_folding( Tdata *__restrict__ att, unsigned int const stride_x, unsigned int const stride_y, @@ -152,7 +173,7 @@ __global__ void fused_softmax_folding( } template -__global__ void fused_softmax_standard( +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_standard( Tdata *__restrict__ att_, unsigned int const stride_x, unsigned int const stride_y, @@ -183,7 +204,12 @@ __global__ void fused_softmax_standard( __syncthreads(); // Block reduce max { +#ifdef ENABLE_SUGON_DCU + MaxOp max_op; + auto acc = block_op.Reduce(partial, max_op); +#else auto acc = block_op.Reduce(partial, cub::Max()); +#endif if (threadIdx.x == 0) { max_ = acc; } } __syncthreads(); @@ -200,7 +226,11 @@ __global__ void fused_softmax_standard( // Block reduce sum { +#ifdef ENABLE_SUGON_DCU + auto acc = block_op.Sum(partial); +#else auto acc = block_op.Reduce(partial, cub::Sum()); +#endif if (threadIdx.x == 0) { sum_ = acc; } } __syncthreads(); @@ -218,31 +248,41 @@ __global__ void fused_softmax_standard( } -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor *desc, Tensor y, void *stream) { - // TODO: only support 2d or 3d tensor - ASSERT(y.layout->ndim == 2 || y.layout->ndim == 3); - uint64_t total_seq_len = y.layout->shape[y.layout->ndim - 1]; - uint64_t seq_len = y.layout->shape[y.layout->ndim - 2]; - uint64_t batch_size = 1; - uint64_t stride_x = 1; - uint64_t stride_y = y.layout->strides[y.layout->ndim - 2] / 2; - uint64_t stride_z = y.layout->strides[y.layout->ndim - 1] / 2; - ASSERT(stride_z == 1); // the last dimension should be contiguous - for (size_t i = 0; i < y.layout->ndim - 2; i++) { - batch_size *= y.layout->shape[i]; - stride_x *= y.layout->strides[i]; - } - stride_x /= 2; // covert byte strides to element strides +void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + dim3 grid(batch_size, seq_len); - auto max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + if (max_items_per_thread == 1) { fused_softmax_padding - <<>>((half *) (y.data), stride_x, stride_y, stride_z); + <<>>((half *) (y), stride_x, stride_y, stride_z); } else if (max_items_per_thread <= 16) { fused_softmax_folding - <<>>((half *) (y.data), 
stride_x, stride_y, stride_z, total_seq_len); + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); } else { fused_softmax_standard - <<>>((half *) (y.data), stride_x, stride_y, stride_z, total_seq_len); + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); } } + +infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_nv_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/cuda/causal_softmax.cuh b/src/ops/causal_softmax/cuda/causal_softmax.cuh index 0aafab57..30516bee 100644 --- a/src/ops/causal_softmax/cuda/causal_softmax.cuh +++ b/src/ops/causal_softmax/cuda/causal_softmax.cuh @@ -1,12 +1,36 @@ -#ifndef __NV_CPU_CAUSAL_SOFTMAX_H__ -#define __NV_CPU_CAUSAL_SOFTMAX_H__ +#ifndef __CUDA_CAUSAL_SOFTMAX_H__ +#define __CUDA_CAUSAL_SOFTMAX_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" -typedef struct CausalSoftmaxCudaDescriptor { +struct CausalSoftmaxCudaDescriptor { Device device; -} CausalSoftmaxCudaDescriptor; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + unsigned int max_items_per_thread; +}; -void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor *, Tensor, void *stream); +typedef struct CausalSoftmaxCudaDescriptor *CausalSoftmaxCudaDescriptor_t; + +infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle, + CausalSoftmaxCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t cudaDestroyCausalSoftmaxDescriptor(CausalSoftmaxCudaDescriptor_t desc); #endif diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.cc b/src/ops/causal_softmax/maca/causal_softmax_maca.cc new file mode 100644 index 00000000..5a3803e7 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.cc @@ -0,0 +1,55 @@ +#include "causal_softmax_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateCausalSoftmaxDescriptor(MacaHandle_t handle, + CausalSoftmaxMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + for (int i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxMacaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + 
stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.h b/src/ops/causal_softmax/maca/causal_softmax_maca.h new file mode 100644 index 00000000..daa198b7 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.h @@ -0,0 +1,36 @@ +#ifndef __MACA_CAUSAL_SOFTMAX_H__ +#define __MACA_CAUSAL_SOFTMAX_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct CausalSoftmaxMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + unsigned int max_items_per_thread; +}; + +typedef struct CausalSoftmaxMacaDescriptor *CausalSoftmaxMacaDescriptor_t; + +infiniopStatus_t macaCreateCausalSoftmaxDescriptor(MacaHandle_t handle, + CausalSoftmaxMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t macaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaCausalSoftmax(CausalSoftmaxMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t macaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMacaDescriptor_t desc); + +#endif diff --git a/src/ops/causal_softmax/maca/causal_softmax_maca.maca b/src/ops/causal_softmax/maca/causal_softmax_maca.maca new file mode 100644 index 00000000..94b884e8 --- /dev/null +++ b/src/ops/causal_softmax/maca/causal_softmax_maca.maca @@ -0,0 +1,259 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "causal_softmax_maca.h" +#include + +struct AttentionCausualMask { + __forceinline__ __device__ bool + operator()(int tok_id, int seq_len, + int pos_id, int total_seq_len) { + // tok_id ↓ |<-total_seq_len->| + // 0 | * * * ... * | + // 1 | * * * ... * * | + // 2 | * * * ... * * * | + // seq_len: 3 pos_id-> + return total_seq_len + tok_id >= pos_id + seq_len; + } +}; + +template +static __device__ void block_padding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len) { + auto att_idx = threadIdx.x; + auto total_seq_len = blockDim.x; + auto thread_data = mask(token_idx, seq_len, att_idx, total_seq_len) + ? 
float(att[att_idx]) + : -__FLT_MAX__; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + auto acc = block_op.Sum(thread_data = expf(thread_data - max), total_seq_len); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + att[att_idx] = Tdata(thread_data * mean); +} + +template +static __device__ void block_folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len, + unsigned int const total_seq_len) { + + auto local = (total_seq_len + blockDim.x - 1) / blockDim.x; + + auto thread_offset = threadIdx.x * local; + att += thread_offset; + + float thread_data[ITEMS_PER_THREAD], thread_max = -__FLT_MAX__, thread_sum = 0; + for (unsigned int i = 0; i < local; ++i) { + auto att_idx = thread_offset + i; + thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) + ? float(att[i]) + : -__FLT_MAX__; + thread_max = cub::Max()(thread_max, thread_data[i]); + } + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_max, cub::Max()); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + for (unsigned int i = 0; i < local; ++i) { + thread_data[i] = expf(thread_data[i] - max); + thread_sum += thread_data[i]; + } + auto acc = block_op.Sum(thread_sum); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + for (unsigned int i = 0; i < local; ++i) { + if (auto att_idx = thread_offset + i; att_idx < total_seq_len) { + att[i] = Tdata(thread_data[i] * mean); + } + } +} + +// assert BLOCK_SIZE >= blockDim.x +template +static __forceinline__ __device__ void padding( + Tdata *__restrict__ att, + Tmask mask, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_padding( + att + offset, mask, token_idx, seq_len); +} + +template +static __forceinline__ __device__ void folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const total_seq_len, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_folding( + att + offset, mask, token_idx, seq_len, total_seq_len); +} + +template +__global__ void fused_softmax_padding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z) { + + padding(att, AttentionCausualMask(), stride_x, stride_y, stride_z); +} + +template +__global__ void fused_softmax_folding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + folding(att, AttentionCausualMask(), total_seq_len, stride_x, stride_y, stride_z); + } +} + +template +__global__ void fused_softmax_standard( + Tdata *__restrict__ att_, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + auto offset = blockIdx.x * 
stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + + auto att = att_ + offset; + auto att_idx = threadIdx.x; + + float partial; + __shared__ float max_; + __shared__ float sum_; + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + // Partial max + partial = -__FLT_MAX__; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + partial = max(partial, float(att[i])); + } + } + __syncthreads(); + // Block reduce max + { + auto acc = block_op.Reduce(partial, cub::Max()); + if (threadIdx.x == 0) { max_ = acc; } + } + __syncthreads(); + + // Partial sum + partial = 0.; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + partial += e; + } + } + __syncthreads(); + + // Block reduce sum + { + auto acc = block_op.Reduce(partial, cub::Sum()); + if (threadIdx.x == 0) { sum_ = acc; } + } + __syncthreads(); + + // Softmax + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + att[i] = e / sum_; + } else { + att[i] = half(0); + } + } + } +} + + +void causal_softmax_nv_gpu_f16(CausalSoftmaxMacaDescriptor_t desc, void *y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + + dim3 grid(batch_size, seq_len); + + if (max_items_per_thread == 1) { + fused_softmax_padding + <<>>((half *) (y), stride_x, stride_y, stride_z); + } else if (max_items_per_thread <= 16) { + fused_softmax_folding + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } else { + fused_softmax_standard + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } +} + +infiniopStatus_t macaCausalSoftmax(CausalSoftmaxMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_nv_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.cc b/src/ops/causal_softmax/musa/causal_softmax_musa.cc new file mode 100644 index 00000000..6ff55d65 --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.cc @@ -0,0 +1,55 @@ +#include "causal_softmax_musa.h" +#include "../../utils.h" +#include "../../../devices/musa/common_musa.h" + +infiniopStatus_t musaCreateCausalSoftmaxDescriptor(MusaHandle_t handle, + CausalSoftmaxMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y) { + uint64_t ndim = y->ndim; + // TODO: only support 2d or 3d tensor + if (ndim != 2 && ndim != 3) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(y->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + uint64_t total_seq_len = y->shape[ndim - 1]; + uint64_t seq_len = y->shape[ndim - 2]; + uint64_t batch_size = 1; + uint64_t stride_b = 0; + uint64_t stride_i = y->strides[ndim - 2]; + uint64_t stride_j = y->strides[ndim - 1]; + if (stride_j != 1) { + return 
STATUS_BAD_TENSOR_STRIDES; + } + for (uint64_t i = 0; i < ndim - 2; i++) { + batch_size *= y->shape[i]; + } + if (ndim == 3) + stride_b = y->strides[ndim - 3]; + unsigned int max_items_per_thread = ROUND_UP_DIV(total_seq_len, MAX_THREADS_PER_BLOCK); + + *desc_ptr = new CausalSoftmaxMusaDescriptor{ + handle->device, + handle->device_id, + y->dt, + batch_size, + stride_b, + seq_len, + stride_i, + total_seq_len, + stride_j, + max_items_per_thread}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.h b/src/ops/causal_softmax/musa/causal_softmax_musa.h new file mode 100644 index 00000000..c6f81afc --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.h @@ -0,0 +1,35 @@ +#ifndef __MUSA_CAUSAL_SOFTMAX_H__ +#define __MUSA_CAUSAL_SOFTMAX_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct CausalSoftmaxMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t batch_size; + uint64_t stride_b; + uint64_t seq_len; + uint64_t stride_i; + uint64_t total_seq_len; + uint64_t stride_j; + uint64_t max_items_per_thread; +}; + +typedef struct CausalSoftmaxMusaDescriptor *CausalSoftmaxMusaDescriptor_t; + +infiniopStatus_t musaCreateCausalSoftmaxDescriptor(MusaHandle_t handle, + CausalSoftmaxMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc); + +infiniopStatus_t musaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaCausalSoftmax(CausalSoftmaxMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream); + +infiniopStatus_t musaDestroyCausalSoftmaxDescriptor(CausalSoftmaxMusaDescriptor_t desc); +#endif diff --git a/src/ops/causal_softmax/musa/causal_softmax_musa.mu b/src/ops/causal_softmax/musa/causal_softmax_musa.mu new file mode 100644 index 00000000..5eb5c8d9 --- /dev/null +++ b/src/ops/causal_softmax/musa/causal_softmax_musa.mu @@ -0,0 +1,262 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "causal_softmax_musa.h" +#include + +struct AttentionCausualMask { + __forceinline__ __device__ bool + operator()(int tok_id, int seq_len, + int pos_id, int total_seq_len) { + // tok_id ↓ |<-total_seq_len->| + // 0 | * * * ... * | + // 1 | * * * ... * * | + // 2 | * * * ... * * * | + // seq_len: 3 pos_id-> + return total_seq_len + tok_id >= pos_id + seq_len; + } +}; + +template +static __device__ void block_padding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len) { + auto att_idx = threadIdx.x, total_seq_len = blockDim.x; + auto thread_data = mask(token_idx, seq_len, att_idx, total_seq_len) + ? 
float(att[att_idx]) + : -__FLT_MAX__; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + auto acc = block_op.Sum(thread_data = expf(thread_data - max), total_seq_len); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + att[att_idx] = Tdata(thread_data * mean); +} + +template +static __device__ void block_folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const token_idx, + unsigned int const seq_len, + unsigned int const total_seq_len) { + + auto local = (total_seq_len + blockDim.x - 1) / blockDim.x; + + auto thread_offset = threadIdx.x * local; + att += thread_offset; + + float thread_data[ITEMS_PER_THREAD], thread_max = -__FLT_MAX__, thread_sum = 0; + for (unsigned int i = 0; i < local; ++i) { + auto att_idx = thread_offset + i; + thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len) + ? float(att[i]) + : -__FLT_MAX__; + thread_max = cub::Max()(thread_max, thread_data[i]); + } + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + __shared__ float max; + { + auto acc = block_op.Reduce(thread_max, cub::Max()); + if (threadIdx.x == 0) { max = acc; } + } + __syncthreads(); + + __shared__ float mean; + { + for (unsigned int i = 0; i < local; ++i) { + thread_data[i] = expf(thread_data[i] - max); + thread_sum += thread_data[i]; + } + auto acc = block_op.Sum(thread_sum); + if (threadIdx.x == 0) { mean = fdividef(1, acc); } + } + __syncthreads(); + + for (unsigned int i = 0; i < local; ++i) { + if (auto att_idx = thread_offset + i; att_idx < total_seq_len) { + att[i] = Tdata(thread_data[i] * mean); + } + } +} + +// assert BLOCK_SIZE >= blockDim.x +template +static __forceinline__ __device__ void padding( + Tdata *__restrict__ att, + Tmask mask, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_padding( + att + offset, mask, token_idx, seq_len); +} + +template +static __forceinline__ __device__ void folding( + Tdata *__restrict__ att, + Tmask mask, + unsigned int const total_seq_len, + int const stride_x, + int const stride_y, + int const stride_z) { + auto offset = blockIdx.x * stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + block_folding( + att + offset, mask, token_idx, seq_len, total_seq_len); +} + +template +__global__ void fused_softmax_padding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z) { + + padding(att, AttentionCausualMask(), stride_x, stride_y, stride_z); +} + +template +__global__ void fused_softmax_folding( + Tdata *__restrict__ att, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + folding(att, AttentionCausualMask(), total_seq_len, stride_x, stride_y, stride_z); + } +} + +template +__global__ void fused_softmax_standard( + Tdata *__restrict__ att_, + unsigned int const stride_x, + unsigned int const stride_y, + unsigned int const stride_z, + unsigned int const total_seq_len) { + { + auto offset = blockIdx.x * 
stride_x + blockIdx.y * stride_y, + token_idx = blockIdx.y, + seq_len = gridDim.y; + + auto att = att_ + offset; + auto att_idx = threadIdx.x; + + float partial; + __shared__ float max_; + __shared__ float sum_; + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto block_op = BlockOp(temp_storage); + + // Partial max + partial = -__FLT_MAX__; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + partial = max(partial, float(att[i])); + } + } + __syncthreads(); + // Block reduce max + { + auto acc = block_op.Reduce(partial, cub::Max()); + if (threadIdx.x == 0) { max_ = acc; } + } + __syncthreads(); + + // Partial sum + partial = 0.; + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + partial += e; + } + } + __syncthreads(); + + // Block reduce sum + { + auto acc = block_op.Reduce(partial, cub::Sum()); + if (threadIdx.x == 0) { sum_ = acc; } + } + __syncthreads(); + + // Softmax + for (unsigned int i = att_idx; i < total_seq_len; i += BLOCK_SIZE) { + if (i <= total_seq_len - seq_len + token_idx) { + float e = expf(float(att[i]) - max_); + att[i] = e / sum_; + } else { + att[i] = half(0); + } + } + } +} + + +void causal_softmax_mt_gpu_f16(CausalSoftmaxMusaDescriptor_t desc, void* y, void *stream) { + uint64_t total_seq_len = desc->total_seq_len; + uint64_t seq_len = desc->seq_len; + uint64_t batch_size = desc->batch_size; + uint64_t stride_x = desc->stride_b; + uint64_t stride_y = desc->stride_i; + uint64_t stride_z = desc->stride_j;// covert byte strides to element strides + unsigned int max_items_per_thread = desc->max_items_per_thread; + + dim3 grid(batch_size, seq_len); + + if (max_items_per_thread == 1) { + fused_softmax_padding + <<>>((half *) (y), stride_x, stride_y, stride_z); + } else if (max_items_per_thread <= 16) { + fused_softmax_folding + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } else { + fused_softmax_standard + <<>>((half *) (y), stride_x, stride_y, stride_z, total_seq_len); + } +} + +infiniopStatus_t musaCausalSoftmax(CausalSoftmaxMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *data, + void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + causal_softmax_mt_gpu_f16(desc, data, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/causal_softmax/operator.cc b/src/ops/causal_softmax/operator.cc index 3b1f6b97..92498dca 100644 --- a/src/ops/causal_softmax/operator.cc +++ b/src/ops/causal_softmax/operator.cc @@ -1,4 +1,5 @@ #include "../utils.h" +#include "operators.h" #include "ops/causal_softmax/causal_softmax.h" #ifdef ENABLE_CPU @@ -7,81 +8,171 @@ #ifdef ENABLE_NV_GPU #include "../../devices/cuda/common_cuda.h" #include "cuda/causal_softmax.cuh" +#include "../../devices/cuda/cuda_handle.h" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/causal_softmax_cnnl.h" +#include "../../devices/bang/bang_handle.h" #include "bang/causal_softmax_bang.h" +#include "bang/causal_softmax_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/causal_softmax_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/causal_softmax_maca.h" +#endif 
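The dispatcher below routes each public causal-softmax call to the backend selected when the handle was created. As a hedged caller-side sketch of the sequence these entry points support: the tensor descriptor, device buffers, and stream are assumed to be prepared elsewhere, and both the helper name and the use of `STATUS_BAD_PARAM` for an undersized workspace are illustrative only.

```C
#include "ops/causal_softmax/causal_softmax.h"
#include <stdint.h>

/* Illustrative caller-side flow: create the operator descriptor, query the
 * workspace requirement, run the in-place causal softmax on the device the
 * handle was created for, then destroy the descriptor. */
infiniopStatus_t causal_softmax_example(infiniopHandle_t handle,
                                        infiniopTensorDescriptor_t y_desc,
                                        void *y_data,
                                        void *workspace,
                                        uint64_t workspace_capacity,
                                        void *stream) {
    infiniopCausalSoftmaxDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateCausalSoftmaxDescriptor(handle, &desc, y_desc);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size);
    if (workspace_size > workspace_capacity) {
        /* Caller-provided device buffer is too small; the status code here is illustrative. */
        infiniopDestroyCausalSoftmaxDescriptor(desc);
        return STATUS_BAD_PARAM;
    }

    status = infiniopCausalSoftmax(desc, workspace, workspace_size, y_data, stream);

    infiniopDestroyCausalSoftmaxDescriptor(desc);
    return status;
}
```

Keeping the `switch` on the descriptor's device in this single translation unit means each backend only has to expose these four functions behind its own `ENABLE_*` guard.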
+#ifdef ENABLE_MTHREADS_GPU +#include "musa/causal_softmax_musa.h" +#include "../../devices/musa/common_musa.h" #endif -struct CausalSoftmaxDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor( + infiniopHandle_t handle, + infiniopCausalSoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc); + // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C CausalSoftmaxDescriptor *createCausalSoftmaxDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxCpuDescriptor{device}); + return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: { - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxCudaDescriptor{device}); + return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size); } #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (CausalSoftmaxDescriptor *) (new CausalSoftmaxBangDescriptor(device)); + return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size); + // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyCausalSoftmaxDescriptor(CausalSoftmaxDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (CausalSoftmaxCpuDescriptor *) (descriptor); - break; + return 
cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (CausalSoftmaxCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (CausalSoftmaxBangDescriptor *) (descriptor); - break; + return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream); + // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void causalSoftmax(CausalSoftmaxDescriptor *descriptor, Tensor y, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - causal_softmax_cpu_f16(y); - break; + return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - causal_softmax_nv_gpu_f16((CausalSoftmaxCudaDescriptor *) descriptor, y, stream); - break; + case DevNvGpu: { + return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // causal_softmax_bang_f16(y, y, stream); - causal_softmax_cnnl_f16(y, stream); - break; + case DevCambriconMlu: { + return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc); + // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc); #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/conv/cpu/conv_cpu.cc b/src/ops/conv/cpu/conv_cpu.cc new file mode 100644 index 00000000..2646c482 --- /dev/null +++ b/src/ops/conv/cpu/conv_cpu.cc @@ -0,0 +1,242 @@ +#include "conv_cpu.h" +#include "../../utils.h" + +// get the total number of elements in arr +inline uint64_t getTotalSize(const uint64_t *arr, uint64_t ndim) { + return std::accumulate(arr, arr + ndim, 1ULL, std::multiplies()); +} + +// check if padding is needed +inline bool requirePadding(uint64_t const *pads, uint64_t ndim) { + return std::any_of(pads, pads + ndim - 2, + [](uint64_t pad) { return pad > 0; }); +} + +infiniopStatus_t cpuCreateConvDescriptor(infiniopHandle_t, + ConvCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + 
infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt || y->dt != w->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_size = getTotalSize(y->shape, ndim); + const auto pads_ = reinterpret_cast(pads); + uint64_t padded_x_size = requirePadding(pads_, ndim) ? getPaddedSize(ndim, x->shape, pads_) : 0; + uint64_t *x_shape = new uint64_t[ndim]; + uint64_t *w_shape = new uint64_t[ndim]; + uint64_t *y_shape = new uint64_t[ndim]; + uint64_t *pad_ = new uint64_t[n]; + int64_t *strides_ = new int64_t[n]; + uint64_t *dilations_ = new uint64_t[n]; + memcpy(x_shape, x->shape, ndim * sizeof(uint64_t)); + memcpy(w_shape, w->shape, ndim * sizeof(uint64_t)); + memcpy(y_shape, y->shape, ndim * sizeof(uint64_t)); + for (size_t i = 0; i < n; ++i) { + pad_[i] = pads_[i]; + strides_[i] = reinterpret_cast(strides)[i]; + dilations_[i] = reinterpret_cast(dilations)[i]; + } + + *desc_ptr = new ConvCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_size, + padded_x_size, + x_shape, + w_shape, + y_shape, + pad_, + strides_, + dilations_, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetConvWorkspaceSize(ConvCpuDescriptor_t desc, uint64_t *size) { + *size = desc->padded_x_size * desc->dtype.size; + if (desc->dtype == F16) { + *size += desc->y_size * sizeof(float); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyConvDescriptor(ConvCpuDescriptor_t desc) { + delete[] desc->x_shape; + delete[] desc->w_shape; + delete[] desc->y_shape; + delete[] desc->pads; + delete[] desc->strides; + delete[] desc->dilations; + delete desc; + return STATUS_SUCCESS; +} + +// initialize the padded input with the data from the original input +template +void fillPaddedInput(ConvCpuDescriptor_t desc, uint64_t const *padded_x_shape, + Tdata *padded_x, Tdata const *x, + uint64_t const *pads, uint64_t x_index, + uint64_t padded_x_index, uint64_t ndim) { + const auto x_shape = desc->x_shape[ndim]; + const auto padded_x_shape_ = padded_x_shape[ndim]; + const auto x_base_index = x_index * x_shape; + const auto padded_x_base_index = padded_x_index * padded_x_shape_ + + (x_shape == padded_x_shape_ ? 
0 : pads[ndim - 2]); + + for (size_t i = 0; i < x_shape; ++i) { + // base case (last dimension) + if (ndim == desc->ndim - 1) { + padded_x[padded_x_base_index + i] = x[x_base_index + i]; + } + // recursive case + else { + fillPaddedInput(desc, padded_x_shape, padded_x, x, pads, x_base_index + i, + padded_x_base_index + i, ndim + 1); + } + } +} + +// Recursive convolution function +template +void _applyConv(ConvCpuDescriptor_t desc, Ydata *y, Xdata const *x, + Xdata const *w, uint64_t const *x_shape, + uint64_t x_index, uint64_t w_index, uint64_t y_index, + uint64_t ndim) { + const auto dim_size = x_shape[ndim]; + const auto kernel_size = desc->w_shape[ndim]; + const auto dilation = desc->dilations[ndim - 2]; + const auto stride = desc->strides[ndim - 2]; + const auto steps = + (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1; + x_index *= dim_size; + w_index *= kernel_size; + y_index *= desc->y_shape[ndim]; + + // perform all the convolutions along this axis + for (size_t i = 0; i < steps; ++i, ++y_index) { + // perform a single convolution + for (size_t k = 0; k < kernel_size; ++k) { + // calculate the current indices + const auto curr_x_index = x_index + i * stride + k * dilation; + const auto curr_w_index = w_index + k; + + // base case (last dimension) + if (ndim == desc->ndim - 1) { + if (desc->dtype == F16) { + y[y_index] += f16_to_f32(x[curr_x_index]) * f16_to_f32(w[curr_w_index]); + } else { + y[y_index] += x[curr_x_index] * w[curr_w_index]; + } + } + // recursive case + else { + _applyConv(desc, y, x, w, x_shape, curr_x_index, curr_w_index, + y_index, ndim + 1); + } + } + } +} + +template +void applyConv(ConvCpuDescriptor_t desc, Ydata *y, Xdata const *x, + Xdata const *w, uint64_t const *x_shape) { + const auto y_num_channel_elements = + getTotalSize(desc->y_shape + 2, desc->ndim - 2); + +#pragma omp parallel for collapse(2) + // batch + for (size_t i = 0; i < x_shape[0]; ++i) { + + // output channel + for (size_t j = 0; j < desc->w_shape[0]; ++j) { + uint64_t y_index = i * desc->y_shape[1] + j; + + // input channel + for (size_t k = 0; k < x_shape[1]; ++k) { + uint64_t x_index = i * x_shape[1] + k; + uint64_t w_index = j * desc->w_shape[1] + k; + _applyConv(desc, y, x, w, x_shape, x_index, w_index, y_index, 2); + } + } + } +} + +template +void _conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + Ydata *y, Xdata const *x, Xdata const *w) { + if (desc->padded_x_size > 0) { + auto padded_x = reinterpret_cast(workspace); + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_x, padded_x + desc->padded_x_size, 0); + getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); + fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); + applyConv(desc, y, padded_x, w, padded_shape); + } else { + applyConv(desc, y, x, w, desc->x_shape); + } +} + +// Convolution function +template +infiniopStatus_t conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w) { + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto w_ = reinterpret_cast(w); + std::fill(y_, y_ + desc->y_size, 0); + _conv_cpu(desc, workspace, workspace_size, y_, x_, w_); + return STATUS_SUCCESS; +} + +// sepcial case for fp16 (uint16_t) +template<> +infiniopStatus_t conv_cpu(ConvCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w) { + auto y_ = reinterpret_cast(workspace); + auto x_ = 
reinterpret_cast(x); + auto w_ = reinterpret_cast(w); + std::fill(y_, y_ + desc->y_size, 0); + + _conv_cpu(desc, y_ + desc->y_size, workspace_size, y_, x_, w_); + + // copy data from y_ to y + auto y_16 = reinterpret_cast(y); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y_16[i] = f32_to_f16(y_[i]); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuConv(ConvCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (desc->dtype == F16) { + return conv_cpu(desc, workspace, workspace_size, y, x, w); + } + if (desc->dtype == F32) { + return conv_cpu(desc, workspace, workspace_size, y, x, w); + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/conv/cpu/conv_cpu.h b/src/ops/conv/cpu/conv_cpu.h new file mode 100644 index 00000000..48a91990 --- /dev/null +++ b/src/ops/conv/cpu/conv_cpu.h @@ -0,0 +1,45 @@ +#ifndef __CPU_CONV_H__ +#define __CPU_CONV_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include + +struct ConvCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t y_size; + uint64_t padded_x_size; + uint64_t const *x_shape; + uint64_t const *w_shape; + uint64_t const *y_shape; + uint64_t const *pads; + int64_t const *strides; + uint64_t const *dilations; +}; + +typedef struct ConvCpuDescriptor *ConvCpuDescriptor_t; + +infiniopStatus_t cpuCreateConvDescriptor(infiniopHandle_t, + ConvCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n); + +infiniopStatus_t cpuGetConvWorkspaceSize(ConvCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuConv(ConvCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cpuDestroyConvDescriptor(ConvCpuDescriptor_t desc); + +#endif diff --git a/src/ops/conv/cuda/conv.cc b/src/ops/conv/cuda/conv.cc new file mode 100644 index 00000000..2ccabfda --- /dev/null +++ b/src/ops/conv/cuda/conv.cc @@ -0,0 +1,163 @@ +#include "conv.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t handle, + ConvCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != w->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || w->shape[0] != y->shape[1] || x->shape[1] != w->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt || y->dt != w->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const uint64_t new_ndim = std::max(ndim, (uint64_t)4); + // convert pads, strides, dilations into int32[] + int32_t *pad = new int32_t[new_ndim]; + int32_t *stride = new int32_t[new_ndim]; + int32_t *dilation = new int32_t[new_ndim]; + int32_t *x_shape = new int32_t[new_ndim]; + int32_t *w_shape = new int32_t[new_ndim]; + int32_t *y_shape = new int32_t[new_ndim]; + auto pads_ = reinterpret_cast(pads); + auto strides_ = reinterpret_cast(strides); + auto dilations_ = reinterpret_cast(dilations); + for (size_t i = 0; i < new_ndim; ++i) { + pad[i] = i < ndim - 2 ? 
static_cast(pads_[i]) : 0; + stride[i] = i < ndim - 2 ? static_cast(strides_[i]) : 1; + dilation[i] = i < ndim - 2 ? static_cast(dilations_[i]) : 1; + x_shape[i] = i < ndim ? static_cast(x->shape[i]) : 1; + w_shape[i] = i < ndim ? static_cast(w->shape[i]) : 1; + y_shape[i] = i < ndim ? static_cast(y->shape[i]) : 1; + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + cudnnDataType_t conv_op_dt = [&] { + switch (tensor_dt) { + case CUDNN_DATA_HALF: + if (ndim >= 5) { + return CUDNN_DATA_FLOAT; + } + if (handle->compute_capability_major > 5 || (handle->compute_capability_major == 5 && handle->compute_capability_minor >= 3)) { + return CUDNN_DATA_HALF; + } + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_BFLOAT16: + case CUDNN_DATA_FLOAT: + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_DOUBLE: + return CUDNN_DATA_DOUBLE; + default: + return CUDNN_DATA_INT32; + } + }(); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptorEx(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, x_shape)); + + // create and set tensor descriptors for w + cudnnFilterDescriptor_t w_desc; + checkCudnnError(cudnnCreateFilterDescriptor(&w_desc)); + checkCudnnError(cudnnSetFilterNdDescriptor(w_desc, static_cast(tensor_dt), CUDNN_TENSOR_NCHW, new_ndim, w_shape)); + + + // create and set conv operator descriptor + cudnnConvolutionDescriptor_t op_desc; + checkCudnnError(cudnnCreateConvolutionDescriptor(&op_desc)); + checkCudnnError(cudnnSetConvolutionNdDescriptor( + op_desc, new_ndim - 2, pad, stride, dilation, CUDNN_CROSS_CORRELATION, + conv_op_dt)); + + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + std::vector outDim_(new_ndim); + auto outDim = outDim_.data(); + checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(op_desc, x_desc, w_desc, new_ndim, outDim)); + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), new_ndim, y_shape)); + + // tuning: get the best algorithm + int requestedAlgoCount = 1; + checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &requestedAlgoCount); })); + int algoCounts = 0; + int chosenAlgoIndex = 0; + bool chosen = false; + size_t workspace_size = 0; + std::vector perf_results_(requestedAlgoCount); + auto perf_results = perf_results_.data(); + checkCudnnError(use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc, op_desc, y_desc, requestedAlgoCount, &algoCounts, perf_results); })); + if (algoCounts < 1) { + return STATUS_EXECUTION_FAILED; + } + for (int i = 0; i < algoCounts; ++i) { + if (use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, op_desc, y_desc, perf_results[i].algo, &workspace_size); }) == CUDNN_STATUS_SUCCESS) { + chosenAlgoIndex = i; + chosen = true; + break; + } + } + if (!chosen) { + return STATUS_EXECUTION_FAILED; + } + + const float alpha = 1.0f; + const float beta = 0.0f; + + *desc_ptr = new ConvCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + 
x_desc, + w_desc, + y_desc, + op_desc, + perf_results[chosenAlgoIndex].algo, + alpha, + beta, + workspace_size}; + + delete[] pad; + delete[] stride; + delete[] dilation; + delete[] x_shape; + delete[] w_shape; + delete[] y_shape; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetConvWorkspaceSize(ConvCudaDescriptor_t desc, uint64_t *size) { + *size = desc->workspace_size; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyConvDescriptor(ConvCudaDescriptor_t desc) { + checkCudnnError(cudnnDestroyConvolutionDescriptor(desc->op_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyFilterDescriptor(desc->w_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/conv/cuda/conv.cu b/src/ops/conv/cuda/conv.cu new file mode 100644 index 00000000..3f15843b --- /dev/null +++ b/src/ops/conv/cuda/conv.cu @@ -0,0 +1,23 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "conv.cuh" + +infiniopStatus_t conv_nv_gpu(ConvCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnConvolutionForward(handle, &desc->alpha, + desc->x_desc, x, desc->w_desc, w, desc->op_desc, desc->algo, workspace, workspace_size, + &desc->beta, desc->y_desc, y); })); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaConv(ConvCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (desc->dtype == F16 || desc->dtype == F32) { + return conv_nv_gpu(desc, workspace, workspace_size, y, x, w, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/conv/cuda/conv.cuh b/src/ops/conv/cuda/conv.cuh new file mode 100644 index 00000000..36f22e90 --- /dev/null +++ b/src/ops/conv/cuda/conv.cuh @@ -0,0 +1,45 @@ +#ifndef __CUDA_CONV_H__ +#define __CUDA_CONV_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +struct ConvCudaDescriptor { + Device device; + DT dtype; + int device_id; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnFilterDescriptor_t const w_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnConvolutionDescriptor_t const op_desc; + cudnnConvolutionFwdAlgo_t algo; + const float alpha; + const float beta; + uint64_t workspace_size; +}; + +typedef struct ConvCudaDescriptor *ConvCudaDescriptor_t; + +infiniopStatus_t cudaCreateConvDescriptor(CudaHandle_t, + ConvCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void const *pads, + void const *strides, + void const *dilations, + uint64_t n); + +infiniopStatus_t cudaGetConvWorkspaceSize(ConvCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaConv(ConvCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cudaDestroyConvDescriptor(ConvCudaDescriptor_t desc); + +#endif diff --git a/src/ops/conv/operator.cc b/src/ops/conv/operator.cc new file mode 100644 index 00000000..306527e5 --- /dev/null +++ b/src/ops/conv/operator.cc @@ -0,0 +1,96 @@ +#include "../utils.h" 
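The operator.cc dispatch layer that follows exposes the conv entry points (create descriptor, query workspace, compute, destroy) and routes them to the CPU or CUDA backend. Below is a minimal caller-side sketch of the CPU path; it is not part of this patch. It assumes the handle and tensor-descriptor helpers from the public headers (`infiniopCreateHandle`, `infiniopCreateTensorDescriptor`, `infiniopDestroyHandle`) and that a null strides pointer yields a contiguous layout; shapes and values are illustrative only.

```C++
#include <cstdint>
#include <cstdlib>
#include <vector>

// Illustrative only: drive the exported conv interface on the CPU backend.
void conv_cpu_usage_sketch() {
    // Handle for the CPU backend, device id 0.
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle, DevCpu, 0);

    // NCHW tensors: x = [1, 3, 8, 8], w = [4, 3, 3, 3] -> y = [1, 4, 8, 8]
    // with pads = 1, strides = 1, dilations = 1 on both spatial dims.
    uint64_t x_shape[4] = {1, 3, 8, 8}, w_shape[4] = {4, 3, 3, 3}, y_shape[4] = {1, 4, 8, 8};
    infiniopTensorDescriptor_t x_desc, w_desc, y_desc;
    infiniopCreateTensorDescriptor(&x_desc, F32, 4, x_shape, nullptr);// null strides: assumed contiguous
    infiniopCreateTensorDescriptor(&w_desc, F32, 4, w_shape, nullptr);
    infiniopCreateTensorDescriptor(&y_desc, F32, 4, y_shape, nullptr);

    uint64_t pads[2] = {1, 1}, dilations[2] = {1, 1};
    int64_t strides[2] = {1, 1};

    // Create the conv descriptor, query the workspace, run, then tear down.
    infiniopConvDescriptor_t conv_desc;
    infiniopCreateConvDescriptor(handle, &conv_desc, y_desc, x_desc, w_desc,
                                 pads, strides, dilations, 2);
    uint64_t workspace_size = 0;
    infiniopGetConvWorkspaceSize(conv_desc, &workspace_size);
    void *workspace = workspace_size ? std::malloc(workspace_size) : nullptr;

    std::vector<float> x(1 * 3 * 8 * 8, 1.0f), w(4 * 3 * 3 * 3, 1.0f), y(1 * 4 * 8 * 8);
    infiniopConv(conv_desc, workspace, workspace_size, y.data(), x.data(), w.data(), /*stream=*/nullptr);

    std::free(workspace);
    infiniopDestroyConvDescriptor(conv_desc);
    infiniopDestroyTensorDescriptor(x_desc);
    infiniopDestroyTensorDescriptor(w_desc);
    infiniopDestroyTensorDescriptor(y_desc);
    infiniopDestroyHandle(handle);
}
```

On GPU backends the same sequence applies, with device buffers and a real compute stream in place of the null pointer.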
+#include "operators.h" +#include "ops/conv/conv.h" + +#ifdef ENABLE_CPU +#include "cpu/conv_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/conv.cuh" +#endif + +__C infiniopStatus_t infiniopCreateConvDescriptor( + infiniopHandle_t handle, + infiniopConvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + void *pads, + void *strides, + void *dilations, + uint64_t n) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateConvDescriptor(handle, (ConvCpuDescriptor_t *) desc_ptr, y, x, w, pads, strides, dilations, n); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateConvDescriptor((CudaHandle_t) handle, (ConvCudaDescriptor_t *) desc_ptr, y, x, w, pads, strides, dilations, n); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetConvWorkspaceSize((ConvCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetConvWorkspaceSize((ConvCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuConv((ConvCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaConv((ConvCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyConvDescriptor((ConvCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyConvDescriptor((ConvCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/expand/cpu/expand_cpu.cc b/src/ops/expand/cpu/expand_cpu.cc new file mode 100644 index 00000000..d3bcb866 --- /dev/null +++ b/src/ops/expand/cpu/expand_cpu.cc @@ -0,0 +1,69 @@ +#include "expand_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateExpandDescriptor(infiniopHandle_t, + ExpandCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + int64_t *y_strides = new int64_t[ndim]; +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 
0 : x->strides[i + x->ndim - ndim]; + } + memcpy(y_strides, y->strides, ndim * sizeof(int64_t)); + + *desc_ptr = new ExpandCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_data_size, + x_strides, + y_strides, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyExpandDescriptor(ExpandCpuDescriptor_t desc) { + delete[] desc->x_strides; + delete[] desc->y_strides; + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t expand_cpu(ExpandCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->y_data_size; ++i) { + y_[i] = x_[getDstOffset(i, desc->ndim, desc->y_strides, desc->x_strides)]; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuExpand(ExpandCpuDescriptor_t desc, + void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return expand_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return expand_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/cpu/expand_cpu.h b/src/ops/expand/cpu/expand_cpu.h new file mode 100644 index 00000000..868fefe8 --- /dev/null +++ b/src/ops/expand/cpu/expand_cpu.h @@ -0,0 +1,29 @@ +#ifndef __CPU_EXPAND_H__ +#define __CPU_EXPAND_H__ + +#include "operators.h" +#include +#include + +struct ExpandCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t y_data_size; + int64_t const *x_strides; + int64_t const *y_strides; +}; + +typedef struct ExpandCpuDescriptor *ExpandCpuDescriptor_t; + +infiniopStatus_t cpuCreateExpandDescriptor(infiniopHandle_t, + ExpandCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuExpand(ExpandCpuDescriptor_t desc, + void *y, void const *x, void *stream); + +infiniopStatus_t cpuDestroyExpandDescriptor(ExpandCpuDescriptor_t desc); + +#endif diff --git a/src/ops/expand/cuda/expand.cc b/src/ops/expand/cuda/expand.cc new file mode 100644 index 00000000..d0467c01 --- /dev/null +++ b/src/ops/expand/cuda/expand.cc @@ -0,0 +1,51 @@ +#include "expand.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t handle, + ExpandCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 
0 : x->strides[i + x->ndim - ndim]; + } + + int64_t *x_strides_d, *y_strides_d; + char *strides_and_shape_d; + checkCudaErrorWithCode(cudaMalloc((void **) &strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new ExpandCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + y_data_size, + static_cast(handle->prop.maxGridSize[0]), + strides_and_shape_d, + }; + + delete[] x_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyExpandDescriptor(ExpandCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void *) desc->strides_and_shape_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/expand/cuda/expand.cu b/src/ops/expand/cuda/expand.cu new file mode 100644 index 00000000..6d75e651 --- /dev/null +++ b/src/ops/expand/cuda/expand.cu @@ -0,0 +1,58 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "expand.cuh" + +template +__global__ void expand( + Tdata *y, + const Tdata *x, + const int64_t *y_strides, + const int64_t *x_strides, + const uint64_t *y_shape, + uint64_t y_data_size, + uint64_t ndim, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < y_data_size) { + uint64_t y_idx = getOffset(idx, ndim, y_shape, y_strides); + y[y_idx] = x[getDstOffset(y_idx, ndim, y_strides, x_strides)]; + } +} + +template +infiniopStatus_t expand_nv_gpu(ExpandCudaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->y_data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->y_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->y_data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const auto x_strides = reinterpret_cast(desc->strides_and_shape_d); + const auto y_strides = reinterpret_cast(desc->strides_and_shape_d + desc->ndim * sizeof(int64_t)); + const auto y_shape = reinterpret_cast(desc->strides_and_shape_d + 2 * desc->ndim * sizeof(int64_t)); + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->y_data_size; i += step) { + expand<<>>( + y_, x_, y_strides, x_strides, y_shape, i + desc->y_data_size, desc->ndim, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaExpand(ExpandCudaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return expand_nv_gpu(desc, y, x, stream); + } + if (desc->dtype == F32) { + return expand_nv_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/cuda/expand.cuh b/src/ops/expand/cuda/expand.cuh new file mode 100644 index 00000000..17cc1337 --- /dev/null +++ b/src/ops/expand/cuda/expand.cuh @@ -0,0 +1,33 @@ +#ifndef __CUDA_EXPAND_H__ +#define __CUDA_EXPAND_H__ + +#include 
"../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ExpandCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t y_data_size; + uint64_t max_grid_size; + char const *strides_and_shape_d; +}; + +typedef struct ExpandCudaDescriptor *ExpandCudaDescriptor_t; + +infiniopStatus_t cudaCreateExpandDescriptor(CudaHandle_t, + ExpandCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaExpand(ExpandCudaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyExpandDescriptor(ExpandCudaDescriptor_t desc); + +#endif diff --git a/src/ops/expand/musa/expand_musa.cc b/src/ops/expand/musa/expand_musa.cc new file mode 100644 index 00000000..0e2e4581 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.cc @@ -0,0 +1,51 @@ +#include "expand_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateExpandDescriptor(MusaHandle_t handle, + ExpandMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (!isValidBroadcastShape(y, x)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for x in terms of y + int64_t *x_strides = new int64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || y->shape[i] != x->shape[i + x->ndim - ndim]) ? 0 : x->strides[i + x->ndim - ndim]; + } + + int64_t *x_strides_d, *y_strides_d; + char *strides_and_shape_d; + checkMusaErrorWithCode(musaMalloc(&strides_and_shape_d, ndim * (2 * sizeof(int64_t) + sizeof(uint64_t))), STATUS_MEMORY_NOT_ALLOCATED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d, x_strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d + ndim * sizeof(int64_t), y->strides, ndim * sizeof(int64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkMusaErrorWithCode(musaMemcpy(strides_and_shape_d + 2 * ndim * sizeof(int64_t), y->shape, ndim * sizeof(uint64_t), musaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + + *desc_ptr = new ExpandMusaDescriptor{ + DevMthreadsGpu, + y->dt, + handle->device_id, + ndim, + y_data_size, + static_cast(handle->prop.maxGridSize[0]), + strides_and_shape_d, + }; + + delete[] x_strides; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyExpandDescriptor(ExpandMusaDescriptor_t desc) { + checkMusaErrorWithCode(musaFree((void *) desc->strides_and_shape_d), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/expand/musa/expand_musa.h b/src/ops/expand/musa/expand_musa.h new file mode 100644 index 00000000..8e4651e1 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.h @@ -0,0 +1,33 @@ +#ifndef __MUSA_EXPAND_H__ +#define __MUSA_EXPAND_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ExpandMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t y_data_size; + uint64_t max_grid_size; + char const *strides_and_shape_d; +}; + +typedef struct ExpandMusaDescriptor *ExpandMusaDescriptor_t; + +infiniopStatus_t 
musaCreateExpandDescriptor(MusaHandle_t, + ExpandMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaExpand(ExpandMusaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyExpandDescriptor(ExpandMusaDescriptor_t desc); + +#endif diff --git a/src/ops/expand/musa/expand_musa.mu b/src/ops/expand/musa/expand_musa.mu new file mode 100644 index 00000000..4b549541 --- /dev/null +++ b/src/ops/expand/musa/expand_musa.mu @@ -0,0 +1,58 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "expand_musa.h" + +template +__global__ void expand( + Tdata *y, + const Tdata *x, + const int64_t *y_strides, + const int64_t *x_strides, + const uint64_t *y_shape, + uint64_t y_data_size, + uint64_t ndim, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < y_data_size) { + uint64_t y_idx = getOffset(idx, ndim, y_shape, y_strides); + y[y_idx] = x[getDstOffset(y_idx, ndim, y_strides, x_strides)]; + } +} + +template +infiniopStatus_t expand_mt_gpu(ExpandMusaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->y_data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->y_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->y_data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const auto x_strides = reinterpret_cast(desc->strides_and_shape_d); + const auto y_strides = reinterpret_cast(desc->strides_and_shape_d + desc->ndim * sizeof(int64_t)); + const auto y_shape = reinterpret_cast(desc->strides_and_shape_d + 2 * desc->ndim * sizeof(int64_t)); + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->y_data_size; i += step) { + expand<<>>( + y_, x_, y_strides, x_strides, y_shape, i + desc->y_data_size, desc->ndim, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t musaExpand(ExpandMusaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return expand_mt_gpu(desc, y, x, stream); + } + if (desc->dtype == F32) { + return expand_mt_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/expand/operator.cc b/src/ops/expand/operator.cc new file mode 100644 index 00000000..b0374645 --- /dev/null +++ b/src/ops/expand/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/expand/expand.h" + +#ifdef ENABLE_CPU +#include "cpu/expand_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/expand.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/expand_musa.h" +#endif + + +__C infiniopStatus_t infiniopCreateExpandDescriptor( + infiniopHandle_t handle, + infiniopExpandDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateExpandDescriptor(handle, (ExpandCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateExpandDescriptor((CudaHandle_t) handle, (ExpandCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateExpandDescriptor((MusaHandle_t) 
handle, (ExpandMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopExpand(infiniopExpandDescriptor_t desc, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuExpand((ExpandCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaExpand((ExpandCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaExpand((ExpandMusaDescriptor_t) desc, y, x, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyExpandDescriptor((ExpandCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyExpandDescriptor((ExpandCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyExpandDescriptor((ExpandMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/gemm/operator.cc b/src/ops/gemm/operator.cc new file mode 100644 index 00000000..7036b032 --- /dev/null +++ b/src/ops/gemm/operator.cc @@ -0,0 +1,96 @@ +#include "../utils.h" +#include "ops/expand/expand.h" +#include "ops/gemm/gemm.h" +#include "ops/matmul/matmul.h" +#include "tensor/tensor_descriptor.h" + +struct _GEMMDescriptor { + Device device; + infiniopMatmulDescriptor_t matmul_desc; + infiniopExpandDescriptor_t expand_desc; + uint64_t workspace_size; +}; + +typedef struct _GEMMDescriptor *_GEMMDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGEMMDescriptor(infiniopHandle_t handle, + infiniopGEMMDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t c_desc, + float alpha, + float beta, + char transA, + char transB) { + // transpose a and b if needed + a_desc = transA ? permute(a_desc, {1, 0}) : a_desc; + b_desc = transB ? 
permute(b_desc, {1, 0}) : b_desc; + + // expand desc + infiniopExpandDescriptor_t expand_desc = nullptr; + + // c is optional, set beta to 0 when c is not provided + if (!c_desc || c_desc->ndim == 0 || c_desc->shape == nullptr || c_desc->shape[0] == 0) { + beta = 0; + } else { + expand_desc = new ExpandDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateExpandDescriptor(handle, &expand_desc, y_desc, c_desc), STATUS_SUCCESS); + } + + // matmul desc + infiniopMatmulDescriptor_t matmul_desc = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc, y_desc, alpha, a_desc, b_desc, beta), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc, &workspace_size), STATUS_SUCCESS); + + *(_GEMMDescriptor_t *) desc_ptr = new _GEMMDescriptor{ + handle->device, + matmul_desc, + expand_desc, + workspace_size, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetGEMMWorkspaceSize(infiniopGEMMDescriptor_t desc, uint64_t *size) { + *size = ((_GEMMDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGEMM(infiniopGEMMDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *a, + void const *b, + void const *c, + void *stream) { + auto _desc = (_GEMMDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + if (_desc->expand_desc != nullptr) { + CHECK_STATUS(infiniopExpand(_desc->expand_desc, + y, c, stream), + STATUS_SUCCESS); + } + + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc, + workspace, + workspace_size, + y, a, b, stream), + STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyGEMMDescriptor(infiniopGEMMDescriptor_t desc) { + if (((_GEMMDescriptor_t) desc)->expand_desc) { + CHECK_STATUS(infiniopDestroyExpandDescriptor(((_GEMMDescriptor_t) desc)->expand_desc), STATUS_SUCCESS); + } + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_GEMMDescriptor_t) desc)->matmul_desc), STATUS_SUCCESS); + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc new file mode 100644 index 00000000..7650e1fd --- /dev/null +++ b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.cc @@ -0,0 +1,84 @@ +#include "global_avg_pool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t, + GlobalAvgPoolCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim < 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t y_data_size = std::accumulate(y->shape, y->shape + 2, 1ULL, std::multiplies()); + uint64_t x_per_NC_data_size = std::accumulate(x->shape + 2, x->shape + ndim, 1ULL, std::multiplies()); + + *desc_ptr = new GlobalAvgPoolCpuDescriptor{ + DevCpu, + y->dt, + y_data_size, + x_per_NC_data_size, + }; + + return STATUS_SUCCESS; +} + 
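The descriptor created above reduces global average pooling to a simple indexing contract: y holds y_data_size = N * C values, and each one is the mean of one contiguous run of x_per_NC_data_size spatial elements of x. A minimal standalone reference sketch follows, independent of the library types and shown only to make that contract explicit; the double accumulator is an illustrative choice, not taken from this patch.

```C++
#include <cstdint>

// Reference-only sketch of the contract encoded by GlobalAvgPoolCpuDescriptor:
// y has n_times_c (= N * C) elements, each the mean of one contiguous block of
// `spatial` (= x_per_NC_data_size) values in a contiguous NC[D...] input buffer.
void global_avg_pool_reference(float *y, const float *x, uint64_t n_times_c, uint64_t spatial) {
    for (uint64_t i = 0; i < n_times_c; ++i) {
        double sum = 0.0;// accumulate in double here for illustration
        for (uint64_t j = 0; j < spatial; ++j) {
            sum += x[i * spatial + j];
        }
        y[i] = static_cast<float>(sum / spatial);
    }
}
```

The cpuGlobalAvgPool implementation that follows computes exactly this per (n, c) block, accumulating the F16 case in float via f16_to_f32 before converting back with f32_to_f16.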
+infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t global_avg_pool_cpu(GlobalAvgPoolCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + const auto x_size = desc->x_per_NC_data_size; + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->y_data_size; ++i) { + if constexpr (std::is_same::value) { + float sum = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, 0.0f, + [](float res, uint16_t value) { + return res + f16_to_f32(value); + }); + y_[i] = f32_to_f16(sum / x_size); + } else { + y_[i] = std::accumulate(x_ + i * x_size, x_ + (i + 1) * x_size, Tdata(0)) / x_size; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return global_avg_pool_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return global_avg_pool_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h new file mode 100644 index 00000000..f370a709 --- /dev/null +++ b/src/ops/global_avg_pool/cpu/global_avg_pool_cpu.h @@ -0,0 +1,29 @@ +#ifndef __CPU_GLOBAL_AVG_POOL_H__ +#define __CPU_GLOBAL_AVG_POOL_H__ + +#include "operators.h" +#include + +struct GlobalAvgPoolCpuDescriptor { + Device device; + DT dtype; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; +}; + +typedef struct GlobalAvgPoolCpuDescriptor *GlobalAvgPoolCpuDescriptor_t; + +infiniopStatus_t cpuCreateGlobalAvgPoolDescriptor(infiniopHandle_t, + GlobalAvgPoolCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuGlobalAvgPool(GlobalAvgPoolCpuDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t cpuDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCpuDescriptor_t desc); + +#endif diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cc b/src/ops/global_avg_pool/cuda/global_avg_pool.cc new file mode 100644 index 00000000..25d7acbe --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cc @@ -0,0 +1,197 @@ +#include "global_avg_pool.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t handle, + GlobalAvgPoolCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim <= 2 || ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (i < 2 && y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } else if (i >= 2 && y->shape[i] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + // use cuDNN lib call + if (x->ndim <= 4) { + int n = x->shape[0]; + int c = x->shape[1]; + int h = ndim == 3 ? 
1 : x->shape[2]; + int w = ndim == 3 ? x->shape[2] : x->shape[3]; + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptor for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), n, c, h, w)); + + // create and set tensor descriptor for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), n, c, 1, 1)); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPooling2dDescriptor(pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + h,// pooling window height + w,// pooling window width + 0,// vertical padding + 0,// horizontal padding + 1,// vertical Stride + 1 // horizontal stride + )); + float alpha = 1.0f, beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + + } else if (x->ndim <= 5) { + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads(ndim - 2); + std::vector strides(ndim - 2); + +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_shape[i] = static_cast(x->shape[i]); + x_strides[i] = static_cast(x->strides[i]); + y_shape[i] = static_cast(y->shape[i]); + y_strides[i] = static_cast(y->strides[i]); + if (i < ndim - 2) { + k_shape[i] = static_cast(x->shape[i + 2]); + pads[i] = 0; + strides[i] = 1; + } + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPoolingNdDescriptor(pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + ndim - 2, + k_shape.data(), + pads.data(), + strides.data())); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); + + float alpha = 1.0f, beta = 0.0f; + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + 0, + 0, + 0, + 0, + 0, + 0, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + + } else { + uint64_t y_data_size = std::accumulate(y->shape, y->shape + 2, 1ULL, std::multiplies()); + uint64_t x_per_NC_data_size = std::accumulate(x->shape + 2, x->shape + ndim, 
1ULL, std::multiplies()); + uint64_t data_size = y_data_size * x_per_NC_data_size; + + unsigned max_block_size = std::min(256, handle->prop.maxThreadsPerBlock); + uint64_t max_grid_size = static_cast(handle->prop.maxGridSize[0]); + uint64_t items_per_thread = data_size / (max_block_size * max_grid_size); + + *desc_ptr = new GlobalAvgPoolCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + data_size, + y_data_size, + x_per_NC_data_size, + max_block_size, + max_grid_size, + items_per_thread, + handle->cudnn_handles_t, + nullptr, + nullptr, + nullptr, + 0, + 0, + }; + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCudaDescriptor_t desc, uint64_t *size) { + *size = desc->ndim <= 5 ? 0 : (desc->dtype != F16 ? 0 : std::min(desc->dtype.size * 2, 8) * desc->y_data_size); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCudaDescriptor_t desc) { + if (desc->ndim <= 5) { + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyPoolingDescriptor(desc->pool_desc)); + } + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cu b/src/ops/global_avg_pool/cuda/global_avg_pool.cu new file mode 100644 index 00000000..ca5965ab --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cu @@ -0,0 +1,415 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "global_avg_pool.cuh" +#include +#include +#include + +namespace infini { + struct float2_t { + float x, y; + + __device__ float2_t() : x(0), y(0) {} + __device__ float2_t(int val) : x(static_cast(val)), y(static_cast(val)) {} + __device__ float2_t(const float2 &val) : x(val.x), y(val.y) {} + __device__ float2_t(const float2_t &other) : x(other.x), y(other.y) {} + __device__ float2_t(float x, float y) : x(x), y(y) {} + + __device__ float2_t &operator=(const float2_t &other) { + if (this != &other) { + this->x = other.x; + this->y = other.y; + } + return *this; + } + + __device__ float2_t operator+(const float2_t &other) const { + return float2_t{x + other.x, y + other.y}; + } + + __device__ float operator+(const float &other) const { + return x + y + other; + } + + __device__ float2_t &operator+=(const float2_t &other) { + x += other.x; + y += other.y; + return *this; + } + + __device__ float operator[](size_t index) const { + return index == 0 ? 
x : y; + } + }; + + struct half2 { + half x, y; + + __device__ half2 &operator=(const half2 &other) { + if (this != &other) { + this->x = other.x; + this->y = other.y; + } + return *this; + } + + __device__ half2 &operator=(const infini::float2_t &other) { + this->x = __float2half(other.x); + this->y = __float2half(other.y); + return *this; + } + + __device__ half2 operator+(const half2 &other) const { + return half2{__hadd(x, other.x), __hadd(y, other.y)}; + } + + __device__ half operator+(const half &other) const { + return __hadd(__hadd(x, y), other); + } + + __device__ half operator[](size_t index) const { + return __hadd(x, y); + } + }; + + struct half4 { + __half x, y, z, w; + + __device__ half4 operator+(const half4 &other) const { + return half4{__hadd(x, other.x), __hadd(y, other.y), __hadd(z, other.z), __hadd(w, other.w)}; + } + }; + + __device__ __forceinline__ infini::float2_t divide(infini::float2_t val, float divisor) { + return {val.x / divisor, val.y / divisor}; + } +}// namespace infini + + +struct half2float_functor { + __device__ __forceinline__ float operator()(half val) const { + return __half2float(val); + } +}; + +struct float2half_functor { + __device__ __forceinline__ half operator()(float val) const { + return __float2half(val); + } +}; + +struct half22float_functor { + __device__ __forceinline__ float operator()(infini::half2 val) const { + return __half2float(val.x) + __half2float(val.y); + } +}; + +struct float22half2_functor { + __device__ __forceinline__ infini::half2 operator()(const infini::float2_t &val) const { + return {__float2half(val.x), __float2half(val.y)}; + } +}; + +uint64_t getBlockDim(uint64_t size) { + if (size < static_cast(MAX_THREADS_PER_BLOCK)) { + return size; + } + for (size_t i = MAX_THREADS_PER_BLOCK; i > 1; --i) { + if (size % i == 0) { + return i; + } + } + return 1; +} + +/** ---------------------------------------- */ +/** --------------- Sum ----------------- */ +/** ---------------------------------------- */ + +template +__global__ void sum( + Ldata *__restrict__ y, + const Tdata *__restrict__ x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t blocks_per_y, + unsigned remainder, + uint64_t offset, + unsigned pack_size) { + uint64_t block_offset = blockIdx.x / blocks_per_y * x_per_NC_data_size + blockIdx.x % blocks_per_y * blockDim.x * pack_size; + uint64_t idx = block_offset + threadIdx.x * pack_size + offset; + + if (idx < data_size) { + Tdata thread_data[1]; + + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage load_temp_storage; + BlockOp(load_temp_storage).Load(x + block_offset, thread_data); + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + Ldata block_sum; + if constexpr (std::is_same::value) { + block_sum = BlockReduce(reduce_temp_storage).Sum(__half2float(thread_data[0]), blockDim.x); + } else { + block_sum = BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), blockDim.x); + } + + // add up the remaining elements + if (blockIdx.x % blocks_per_y == blocks_per_y - 1) { + __shared__ typename BlockOp::TempStorage load_r_temp_storage; + BlockOp(load_r_temp_storage).Load(x + block_offset + blockDim.x, thread_data, remainder, 0); + if constexpr (std::is_same::value) { + block_sum += __half2float(BlockReduce(reduce_temp_storage).Sum(__half2float(thread_data[0]), blockDim.x)); + } else { + block_sum += BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), remainder); + } + } + + __syncthreads(); + + if (threadIdx.x 
== 0) { + atomicAdd(&y[idx / x_per_NC_data_size], block_sum); + } + } +} + +template +void _sum_nv_gpu(Ydata *y, Xdata const *x, uint64_t data_size, uint64_t x_per_NC_data_size, + unsigned int pack_size, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(256); + dim3 gridDims = dim3(std::min(data_size / blockDims.x, max_grid_size)); + uint64_t blocks_per_y = x_per_NC_data_size / blockDims.x; + unsigned int remainder = x_per_NC_data_size % blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + + sum<<>>(y, x, data_size, x_per_NC_data_size, blocks_per_y, remainder, 0, pack_size); +} + +template +void sum_nv_gpu(void *y, void const *x, uint64_t data_size, uint64_t x_per_NC_data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + _sum_nv_gpu(y_, x_, data_size, x_per_NC_data_size, pack_size, max_grid_size, stream); +} + +/** ---------------------------------------- */ +/** -------------- Reset ---------------- */ +/** ---------------------------------------- */ +template +__global__ void reset( + Tdata *__restrict__ dst, + uint64_t data_size, + uint64_t offset, + unsigned int pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + dst[idx] = Tdata(0); + } +} + +template +void _reset_nv_gpu(Tdata *x, uint64_t data_size, unsigned int pack_size, uint64_t offset, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + reset<<>>(x, offset + data_size, offset + i, pack_size); + } +} + +template +void reset_nv_gpu(void *x, uint64_t data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto packed_data_size = data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + _reset_nv_gpu(x_vec, packed_data_size, pack_size, 0, max_grid_size, stream); + + const auto remainder = data_size % pack_size; + const auto x_ = reinterpret_cast(x); + _reset_nv_gpu(x_, remainder, 1, data_size * pack_size, max_grid_size, stream); +} + + +/** ---------------------------------------- */ +/** ------------- Average --------------- */ +/** ---------------------------------------- */ +template +__global__ void average( + Ydata *y, + Xdata const *x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t offset, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + if constexpr (std::is_same::value && std::is_same::value) { + y[idx] = __float2half(__half2float(x[idx]) / x_per_NC_data_size); + } else if constexpr (std::is_same::value) { + y[idx] = __float2half(x[idx] / x_per_NC_data_size); + } else if constexpr (std::is_same::value) { + y[idx] = __half2float(x[idx]) / x_per_NC_data_size; + } else { + y[idx] = x[idx] / x_per_NC_data_size; + } + } +} + +template +void _average_nv_gpu(Ydata *y, Xdata const *x, uint64_t data_size, uint64_t x_per_NC_data_size, + unsigned int pack_size, uint64_t offset, uint64_t max_grid_size, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = 
dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + average<<>>(y, x, offset + data_size, x_per_NC_data_size, offset + i, pack_size); + } +} + +template +void average_nv_gpu(void *y, void const *x, uint64_t data_size, uint64_t x_per_NC_data_size, unsigned int pack_size, uint64_t max_grid_size, void *stream) { + const auto packed_data_size = data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + _average_nv_gpu(y_vec, x_vec, packed_data_size, x_per_NC_data_size, pack_size, 0, max_grid_size, stream); + + const auto remainder = data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + _average_nv_gpu(y_, x_, remainder, x_per_NC_data_size, 1, data_size * pack_size, max_grid_size, stream); +} + + +/** ---------------------------------------- */ +/** --------- Global Avg Pool ----------- */ +/** ---------------------------------------- */ + +template +__global__ void global_avg_pool_padding( + Tdata *__restrict__ y, + Tdata const *__restrict__ x, + uint64_t data_size, + uint64_t x_per_NC_data_size, + uint64_t offset, + unsigned pack_size) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + Tdata thread_data[1]; + + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage load_temp_storage; + BlockOp(load_temp_storage).Load(x + blockIdx.x * blockDim.x, thread_data); + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + Ldata block_sum = BlockReduce(reduce_temp_storage).Sum(Ldata(thread_data[0]), blockDim.x); + + if (threadIdx.x == 0) { + y[blockIdx.x] = Tdata(block_sum / x_per_NC_data_size); + } + } +} + +template +void launch_global_avg_pool_padding(GlobalAvgPoolCudaDescriptor_t desc, Tdata *y, Tdata const *x, void *stream, unsigned pack_size) { + dim3 blockDims = dim3(std::min(static_cast(desc->max_block_size), desc->x_per_NC_data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + global_avg_pool_padding<<>>( + y, x, desc->data_size, desc->x_per_NC_data_size, i, pack_size); + } +} + + +template +void global_avg_pool_folding_direct(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *stream, unsigned pack_size) { + reset_nv_gpu(y, desc->y_data_size, pack_size, desc->max_grid_size, stream); + sum_nv_gpu(y, x, desc->data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); + average_nv_gpu(y, y, desc->y_data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); +} + +template +void global_avg_pool_folding_workspace(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *workspace, void *stream, unsigned pack_size) { + reset_nv_gpu(workspace, desc->y_data_size, pack_size, desc->max_grid_size, stream); + sum_nv_gpu(workspace, x, desc->data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); + average_nv_gpu(y, workspace, desc->y_data_size, desc->x_per_NC_data_size, pack_size, desc->max_grid_size, stream); +} + +// launch folding functions based on workspace size +template +void 
launch_global_avg_pool_folding(GlobalAvgPoolCudaDescriptor_t desc, void *y, void const *x, void *workspace, uint64_t workspace_size, void *stream, unsigned pack_size) { + if (workspace_size == 0) { + global_avg_pool_folding_direct(desc, y, x, stream, pack_size); + } else { + global_avg_pool_folding_workspace(desc, y, x, workspace, stream, pack_size); + } +} + +// global average pool for high dimensional data (ndim > 4) +template +void global_avg_pool_nv_gpu_hd(GlobalAvgPoolCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + if (desc->data_size == 0) { + return; + } + if (desc->x_per_NC_data_size <= desc->max_block_size) { + const auto y_ = reinterpret_cast(y); + const auto x_ = reinterpret_cast(x); + launch_global_avg_pool_padding(desc, y_, x_, stream, pack_size); + } else { + launch_global_avg_pool_folding(desc, y, x, workspace, workspace_size, stream, pack_size); + } +} + +template +infiniopStatus_t global_avg_pool_nv_gpu(GlobalAvgPoolCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream, unsigned pack_size) { + // use cuDNN lib + if (desc->ndim <= 5) { + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnPoolingForward(handle, desc->pool_desc, + &desc->alpha, desc->x_desc, x, &desc->beta, + desc->y_desc, y); })); + } else { + global_avg_pool_nv_gpu_hd(desc, workspace, workspace_size, y, x, stream, pack_size); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGlobalAvgPool(GlobalAvgPoolCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return global_avg_pool_nv_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + if (desc->dtype == F32) { + return global_avg_pool_nv_gpu(desc, workspace, workspace_size, y, x, stream, 1); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/global_avg_pool/cuda/global_avg_pool.cuh b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh new file mode 100644 index 00000000..cd97be5b --- /dev/null +++ b/src/ops/global_avg_pool/cuda/global_avg_pool.cuh @@ -0,0 +1,46 @@ +#ifndef __CUDA_GLOBAL_AVG_POOL_H__ +#define __CUDA_GLOBAL_AVG_POOL_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include +#include + +struct GlobalAvgPoolCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t y_data_size; + uint64_t x_per_NC_data_size; + unsigned max_block_size; + uint64_t max_grid_size; + uint64_t items_per_thread; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnPoolingDescriptor_t const pool_desc; + const float alpha; + const float beta; +}; + +typedef struct GlobalAvgPoolCudaDescriptor *GlobalAvgPoolCudaDescriptor_t; + +infiniopStatus_t cudaCreateGlobalAvgPoolDescriptor(CudaHandle_t, + GlobalAvgPoolCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaGetGlobalAvgPoolWorkspaceSize(GlobalAvgPoolCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaGlobalAvgPool(GlobalAvgPoolCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, void *y, void const *x, + void *stream); + +infiniopStatus_t 
cudaDestroyGlobalAvgPoolDescriptor(GlobalAvgPoolCudaDescriptor_t desc); + +#endif diff --git a/src/ops/global_avg_pool/operator.cc b/src/ops/global_avg_pool/operator.cc new file mode 100644 index 00000000..92484283 --- /dev/null +++ b/src/ops/global_avg_pool/operator.cc @@ -0,0 +1,95 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/global_avg_pool/global_avg_pool.h" + +#ifdef ENABLE_CPU +#include "cpu/global_avg_pool_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/global_avg_pool.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO: Cambricon +#endif + +__C infiniopStatus_t infiniopCreateGlobalAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopGlobalAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGlobalAvgPoolDescriptor(handle, (GlobalAvgPoolCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateGlobalAvgPoolDescriptor((CudaHandle_t) handle, (GlobalAvgPoolCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetGlobalAvgPoolWorkspaceSize((GlobalAvgPoolCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO: Cambricon support +#endif + } + return STATUS_BAD_DEVICE; +} + + +__C infiniopStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGlobalAvgPool((GlobalAvgPoolCpuDescriptor_t) desc, workspace, workspace_size, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGlobalAvgPool((GlobalAvgPoolCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyGlobalAvgPoolDescriptor((GlobalAvgPoolCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/matmul/ascend/matmul_aclnn.cc b/src/ops/matmul/ascend/matmul_aclnn.cc new file mode 100644 index 00000000..1502469e --- /dev/null +++ b/src/ops/matmul/ascend/matmul_aclnn.cc @@ -0,0 +1,137 @@ +#include "matmul_aclnn.h" + +MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + info = nullptr; + cDesc = new aclnnTensorDescriptor(); + aDesc = new aclnnTensorDescriptor(); + bDesc = new aclnnTensorDescriptor(); + alpha = 1.0; + beta = 0; + mt = 1; + workspaceSize = 0; +} + +infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, + MatmulAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + 
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta, + int8_t mt) { + DT dtype = c_desc->dt; + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + *desc_ptr = new MatmulAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + (*desc_ptr)->dtype = dtype; + (*desc_ptr)->mt = mt; + (*desc_ptr)->alpha = alpha; + (*desc_ptr)->beta = beta; + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = new MatmulInfo(c_desc, a_desc, b_desc, status, false); + if (*status != STATUS_SUCCESS) { + return *status; + } + (*desc_ptr)->info = info; + + auto &cDesc = (*desc_ptr)->cDesc; + auto &aDesc = (*desc_ptr)->aDesc; + auto &bDesc = (*desc_ptr)->bDesc; + + // Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched operation + CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS); + CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS); + + CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS); + + + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + + aclTensor *tc = cDesc->t; + aclTensor *ta = aDesc->t; + aclTensor *tb = bDesc->t; + + aclnnStatus ret; + + + int64_t transA = 0; + int64_t transB = 0; + // aclnnGemm support C = alpha * A @ B + beta * C + // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md + ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc, + (*desc_ptr)->mt, &workspaceSize, &executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnGemmGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + aclSetAclOpExecutorRepeatable(executor); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, + uint64_t *size) { + *size = desc->workspaceSize; + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + auto &cDesc = desc->cDesc; + auto &aDesc = desc->aDesc; + auto &bDesc = desc->bDesc; + + aclTensor *tc = cDesc->t; + aclTensor *ta = aDesc->t; + aclTensor *tb = bDesc->t; + + auto batch = desc->info->batch; + + auto &executor = desc->executor; + auto &workspaceSize = desc->workspaceSize; + + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + for (int i = 0; i < batch; i++) { + AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size); + aclnnStatus ret = aclnnGemm(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + return STATUS_SUCCESS; +} + + +infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) { + delete desc->cDesc; + delete desc->bDesc; + delete desc->aDesc; + delete desc->info; + aclDestroyAclOpExecutor(desc->executor); + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/ascend/matmul_aclnn.h b/src/ops/matmul/ascend/matmul_aclnn.h new file mode 100644 index 00000000..41ce92b0 --- /dev/null +++ b/src/ops/matmul/ascend/matmul_aclnn.h @@ -0,0 +1,55 @@ +#ifndef __ACLNN_MATMUL_H__ +#define __ACLNN_MATMUL_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "../blas.h" +#include "operators.h" +#include +#include +#include +#include + +struct MatmulAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor* executor; + MatmulInfo* info; + DT dtype; + aclnnTensorDescriptor_t cDesc, aDesc, bDesc; + // cubeMathType + // see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md + float alpha; + float beta; + int8_t mt; + uint64_t workspaceSize; + + MatmulAclnnDescriptor(Device _device); +}; + +typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle, + MatmulAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta, + int8_t cubeMathType); + +infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/matmul/bang/matmul_cnnl.cc b/src/ops/matmul/bang/matmul_cnnl.cc index 05a2760a..6b7948c1 100644 --- a/src/ops/matmul/bang/matmul_cnnl.cc +++ 
b/src/ops/matmul/bang/matmul_cnnl.cc @@ -1,19 +1,20 @@ #include "matmul_cnnl.h" +#include "../../../devices/bang/bang_handle.h" #include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" #include "../../utils.h" #include "cnrt.h" - -MatmulBangDescriptor::MatmulBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - auto info = MatmulInfo(c, a, b, false); - - int32_t use_stride = true; - +infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, + MatmulBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false); + if (*status != STATUS_SUCCESS) { + return *status; + } cnnlTensorDescriptor_t aDesc, bDesc, cDesc; cnnlCreateTensorDescriptor(&aDesc); cnnlCreateTensorDescriptor(&bDesc); @@ -28,36 +29,75 @@ void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulDescCreate(&opDesc); cnnlMatMulAlgoCreate(&algo); - cnnlCreateMatMulHeuristicResult(&algoResult); - + cnnlCreateMatMulHeuristicResult(&algoResult); + int32_t use_stride = true; cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride, sizeof(int32_t)); + *desc_ptr = new MatmulBangDescriptor{ + handle->device, + handle->device_id, + info, + alpha, + beta, + c_desc->dt, + handle->cnnl_handles, + aDesc, + bDesc, + cDesc, + opDesc, + algo, + algoResult}; + return STATUS_SUCCESS; +} +infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} +infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) { + desc->cnnl_handles = nullptr; + cnnlDestroyTensorDescriptor(desc->aDesc); + cnnlDestroyTensorDescriptor(desc->bDesc); + cnnlDestroyTensorDescriptor(desc->cDesc); + cnnlMatMulDescDestroy(desc->opDesc); + cnnlMatMulAlgoDestroy(desc->algo); + cnnlDestroyMatMulHeuristicResult(desc->algoResult); + delete desc; + return STATUS_SUCCESS; +} - void *workspace; +void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + if (info.is_transed) { + std::swap(a, b); + } - use_cnnl((cnrtQueue_t) stream, + use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream, [&](cnnlHandle_t handle) { int count = 0; - cnnlGetBatchMatMulAlgoHeuristic(handle, opDesc, aDesc, - bDesc, cDesc, - NULL, 1, &algoResult, &count); + cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc, + desc->bDesc, desc->cDesc, + NULL, 1, &desc->algoResult, &count); size_t wsSize; - cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &wsSize); + cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize); cnrtMalloc(&workspace, wsSize); - cnnlBatchMatMulBCast_v2(handle, opDesc, algo, - &alpha, aDesc, info.a_ptr, - bDesc, info.b_ptr, - &beta, cDesc, info.c_ptr, + cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo, + &alpha, desc->aDesc, a, + desc->bDesc, b, + &beta, desc->cDesc, c, workspace, wsSize); }); - - cnrtFree(workspace); - - cnnlDestroyTensorDescriptor(aDesc); - cnnlDestroyTensorDescriptor(bDesc); - cnnlDestroyTensorDescriptor(cDesc); - 
cnnlMatMulDescDestroy(opDesc); - cnnlMatMulAlgoDestroy(algo); - cnnlDestroyMatMulHeuristicResult(algoResult); +} +infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + float alpha = desc->alpha; + float beta = desc->beta; + if (dtype_eq(desc->dtype, F16)) { + matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream); + cnrtQueueSync((cnrtQueue_t)stream); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/bang/matmul_cnnl.h b/src/ops/matmul/bang/matmul_cnnl.h index 66ef8f71..70830450 100644 --- a/src/ops/matmul/bang/matmul_cnnl.h +++ b/src/ops/matmul/bang/matmul_cnnl.h @@ -1,6 +1,6 @@ #ifndef __CNNL_MATMUL_H__ #define __CNNL_MATMUL_H__ - +#include "../../../devices/bang/bang_handle.h" #include "../blas.h" #include "cnnl.h" #include "cnnl_extra.h" @@ -8,8 +8,34 @@ struct MatmulBangDescriptor { Device device; - MatmulBangDescriptor(Device device); + int device_id; + MatmulInfo info; + float alpha; + float beta; + DT dtype; + std::shared_ptr> cnnl_handles; + cnnlTensorDescriptor_t aDesc; + cnnlTensorDescriptor_t bDesc; + cnnlTensorDescriptor_t cDesc; + cnnlMatMulDescriptor_t opDesc; + cnnlMatMulAlgo_t algo; + cnnlMatMulHeuristicResult_t algoResult; }; +typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t; + +infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, + MatmulBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream); + +infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc); inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) { int ndim = matrix.ndim; @@ -33,6 +59,5 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat } } -void matmul_cnnl_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); #endif// __CNNL_MATMUL_H__ diff --git a/src/ops/matmul/blas.h b/src/ops/matmul/blas.h index 36fca6fd..7882dba2 100644 --- a/src/ops/matmul/blas.h +++ b/src/ops/matmul/blas.h @@ -17,31 +17,34 @@ typedef struct BlasMatrix { BlasMatrix() {} - BlasMatrix(TensorLayout *layout) { + BlasMatrix(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) { if (layout->ndim == 2) { this->ndim = 2; this->batch = 1; this->stride = 0; this->rows = layout->shape[0]; this->cols = layout->shape[1]; - this->row_stride = layout->strides[0] / layout->dt.size; - this->col_stride = layout->strides[1] / layout->dt.size; + this->row_stride = layout->strides[0]; + this->col_stride = layout->strides[1]; } else if (layout->ndim == 3) { this->ndim = 3; this->batch = layout->shape[0]; - this->stride = this->batch == 1 ? 0 : layout->strides[0] / layout->dt.size; + this->stride = this->batch == 1 ? 
0 : layout->strides[0]; this->rows = layout->shape[1]; this->cols = layout->shape[2]; - this->row_stride = layout->strides[1] / layout->dt.size; - this->col_stride = layout->strides[2] / layout->dt.size; + this->row_stride = layout->strides[1]; + this->col_stride = layout->strides[2]; } else { - PANIC(InvalidMatrixShape); + *status = STATUS_BAD_TENSOR_SHAPE; + return; } if (this->row_stride != 1 && this->col_stride != 1) { - ASSERT(false); - PANIC(MatrixIsNotContiguous); + *status = STATUS_BAD_TENSOR_STRIDES; + return; } + + *status = STATUS_SUCCESS; } bool match_batch(int batch) const { @@ -67,20 +70,23 @@ struct MatmulInfo { BlasMatrix b_matrix; BlasMatrix c_matrix; - void const *a_ptr; - void const *b_ptr; - void *c_ptr; - int m, n, k, batch; - MatmulInfo(Tensor c, Tensor a, Tensor b, bool col_major = true) { - a_matrix = BlasMatrix(a.layout); - b_matrix = BlasMatrix(b.layout); - c_matrix = BlasMatrix(c.layout); + bool is_transed = false; - a_ptr = a.data; - b_ptr = b.data; - c_ptr = c.data; + MatmulInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, infiniopStatus_t *status, bool col_major = true) { + a_matrix = BlasMatrix(a_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } + b_matrix = BlasMatrix(b_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } + c_matrix = BlasMatrix(c_desc, status); + if (*status != STATUS_SUCCESS) { + return; + } ASSERT_EQ(c_matrix.rows, a_matrix.rows);// m ASSERT_EQ(c_matrix.cols, b_matrix.cols);// n @@ -88,7 +94,8 @@ struct MatmulInfo { batch = c_matrix.batch; if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) { - PANIC(InvalidBatchSize); + *status = STATUS_BAD_PARAM; + return; } if ((col_major && c_matrix.col_stride == 1) || (!col_major && c_matrix.row_stride == 1)) { @@ -96,7 +103,7 @@ struct MatmulInfo { b_matrix.transpose(); a_matrix.transpose(); std::swap(a_matrix, b_matrix); - std::swap(a_ptr, b_ptr); + is_transed = true; } m = c_matrix.rows; diff --git a/src/ops/matmul/cpu/matmul_cpu.cc b/src/ops/matmul/cpu/matmul_cpu.cc index 000e0df0..2dcc9d2e 100644 --- a/src/ops/matmul/cpu/matmul_cpu.cc +++ b/src/ops/matmul/cpu/matmul_cpu.cc @@ -1,24 +1,94 @@ #include "matmul_cpu.h" #include "../../../devices/cpu/common_cpu.h" #include "../../utils.h" -#include "../blas.h" #include -void matmul_cpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha) { - auto info = MatmulInfo(c, a, b); +infiniopStatus_t cpuCreateMatmulDescriptor(CpuHandle_t handle, + MatmulCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulCpuDescriptor{ + DevCpu, + dtype, + info, + alpha, + beta}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) { + auto info = desc->info; + + if (info.is_transed) { + 
std::swap(a, b); + } for (int i = 0; i < info.batch; ++i) { for (int m_ = 0; m_ < info.m; ++m_) { for (int n_ = 0; n_ < info.n; ++n_) { - auto c_ = reinterpret_cast(info.c_ptr) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; + auto c_ = reinterpret_cast(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; float sum = 0; for (int k_ = 0; k_ < info.k; ++k_) { - auto a_ = reinterpret_cast(info.a_ptr) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; - auto b_ = reinterpret_cast(info.b_ptr) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; - sum += f16_to_f32(*a_) * f16_to_f32(*b_); + auto a_ = reinterpret_cast(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; + auto b_ = reinterpret_cast(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; + if constexpr (std::is_same::value) { + sum += f16_to_f32(*a_) * f16_to_f32(*b_); + } else { + sum += *a_ * (*b_); + } + } + if constexpr (std::is_same::value) { + if (beta == 0) { + *c_ = f32_to_f16(alpha * sum); + } else { + *c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum); + } + } else { + *c_ = beta * (*c_) + alpha * sum; } - *c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum); } } } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b) { + if (desc->dtype == F16) { + return matmul_cpu(desc, c, desc->beta, a, b, desc->alpha); + } + if (desc->dtype == F32) { + return matmul_cpu(desc, c, desc->beta, a, b, desc->alpha); + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/cpu/matmul_cpu.h b/src/ops/matmul/cpu/matmul_cpu.h index c1ddbc8f..3a5970e8 100644 --- a/src/ops/matmul/cpu/matmul_cpu.h +++ b/src/ops/matmul/cpu/matmul_cpu.h @@ -1,11 +1,37 @@ #ifndef __CPU_MATMUL_H__ #define __CPU_MATMUL_H__ +#include "../../../devices/cpu/cpu_handle.h" +#include "../blas.h" #include "operators.h" + typedef struct MatmulCpuDescriptor { Device device; + DT dtype; + MatmulInfo info; + float alpha; + float beta; } MatmulCpuDescriptor; -void matmul_cpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha); +typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t; + +infiniopStatus_t cpuCreateMatmulDescriptor(CpuHandle_t handle, + MatmulCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b); + +infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc); #endif// __CPU_MATMUL_H__ diff --git a/src/ops/matmul/cuda/matmul_cuda.cc b/src/ops/matmul/cuda/matmul_cuda.cc new file mode 100644 index 00000000..8bac48d4 --- /dev/null +++ b/src/ops/matmul/cuda/matmul_cuda.cc @@ -0,0 +1,44 @@ +#include "matmul_cuda.h" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateMatmulDescriptor(CudaHandle_t handle, + MatmulCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float 
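+                                            // (added note) beta scales the existing contents of c,
+                                            // i.e. c = alpha * (a @ b) + beta * c, matching the CPU
+                                            // reference implementation above.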
beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + info, + alpha, + beta, + handle->cublas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetMatmulWorkspaceSize(MatmulCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyMatmulDescriptor(MatmulCudaDescriptor_t desc) { + desc->cublas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/cuda/matmul_cuda.cu b/src/ops/matmul/cuda/matmul_cuda.cu index c7e25f81..fcbc755d 100644 --- a/src/ops/matmul/cuda/matmul_cuda.cu +++ b/src/ops/matmul/cuda/matmul_cuda.cu @@ -1,25 +1,36 @@ -#include "../../../devices/cuda/handle_pool.h" +#include "../../../devices/cuda/cuda_handle.h" #include "../../utils.h" #include "../blas.h" #include "matmul_cuda.h" #include #include -MatmulCudaDescriptor::MatmulCudaDescriptor(Device device) { - this->device = device; - get_cublas_pool(); -} +template +infiniopStatus_t matmul_cuda(MatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; -void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - auto info = MatmulInfo(c, a, b); + if (info.is_transed) { + std::swap(a, b); + } - auto alpha_f16 = __float2half(alpha); - auto beta_f16 = __float2half(beta); + cudaDataType a_type, b_type, c_type; + cublasComputeType_t compute_type; + if constexpr (std::is_same::value) { + a_type = b_type = c_type = CUDA_R_16F; + compute_type = CUBLAS_COMPUTE_32F; + } else { + a_type = b_type = c_type = CUDA_R_32F; +#ifdef ENABLE_SUGON_DCU + compute_type = CUBLAS_COMPUTE_32F; +#else + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; +#endif + } auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = info.b_matrix.row_stride == 1 ? 
CUBLAS_OP_N : CUBLAS_OP_T; - use_cublas((cudaStream_t) stream, + use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t) stream, [&](cublasHandle_t handle) { cublasGemmStridedBatchedEx( handle, op_a, @@ -27,21 +38,38 @@ void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, vo info.m, info.n, info.k, - &alpha_f16, - info.a_ptr, - CUDA_R_16F, + &alpha, + a, + a_type, info.a_matrix.ld(), info.a_matrix.stride, - info.b_ptr, - CUDA_R_16F, + b, + b_type, info.b_matrix.ld(), info.b_matrix.stride, - &beta_f16, - info.c_ptr, - CUDA_R_16F, + &beta, + c, + c_type, info.c_matrix.ld(), info.c_matrix.stride, info.batch, - CUBLAS_COMPUTE_16F, + compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaMatmul(MatmulCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_cuda(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_cuda(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/matmul/cuda/matmul_cuda.h b/src/ops/matmul/cuda/matmul_cuda.h index 77760e27..3e82c1ed 100644 --- a/src/ops/matmul/cuda/matmul_cuda.h +++ b/src/ops/matmul/cuda/matmul_cuda.h @@ -1,13 +1,41 @@ -#ifndef __NV_GPU_MATMUL_H__ -#define __NV_GPU_MATMUL_H__ +#ifndef __CUDA_MATMUL_H__ +#define __CUDA_MATMUL_H__ +#include "../../../devices/cuda/cuda_handle.h" +#include "../blas.h" #include "operators.h" +#include typedef struct MatmulCudaDescriptor { Device device; - MatmulCudaDescriptor(Device device); + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> cublas_handles_t; } MatmulCudaDescriptor; -void matmul_nv_gpu_f16(Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream); +typedef struct MatmulCudaDescriptor *MatmulCudaDescriptor_t; -#endif// __NV_GPU_MATMUL_H__ +infiniopStatus_t cudaCreateMatmulDescriptor(CudaHandle_t handle, + MatmulCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t cudaGetMatmulWorkspaceSize(MatmulCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaMatmul(MatmulCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cudaDestroyMatmulDescriptor(MatmulCudaDescriptor_t desc); + +#endif// __CUDA_MATMUL_H__ diff --git a/src/ops/matmul/maca/matmul_maca.cc b/src/ops/matmul/maca/matmul_maca.cc new file mode 100644 index 00000000..2d6658f7 --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.cc @@ -0,0 +1,44 @@ +#include "matmul_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateMatmulDescriptor(MacaHandle_t handle, + MatmulMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulMacaDescriptor{ + DevMetaxGpu, + dtype, + handle->device_id, + 
info, + alpha, + beta, + handle->mcblas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetMatmulWorkspaceSize(MatmulMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyMatmulDescriptor(MatmulMacaDescriptor_t desc) { + desc->mcblas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/maca/matmul_maca.h b/src/ops/matmul/maca/matmul_maca.h new file mode 100644 index 00000000..2264cdc4 --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.h @@ -0,0 +1,41 @@ +#ifndef __MACA_MATMUL_H__ +#define __MACA_MATMUL_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "../blas.h" +#include "operators.h" +#include + +typedef struct MatmulMacaDescriptor { + Device device; + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> mcblas_handles_t; +} MatmulMacaDescriptor; + +typedef struct MatmulMacaDescriptor *MatmulMacaDescriptor_t; + +infiniopStatus_t macaCreateMatmulDescriptor(MacaHandle_t handle, + MatmulMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t macaGetMatmulWorkspaceSize(MatmulMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaMatmul(MatmulMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t macaDestroyMatmulDescriptor(MatmulMacaDescriptor_t desc); + +#endif// __MACA_MATMUL_H__ diff --git a/src/ops/matmul/maca/matmul_maca.maca b/src/ops/matmul/maca/matmul_maca.maca new file mode 100644 index 00000000..d944c85a --- /dev/null +++ b/src/ops/matmul/maca/matmul_maca.maca @@ -0,0 +1,77 @@ +#include "../../../devices/maca/maca_handle.h" +#include "../../utils.h" +#include "../blas.h" +#include "matmul_maca.h" +#include +#include + +template +infiniopStatus_t matmul_maca(MatmulMacaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + + if (info.is_transed) { + std::swap(a, b); + } + + Tdata alpha_, beta_; + hpccDataType a_type, b_type, c_type; + hcblasComputeType_t compute_type; + + if constexpr (std::is_same::value) { + alpha_ = __float2half(alpha); + beta_ = __float2half(beta); + a_type = b_type = c_type = HPCC_R_16F; + compute_type = HCBLAS_COMPUTE_16F; + } else { + alpha_ = alpha; + beta_ = beta; + a_type = b_type = c_type = HPCC_R_32F; + compute_type = HCBLAS_COMPUTE_32F_FAST_TF32; + } + + auto op_a = info.a_matrix.row_stride == 1 ? HCBLAS_OP_N : HCBLAS_OP_T; + auto op_b = info.b_matrix.row_stride == 1 ? 
HCBLAS_OP_N : HCBLAS_OP_T; + + use_mcblas(desc->mcblas_handles_t, desc->device_id, (hcStream_t) stream, + [&](hcblasHandle_t handle) { hcblasGemmStridedBatchedEx( + handle, + op_a, + op_b, + info.m, + info.n, + info.k, + &alpha_, + a, + a_type, + info.a_matrix.ld(), + info.a_matrix.stride, + b, + b_type, + info.b_matrix.ld(), + info.b_matrix.stride, + &beta_, + c, + c_type, + info.c_matrix.ld(), + info.c_matrix.stride, + info.batch, + compute_type, + HCBLAS_GEMM_DEFAULT_TENSOR_OP); }); + return STATUS_SUCCESS; +} + +infiniopStatus_t macaMatmul(MatmulMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_maca(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_maca(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/matmul/musa/matmul_musa.cc b/src/ops/matmul/musa/matmul_musa.cc new file mode 100644 index 00000000..3256dca6 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.cc @@ -0,0 +1,48 @@ +#include "matmul_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include +#include + +#include + +infiniopStatus_t musaCreateMatmulDescriptor(MusaHandle_t handle, + MatmulMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + DT dtype = c_desc->dt; + + if (dtype != F16 && dtype != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; + auto info = MatmulInfo(c_desc, a_desc, b_desc, status); + if (*status != STATUS_SUCCESS) { + return *status; + } + + *desc_ptr = new MatmulMusaDescriptor{ + DevMthreadsGpu, + dtype, + handle->device_id, + info, + alpha, + beta, + handle->mublas_handles_t}; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetMatmulWorkspaceSize(MatmulMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyMatmulDescriptor(MatmulMusaDescriptor_t desc) { + desc->mublas_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/matmul/musa/matmul_musa.h b/src/ops/matmul/musa/matmul_musa.h new file mode 100644 index 00000000..b086a494 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.h @@ -0,0 +1,45 @@ +#ifndef __MUSA_MATMUL_H__ +#define __MUSA_MATMUL_H__ + +#include +#include +#include +#include +#include +#include "../blas.h" +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +typedef struct MatmulMusaDescriptor { + Device device; + DT dtype; + int device_id; + MatmulInfo info; + float alpha; + float beta; + std::shared_ptr> mublas_handles_t; +} MatmulMusaDescriptor; + +typedef struct MatmulMusaDescriptor *MatmulMusaDescriptor_t; + +infiniopStatus_t musaCreateMatmulDescriptor(MusaHandle_t handle, + MatmulMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta); + +infiniopStatus_t musaGetMatmulWorkspaceSize(MatmulMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaMatmul(MatmulMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t musaDestroyMatmulDescriptor(MatmulMusaDescriptor_t desc); + +#endif // __MUSA_MATMUL_H__ diff 
--git a/src/ops/matmul/musa/matmul_musa.mu b/src/ops/matmul/musa/matmul_musa.mu new file mode 100644 index 00000000..b445a7b3 --- /dev/null +++ b/src/ops/matmul/musa/matmul_musa.mu @@ -0,0 +1,77 @@ +#include "../../../devices/musa/musa_handle.h" +#include "../../utils.h" +#include "../blas.h" +#include "matmul_musa.h" +#include +#include + +template +infiniopStatus_t matmul_musa(MatmulMusaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { + auto info = desc->info; + + if (info.is_transed) { + std::swap(a, b); + } + + Tdata alpha_, beta_; + musaDataType_t a_type, b_type, c_type; + mublasComputeType_t compute_type; + + if constexpr (std::is_same::value) { + alpha_ = __float2half(alpha); + beta_ = __float2half(beta); + a_type = b_type = c_type = MUSA_R_16F; + compute_type = MUBLAS_COMPUTE_16F; + } else { + alpha_ = alpha; + beta_ = beta; + a_type = b_type = c_type = MUSA_R_32F; + compute_type = MUBLAS_COMPUTE_32F_FAST_TF32; + } + + auto op_a = info.a_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T; + auto op_b = info.b_matrix.row_stride == 1 ? MUBLAS_OP_N : MUBLAS_OP_T; + + use_mublas(desc->mublas_handles_t, desc->device_id, (MUstream) stream, + [&](mublasHandle_t handle) { mublasGemmStridedBatchedEx( + handle, + op_a, + op_b, + info.m, + info.n, + info.k, + &alpha_, + a, + a_type, + info.a_matrix.ld(), + info.a_matrix.stride, + b, + b_type, + info.b_matrix.ld(), + info.b_matrix.stride, + &beta_, + c, + c_type, + info.c_matrix.ld(), + info.c_matrix.stride, + info.batch, + compute_type, + MUBLAS_GEMM_DEFAULT);}); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaMatmul(MatmulMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream) { + if (desc->dtype == F16) { + return matmul_musa(desc, c, desc->beta, a, b, desc->alpha, stream); + } + if (desc->dtype == F32) { + return matmul_musa(desc, c, desc->beta, a, b, desc->alpha, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/matmul/operator.cc b/src/ops/matmul/operator.cc index d323d009..5fa766eb 100644 --- a/src/ops/matmul/operator.cc +++ b/src/ops/matmul/operator.cc @@ -11,74 +11,172 @@ #ifdef ENABLE_CAMBRICON_MLU #include "bang/matmul_cnnl.h" #endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/matmul_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/matmul_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/matmul_musa.h" +#endif -struct MatmulDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, + infiniopMatmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + float alpha, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + float beta) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateMatmulDescriptor((CpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateMatmulDescriptor((CudaHandle_t) handle, (MatmulCudaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateMatmulDescriptor((AscendHandle_t) handle, + (MatmulAclnnDescriptor_t *) desc_ptr, + c_desc, + alpha, + a_desc, + 
b_desc, + beta, + 1); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateMatmulDescriptor((MacaHandle_t) handle, (MatmulMacaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateMatmulDescriptor((MusaHandle_t) handle, (MatmulMusaDescriptor_t *) desc_ptr, c_desc, alpha, a_desc, b_desc, beta); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C MatmulDescriptor *createMatmulDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (MatmulDescriptor *) (new MatmulCpuDescriptor{device}); + return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: { - return (MatmulDescriptor *) (new MatmulCudaDescriptor(device)); + return cudaGetMatmulWorkspaceSize((MatmulCudaDescriptor_t) desc, size); } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (MatmulDescriptor *) (new MatmulBangDescriptor(device)); + return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetMatmulWorkspaceSize((MatmulMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetMatmulWorkspaceSize((MatmulMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyMatmulDescriptor(MatmulDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (MatmulCpuDescriptor *) (descriptor); - break; + return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - delete (MatmulCudaDescriptor *) (descriptor); - break; + return cudaMatmul((MatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (MatmulBangDescriptor *) (descriptor); - break; + return bangMatmul((MatmulBangDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return aclnnMatmul((MatmulAclnnDescriptor_t) desc, + workspace, + workspace_size, + c, + a, + b, + stream); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaMatmul((MatmulMacaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaMatmul((MatmulMusaDescriptor_t) desc, workspace, workspace_size, c, a, b, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void matmul(MatmulDescriptor *descriptor, Tensor c, float beta, Tensor a, Tensor b, float alpha, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - matmul_cpu_f16(c, beta, a, b, alpha); - break; + return 
cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - matmul_nv_gpu_f16(c, beta, a, b, alpha, stream); - break; + case DevNvGpu: { + return cudaDestroyMatmulDescriptor((MatmulCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - matmul_cnnl_f16(c, beta, a, b, alpha, stream); - break; + case DevCambriconMlu: { + return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyMatmulDescriptor((MatmulMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyMatmulDescriptor((MatmulMusaDescriptor_t) desc); + } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/max_pool/operator.cc b/src/ops/max_pool/operator.cc new file mode 100644 index 00000000..2644f8bd --- /dev/null +++ b/src/ops/max_pool/operator.cc @@ -0,0 +1,54 @@ +#include "../pooling/pooling.h" +#include "../utils.h" +#include "ops/max_pool/max_pool.h" + +struct _MaxPoolDescriptor { + Device device; + infiniopPoolingDescriptor_t pooling_desc; + uint64_t workspace_size; +}; + +typedef struct _MaxPoolDescriptor *_MaxPoolDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n) { + infiniopPoolingDescriptor_t pooling_desc; + CHECK_STATUS(infiniopCreatePoolingDescriptor(handle, &pooling_desc, y, x, kernel_shape, pads, strides, n, 0), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetPoolingWorkspaceSize(pooling_desc, &workspace_size), STATUS_SUCCESS); + + *(_MaxPoolDescriptor_t *) desc_ptr = new _MaxPoolDescriptor{ + handle->device, + pooling_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, uint64_t *size) { + *size = ((_MaxPoolDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_MaxPoolDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopPooling(_desc->pooling_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyPoolingDescriptor(((_MaxPoolDescriptor_t) desc)->pooling_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mlp/operator.cc b/src/ops/mlp/operator.cc new file mode 100644 index 00000000..48475bb2 --- /dev/null +++ b/src/ops/mlp/operator.cc @@ -0,0 +1,130 @@ +#include "../utils.h" +#include "ops/matmul/matmul.h" +#include "ops/mlp/mlp.h" +#include "ops/swiglu/swiglu.h" +#include "tensor/tensor_descriptor.h" + +struct _MLPDescriptor { + Device device; + infiniopMatmulDescriptor_t matmul_desc1; + infiniopMatmulDescriptor_t matmul_desc2; + infiniopSwiGLUDescriptor_t swiglu_desc; + uint64_t 
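+    // (added note) byte offset of the second half of each row of the matmul1 (x @ w12) output
+    // within the workspace; infiniopSwiGLU below reads its two (strided) inputs from
+    // workspace and workspace + this offset.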
w2_offset_by_bytes; + uint64_t workspace_size; + uint64_t matmul1_workspace_size; + uint64_t matmul2_workspace_size; + uint64_t matmul1_tensor_size; + uint64_t swiglu_tensor_size; +}; + +typedef struct _MLPDescriptor *_MLPDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle, + infiniopMLPDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w12_desc, + infiniopTensorDescriptor_t w3_desc, + float alpha, + char residual) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w12_desc->ndim != 2 || w3_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (x_desc->strides[1] != 1 || y_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + // matmul1 desc + infiniopTensorDescriptor_t desc1 = new TensorDescriptor; + uint64_t shape1[2] = {x_desc->shape[0], w12_desc->shape[1]};// [num_tokens, 2 * intermediate_size] + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc1, 2, shape1, nullptr, x_desc->dt), STATUS_SUCCESS); + infiniopMatmulDescriptor_t matmul_desc1 = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc1, desc1, 1.0, x_desc, w12_desc, 0.0), STATUS_SUCCESS); + uint64_t matmul1_tensor_size = get_byte_size(desc1); + uint64_t matmul1_workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc1, &matmul1_workspace_size), STATUS_SUCCESS); + + // swiglu desc + infiniopTensorDescriptor_t desc2 = new TensorDescriptor; + uint64_t w2_offset_by_bytes = w12_desc->shape[1] / 2 * w12_desc->dt.size; + uint64_t shape2[2] = {x_desc->shape[0], w12_desc->shape[1] / 2};// [num_tokens, itermediate_size] + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc2, 2, shape2, nullptr, x_desc->dt), STATUS_SUCCESS); + infiniopTensorDescriptor_t desc3 = new TensorDescriptor; + int64_t strides3[2] = {desc1->strides[0], desc1->strides[1]}; + CHECK_STATUS(infiniopCreateTensorDescriptor(&desc3, 2, shape2, strides3, x_desc->dt), STATUS_SUCCESS); + infiniopSwiGLUDescriptor_t swiglu_desc = new SwiGLUDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateSwiGLUDescriptor(handle, &swiglu_desc, desc2, desc3, desc3), STATUS_SUCCESS); + uint64_t swiglu_tensor_size = get_byte_size(desc2); + + // matmul2 desc + infiniopMatmulDescriptor_t matmul_desc2 = new MatmulDescriptor{handle->device}; + CHECK_STATUS(infiniopCreateMatmulDescriptor(handle, &matmul_desc2, y_desc, alpha, desc2, w3_desc, residual ? 
1.0 : 0.0), STATUS_SUCCESS); + uint64_t matmul2_workspace_size = 0; + CHECK_STATUS(infiniopGetMatmulWorkspaceSize(matmul_desc2, &matmul2_workspace_size), STATUS_SUCCESS); + + // calculate workspace size + uint64_t workspace_size = std::max(std::max(matmul1_workspace_size + matmul1_tensor_size, + matmul1_tensor_size + swiglu_tensor_size), + swiglu_tensor_size + matmul2_workspace_size); + + // create descriptor + *(_MLPDescriptor_t *) desc_ptr = new _MLPDescriptor{ + handle->device, + matmul_desc1, + matmul_desc2, + swiglu_desc, + w2_offset_by_bytes, + workspace_size, + matmul1_workspace_size, + matmul2_workspace_size, + matmul1_tensor_size, + swiglu_tensor_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, uint64_t *size) { + // compute order: matmul1, swiglu, matmul2 + *size = ((_MLPDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w12, + void const *w3, + void *stream) { + auto _desc = (_MLPDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc1, + (char *) workspace + _desc->matmul1_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size, + workspace, x, w12, stream), + STATUS_SUCCESS); + CHECK_STATUS(infiniopSwiGLU(_desc->swiglu_desc, + (char *) workspace + _desc->matmul1_tensor_size, + (char *) workspace + _desc->w2_offset_by_bytes, + workspace, stream), + STATUS_SUCCESS); + CHECK_STATUS(infiniopMatmul(_desc->matmul_desc2, (char *) workspace + _desc->matmul1_tensor_size + _desc->swiglu_tensor_size, + _desc->workspace_size - _desc->matmul1_tensor_size - _desc->swiglu_tensor_size, + y, (char *) workspace + _desc->matmul1_tensor_size, w3, stream), + STATUS_SUCCESS); + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_MLPDescriptor_t) desc)->matmul_desc1), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroyMatmulDescriptor(((_MLPDescriptor_t) desc)->matmul_desc2), STATUS_SUCCESS); + CHECK_STATUS(infiniopDestroySwiGLUDescriptor(((_MLPDescriptor_t) desc)->swiglu_desc), STATUS_SUCCESS); + + return STATUS_SUCCESS; +} diff --git a/src/ops/pooling/cpu/pooling_cpu.cc b/src/ops/pooling/cpu/pooling_cpu.cc new file mode 100644 index 00000000..3c783c14 --- /dev/null +++ b/src/ops/pooling/cpu/pooling_cpu.cc @@ -0,0 +1,258 @@ +#include "pooling_cpu.h" +#include "../../utils.h" + +// get the total number of elements in arr +inline uint64_t getTotalSize(const uint64_t *arr, uint64_t ndim) { + return std::accumulate(arr, arr + ndim, 1ULL, std::multiplies()); +} + +// check if padding is needed +inline bool requirePadding(uint64_t const *pads, uint64_t ndim) { + return std::any_of(pads, pads + ndim - 2, + [](uint64_t pad) { return pad > 0; }); +} + +infiniopStatus_t cpuCreatePoolingDescriptor(infiniopHandle_t, + PoolingCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != n + 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || x->shape[1] != y->shape[1]) { + return 
STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (pooling_type > 1) { + return STATUS_BAD_PARAM; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + const auto y_size = getTotalSize(y->shape, ndim); + const auto padded_x_size = requirePadding(pads, ndim) ? getPaddedSize(ndim, x->shape, pads) : 0; + uint64_t *x_shape = new uint64_t[ndim]; + uint64_t *y_shape = new uint64_t[ndim]; + uint64_t *kernel_ = new uint64_t[n]; + uint64_t *pads_ = new uint64_t[n]; + int64_t *strides_ = new int64_t[n]; + memcpy(x_shape, x->shape, ndim * sizeof(uint64_t)); + memcpy(y_shape, y->shape, ndim * sizeof(uint64_t)); + for (size_t i = 0; i < n; ++i) { + kernel_[i] = kernel_shape[i]; + pads_[i] = pads[i]; + strides_[i] = strides[i]; + } + + *desc_ptr = new PoolingCpuDescriptor{ + DevCpu, + y->dt, + ndim, + y_size, + padded_x_size, + x_shape, + kernel_, + y_shape, + pads_, + strides_, + pooling_type, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetPoolingWorkspaceSize(PoolingCpuDescriptor_t desc, uint64_t *size) { + *size = desc->padded_x_size * desc->dt.size; + if (desc->dt == F16) { + *size += desc->y_size * sizeof(float); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyPoolingDescriptor(PoolingCpuDescriptor_t desc) { + delete[] desc->x_shape; + delete[] desc->y_shape; + delete[] desc->k_shape; + delete[] desc->pads; + delete[] desc->strides; + delete desc; + return STATUS_SUCCESS; +} + +// initialize the padded input with the data from the original input +template +void fillPaddedInput(PoolingCpuDescriptor_t desc, uint64_t const *padded_x_shape, + Tdata *padded_x, Tdata const *x, + uint64_t const *pads, uint64_t x_index, + uint64_t padded_x_index, uint64_t ndim) { + const auto x_shape = desc->x_shape[ndim]; + const auto padded_x_shape_ = padded_x_shape[ndim]; + const auto x_base_index = x_index * x_shape; + const auto padded_x_base_index = padded_x_index * padded_x_shape_ + + (x_shape == padded_x_shape_ ? 0 : pads[ndim - 2]); + + for (size_t i = 0; i < x_shape; ++i) { + // base case (last dimension) + if (ndim == desc->ndim - 1) { + padded_x[padded_x_base_index + i] = x[x_base_index + i]; + } + // recursive case + else { + fillPaddedInput(desc, padded_x_shape, padded_x, x, pads, x_base_index + i, + padded_x_base_index + i, ndim + 1); + } + } +} + +// perform the a singleton pooling operation depending on the data type and pooling type +template +inline void pool(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, + uint64_t const *x_shape, uint64_t curr_x_index, uint64_t y_index) { + switch (desc->pooling_mode) { + // 0. Max pooling + case 0: + if constexpr (std::is_same::value) { + y[y_index] = std::fmax(f16_to_f32(x[curr_x_index]), y[y_index]); + } else { + y[y_index] = std::max(x[curr_x_index], y[y_index]); + } + break; + // 1. 
Average pooling + default: + if constexpr (std::is_same::value) { + y[y_index] += f16_to_f32(x[curr_x_index]); + } else { + y[y_index] += x[curr_x_index]; + } + } +} + +// Recursive convolution function +template +void _applyPooling(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, + uint64_t const *x_shape, uint64_t x_index, uint64_t y_index, + uint64_t ndim) { + const auto dim_size = x_shape[ndim]; + const auto kernel_size = desc->k_shape[ndim - 2]; + const auto dilation = 1; + const auto stride = desc->strides[ndim - 2]; + const auto steps = + (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1; + x_index *= dim_size; + y_index *= desc->y_shape[ndim]; + + // perform all the pooling along this axis + for (size_t i = 0; i < steps; ++i, ++y_index) { + // perform a single pooling + for (size_t k = 0; k < kernel_size; ++k) { + // calculate the current indices + const auto curr_x_index = x_index + i * stride + k * dilation; + + // base case (last dimension) + if (ndim == desc->ndim - 1) { + pool(desc, y, x, x_shape, curr_x_index, y_index); + } + // recursive case + else { + _applyPooling(desc, y, x, x_shape, curr_x_index, y_index, ndim + 1); + } + } + } +} + +template +void applyPooling(PoolingCpuDescriptor_t desc, Ydata *y, Xdata const *x, uint64_t const *x_shape) { +#pragma omp parallel for collapse(2) + // batch + for (size_t i = 0; i < x_shape[0]; ++i) { + + // channel + for (size_t j = 0; j < x_shape[1]; ++j) { + uint64_t x_index = i * x_shape[1] + j; + uint64_t y_index = i * desc->y_shape[1] + j; + _applyPooling(desc, y, x, x_shape, x_index, y_index, 2); + } + } + + // if is average pooling, take the average + if (desc->pooling_mode == 1) { + Ydata num_kernel_elements = getTotalSize(desc->k_shape, desc->ndim - 2); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y[i] /= num_kernel_elements; + } + } +} + +template +void _pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + Ydata *y, Xdata const *x) { + if (desc->padded_x_size > 0) { + auto padded_x = reinterpret_cast(workspace); + std::vector padded_shape_(desc->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_x, padded_x + desc->padded_x_size, 0); + getPaddedShape(desc->ndim, desc->x_shape, desc->pads, padded_shape); + fillPaddedInput(desc, padded_shape, padded_x, x, desc->pads, 0, 0, 0); + applyPooling(desc, y, padded_x, padded_shape); + } else { + applyPooling(desc, y, x, desc->x_shape); + } +} + +// Pooling function +template +infiniopStatus_t pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x) { + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + std::fill(y_, y_ + desc->y_size, 0); + _pooling_cpu(desc, workspace, workspace_size, y_, x_); + return STATUS_SUCCESS; +} + +// sepcial case for fp16 (uint16_t) +template<> +infiniopStatus_t pooling_cpu(PoolingCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x) { + auto y_ = reinterpret_cast(workspace); + auto x_ = reinterpret_cast(x); + std::fill(y_, y_ + desc->y_size, 0); + + _pooling_cpu(desc, y_ + desc->y_size, workspace_size, y_, x_); + + // copy data from y_ to y + auto y_16 = reinterpret_cast(y); +#pragma omp parallel for + for (size_t i = 0; i < desc->y_size; ++i) { + y_16[i] = f32_to_f16(y_[i]); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuPooling(PoolingCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream) { + if 
(desc->dt == F16) { + return pooling_cpu(desc, workspace, workspace_size, y, x); + } + if (desc->dt == F32) { + return pooling_cpu(desc, workspace, workspace_size, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/pooling/cpu/pooling_cpu.h b/src/ops/pooling/cpu/pooling_cpu.h new file mode 100644 index 00000000..5f70f82c --- /dev/null +++ b/src/ops/pooling/cpu/pooling_cpu.h @@ -0,0 +1,48 @@ +#ifndef __CPU_POOLING_H__ +#define __CPU_POOLING_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include + +struct PoolingCpuDescriptor { + Device device; + DataLayout dt; + uint64_t ndim; + uint64_t y_size; + uint64_t padded_x_size; + uint64_t const *x_shape; + uint64_t const *k_shape; + uint64_t const *y_shape; + uint64_t const *pads; + int64_t const *strides; + int pooling_mode; +}; + +typedef struct PoolingCpuDescriptor *PoolingCpuDescriptor_t; + +infiniopStatus_t cpuCreatePoolingDescriptor(infiniopHandle_t handle, + PoolingCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +infiniopStatus_t cpuGetPoolingWorkspaceSize(PoolingCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuPooling(PoolingCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream); + +infiniopStatus_t cpuDestroyPoolingDescriptor(PoolingCpuDescriptor_t desc); + +#endif diff --git a/src/ops/pooling/cuda/pooling.cc b/src/ops/pooling/cuda/pooling.cc new file mode 100644 index 00000000..0cf45d64 --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cc @@ -0,0 +1,167 @@ +#include "pooling.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include + +infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, + PoolingCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + uint64_t ndim = y->ndim; + if (ndim < 3 || ndim != x->ndim || ndim != n + 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (x->shape[0] != y->shape[0] || x->shape[1] != y->shape[1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (pooling_type > 1) { + return STATUS_BAD_PARAM; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + float alpha = 1.0f, beta = 0.0f; + + if (ndim <= 4) { + + int xn = x->shape[0]; + int xc = x->shape[1]; + int xh = ndim == 3 ? 1 : x->shape[2]; + int xw = ndim == 3 ? x->shape[2] : x->shape[3]; + int yh = ndim == 3 ? 1 : y->shape[2]; + int yw = ndim == 3 ? y->shape[2] : y->shape[3]; + const auto kernel_ = reinterpret_cast(kernel_shape); + const auto pads_ = reinterpret_cast(pads); + const auto strides_ = reinterpret_cast(strides); + int kh = ndim == 3 ? 1 : kernel_[0]; + int kw = ndim == 3 ? kernel_[0] : kernel_[1]; + int ph = ndim == 3 ? 0 : pads_[0]; + int pw = ndim == 3 ? pads_[0] : pads_[1]; + int sh = ndim == 3 ? 1 : strides_[0]; + int sw = ndim == 3 ? 
strides_[0] : strides_[1]; + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), xn, xc, xh, xw)); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPooling2dDescriptor(pool_desc, + getPoolingMode(pooling_type), + CUDNN_NOT_PROPAGATE_NAN, + kh,// pooling window height + kw,// pooling window width + ph,// vertical padding + pw,// horizontal padding + sh,// vertical Stride + sw // horizontal stride + )); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPooling2dForwardOutputDim(pool_desc, x_desc, &xn, &xc, &yh, &yw)); + checkCudnnError(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, static_cast(tensor_dt), xn, xc, yh, yw)); + + *desc_ptr = new PoolingCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + } else { + std::vector x_shape(ndim); + std::vector x_strides(ndim); + std::vector y_shape(ndim); + std::vector y_strides(ndim); + std::vector k_shape(ndim - 2); + std::vector pads_int(ndim - 2); + std::vector strides_int(ndim - 2); + +#pragma omp parallel for + for (size_t i = 0; i < ndim; ++i) { + x_shape[i] = static_cast(x->shape[i]); + x_strides[i] = static_cast(x->strides[i]); + y_shape[i] = static_cast(y->shape[i]); + y_strides[i] = static_cast(y->strides[i]); + if (i < ndim - 2) { + k_shape[i] = static_cast(kernel_shape[i]); + pads_int[i] = static_cast(pads[i]); + strides_int[i] = static_cast(strides[i]); + } + } + + // get the data types of the tensors and the conv operator + CREATE_CHECK_ERROR(auto tensor_dt = dataTypeMap[x->dt], tensor_dt, -1, STATUS_BAD_PARAM); + + // create and set tensor descriptors for x + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, static_cast(tensor_dt), ndim, x_shape.data(), x_strides.data())); + + // Create and set pooling descriptor for average pooling + cudnnPoolingDescriptor_t pool_desc; + checkCudnnError(cudnnCreatePoolingDescriptor(&pool_desc)); + checkCudnnError(cudnnSetPoolingNdDescriptor(pool_desc, + getPoolingMode(pooling_type), + CUDNN_NOT_PROPAGATE_NAN, + ndim - 2, + k_shape.data(), + pads_int.data(), + strides_int.data())); + // create and set tensor descriptors for y + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + checkCudnnError(cudnnGetPoolingNdForwardOutputDim(pool_desc, x_desc, ndim, y_shape.data())); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, static_cast(tensor_dt), ndim, y_shape.data(), y_strides.data())); + + *desc_ptr = new PoolingCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + x_desc, + y_desc, + pool_desc, + alpha, + beta, + }; + return STATUS_SUCCESS; + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetPoolingWorkspaceSize(PoolingCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t 
cudaDestroyPoolingDescriptor(PoolingCudaDescriptor_t desc) { + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyPoolingDescriptor(desc->pool_desc)); + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/pooling/cuda/pooling.cu b/src/ops/pooling/cuda/pooling.cu new file mode 100644 index 00000000..bac683c5 --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cu @@ -0,0 +1,20 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "pooling.cuh" + +infiniopStatus_t pooling_nv_gpu(PoolingCudaDescriptor_t desc, void *y, void const *x, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnPoolingForward(handle, desc->pool_desc, + &desc->alpha, desc->x_desc, x, &desc->beta, + desc->y_desc, y); })); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaPooling(PoolingCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream) { + if (desc->dtype == F16 || desc->dtype == F32) { + return pooling_nv_gpu(desc, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/pooling/cuda/pooling.cuh b/src/ops/pooling/cuda/pooling.cuh new file mode 100644 index 00000000..dd080e1e --- /dev/null +++ b/src/ops/pooling/cuda/pooling.cuh @@ -0,0 +1,54 @@ +#ifndef __CUDA_POOLING_H__ +#define __CUDA_POOLING_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include + +struct PoolingCudaDescriptor { + Device device; + DT dtype; + int device_id; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t const x_desc; + cudnnTensorDescriptor_t const y_desc; + cudnnPoolingDescriptor_t const pool_desc; + const float alpha; + const float beta; +}; + +typedef struct PoolingCudaDescriptor *PoolingCudaDescriptor_t; + +infiniopStatus_t cudaCreatePoolingDescriptor(CudaHandle_t handle, + PoolingCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +infiniopStatus_t cudaGetPoolingWorkspaceSize(PoolingCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaPooling(PoolingCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void *stream); + +infiniopStatus_t cudaDestroyPoolingDescriptor(PoolingCudaDescriptor_t desc); + +inline cudnnPoolingMode_t getPoolingMode(int pooling_type) { + switch (pooling_type) { + case 0: + return CUDNN_POOLING_MAX; + case 1: + return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + default: + return CUDNN_POOLING_MAX; + } +} + +#endif// __CUDA_POOLING_H__ diff --git a/src/ops/pooling/operator.cc b/src/ops/pooling/operator.cc new file mode 100644 index 00000000..4772be52 --- /dev/null +++ b/src/ops/pooling/operator.cc @@ -0,0 +1,101 @@ +#include "../utils.h" +#include "operators.h" +#include "pooling.h" + +#ifdef ENABLE_CPU +#include "cpu/pooling_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/pooling.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO +#endif + +__C infiniopStatus_t infiniopCreatePoolingDescriptor( + infiniopHandle_t handle, + infiniopPoolingDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + 
infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreatePoolingDescriptor(handle, (PoolingCpuDescriptor_t *) desc_ptr, y, x, kernel_shape, pads, strides, n, pooling_type); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreatePoolingDescriptor((CudaHandle_t) handle, (PoolingCudaDescriptor_t *) desc_ptr, y, x, kernel_shape, pads, strides, n, pooling_type); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetPoolingWorkspaceSize(infiniopPoolingDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetPoolingWorkspaceSize((PoolingCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetPoolingWorkspaceSize((PoolingCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopPooling(infiniopPoolingDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuPooling((PoolingCpuDescriptor_t) desc, workspace, workspace_size, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaPooling((PoolingCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyPoolingDescriptor(infiniopPoolingDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyPoolingDescriptor((PoolingCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyPoolingDescriptor((PoolingCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/pooling/pooling.h b/src/ops/pooling/pooling.h new file mode 100644 index 00000000..b57856f0 --- /dev/null +++ b/src/ops/pooling/pooling.h @@ -0,0 +1,27 @@ +#ifndef POOLING_H +#define POOLING_H + +#include "export.h" +#include "operators.h" + +typedef struct PoolingDescriptor { + Device device; +} PoolingDescriptor; +typedef PoolingDescriptor *infiniopPoolingDescriptor_t; + +__C infiniopStatus_t infiniopCreatePoolingDescriptor(infiniopHandle_t handle, + infiniopPoolingDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + uint64_t const *kernel_shape, + uint64_t const *pads, + int64_t const *strides, + uint64_t n, + int pooling_type); + +__C infiniopStatus_t infiniopGetPoolingWorkspaceSize(infiniopPoolingDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopPooling(infiniopPoolingDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C infiniopStatus_t infiniopDestroyPoolingDescriptor(infiniopPoolingDescriptor_t desc); +#endif diff --git a/src/ops/random_sample/ascend/random_sample.cc b/src/ops/random_sample/ascend/random_sample.cc new file mode 100644 index 00000000..b16159dc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.cc @@ -0,0 +1,153 @@ +#include "random_sample.h" + +RandomSampleAscendDescriptor::RandomSampleAscendDescriptor(Device _device) { + device = _device; + device_id = 0; + pDesc = new 
aclnnTensorDescriptor(); + topkIdxDesc = new aclnnTensorDescriptor(); + topkValDesc = new aclnnTensorDescriptor(); + resDesc = new aclnnTensorDescriptor(); +} + +infiniopStatus_t ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + if (result->ndim != 1 && result->shape[0] != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr) = new RandomSampleAscendDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + CHECK_STATUS((*desc_ptr)->pDesc->fromInfiniOpTensorDescriptor(probs), STATUS_SUCCESS); + CHECK_STATUS((*desc_ptr)->resDesc->fromInfiniOpTensorDescriptor(result), STATUS_SUCCESS); + // Ascend aclnnTopk doesn't support U64 type + (*desc_ptr)->resDesc->dataType = aclDataType::ACL_INT64; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size) { + auto &pDesc = desc->pDesc; + *size = numElements(pDesc->shape.data(), pDesc->ndim) * aclDataTypeSize(pDesc->dataType) + + numElements(pDesc->shape.data(), pDesc->ndim) * sizeof(I64); + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (topk <= 0 || topp < 0 || topp > 1.0) { + return STATUS_BAD_PARAM; + } + + if (random_val < 0 || random_val > 1.0) { + return STATUS_BAD_PARAM; + } + + auto &pDesc = desc->pDesc; + auto &topkIdxDesc = desc->topkIdxDesc; + auto &topkValDesc = desc->topkValDesc; + auto ndim = static_cast(pDesc->ndim); + auto voc = pDesc->shape[0]; + auto topk_ = topk <= voc ? topk : voc; + bool doSample = topk_ > 1 && temperature != 0 && topp != 0; + + auto topkShape = std::vector(pDesc->shape); + topkShape[ndim - 1] = doSample ? topk_ : 1; + + auto topkStrides = std::vector(pDesc->strides); + // Infer contiguous strides + topkStrides[ndim - 1] = 1; + for (int64_t i = ndim - 2; i >= 0; --i) { + topkStrides[i] = topkStrides[i + 1] * topkShape[i + 1]; + } + + CHECK_STATUS(topkValDesc->setDescriptor(pDesc->dataType, topkShape, topkStrides), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->setDescriptor(aclDataType::ACL_INT64, topkShape, topkStrides), STATUS_SUCCESS); + + // Infer data ptr + auto workspaceTmp = workspace; + auto topkValAddr = workspaceTmp; + workspaceTmp = (void *) ((uint8_t *) workspace + + numElements(topkValDesc->shape.data(), topkValDesc->ndim) * aclDataTypeSize(topkValDesc->dataType)); + auto topkIdxAddr = workspaceTmp; + auto pAddr = (void *) probs; + + // Create aclTensor + CHECK_STATUS(pDesc->createTensor(pAddr), STATUS_SUCCESS); + CHECK_STATUS(topkValDesc->createTensor(topkValAddr), STATUS_SUCCESS); + CHECK_STATUS(topkIdxDesc->createTensor(topkIdxAddr), STATUS_SUCCESS); + if (!doSample) { + CHECK_STATUS(desc->resDesc->createTensor(result), STATUS_SUCCESS); + } + + // Do Topk calculate + uint64_t topkWorkspaceSize = 0; + aclOpExecutor *topkExecutor = nullptr; + auto ret = aclnnTopkGetWorkspaceSize(pDesc->t, + topkShape[ndim - 1], + ndim - 1, + true, + true, + topkValDesc->t, + doSample ? 
topkIdxDesc->t + : desc->resDesc->t, + &topkWorkspaceSize, + &topkExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopkGetWorkspaceSize failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + void *topkWorkspace; + CHECK_STATUS(mallocWorkspace(&topkWorkspace, topkWorkspaceSize), STATUS_SUCCESS); + ret = aclnnTopk(topkWorkspace, + topkWorkspaceSize, + topkExecutor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnTopk failed ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + CHECK_STATUS(freeWorkspace(topkWorkspace), STATUS_SUCCESS); + + if (doSample) { + // Do softmax and topp random sample + CHECK_STATUS(random_sample_do( + pAddr, + result, + topkValAddr, + topkIdxAddr, + topk, + static_cast(pDesc->shape[0]), + topp, + temperature, + random_val, + pDesc->dataType, + stream), + STATUS_SUCCESS); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc) { + delete desc->pDesc; + delete desc->topkIdxDesc; + delete desc->topkValDesc; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/ascend/random_sample.h b/src/ops/random_sample/ascend/random_sample.h new file mode 100644 index 00000000..1ecc16fc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample.h @@ -0,0 +1,52 @@ +#ifndef __ASCEND_RANDOM_SAMPLE_H__ +#define __ASCEND_RANDOM_SAMPLE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include + + +struct RandomSampleAscendDescriptor { + Device device; + int device_id; + aclnnTensorDescriptor_t pDesc; + aclnnTensorDescriptor_t topkValDesc; + aclnnTensorDescriptor_t topkIdxDesc; + aclnnTensorDescriptor_t resDesc; + RandomSampleAscendDescriptor(Device _device); +}; + +typedef struct RandomSampleAscendDescriptor *RandomSampleAscendDescriptor_t; + +infiniopStatus_t ascendCreateRandomSampleDescriptor(AscendHandle_t handle, + RandomSampleAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t ascendGetRandomSampleWorkspaceSize(RandomSampleAscendDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t ascendRandomSample(RandomSampleAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t ascendDestroyRandomSampleDescriptor(RandomSampleAscendDescriptor_t desc); + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream); + +#endif diff --git a/src/ops/random_sample/ascend/random_sample_kernel.cpp b/src/ops/random_sample/ascend/random_sample_kernel.cpp new file mode 100644 index 00000000..18b482bc --- /dev/null +++ b/src/ops/random_sample/ascend/random_sample_kernel.cpp @@ -0,0 +1,232 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" + +using namespace AscendC; + +template +class KernelRandomSample { +public: + __aicore__ inline KernelRandomSample() {} + __aicore__ inline void Init(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + + topk = topk_; + voc = voc_; + topp = topp_; + temperature = temper_; + random = random_; + blockSize = 256 * 2; + + // 
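The stride handling above fills topkStrides with a reverse scan so that the temporary top-k value and index tensors are dense. The same inference, pulled out as a stand-alone illustrative helper (not part of the patch):

```C++
#include <cstdint>
#include <vector>

// Illustrative helper (not in the patch): row-major strides for a dense tensor,
// equivalent to the reverse scan used above for the aclnnTopk value/index tensors.
std::vector<int64_t> contiguousStrides(const std::vector<int64_t> &shape) {
    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = static_cast<int64_t>(shape.size()) - 2; i >= 0; --i) {
        strides[i] = strides[i + 1] * shape[i + 1];
    }
    return strides;
}
```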
CumSumInfo + if (sizeof(T) == sizeof(float)) { + topkAligned = (topk + 7) / 8 * 8; + vocAligned = (voc + 7) / 8 * 8; + } else { + topkAligned = (topk + 15) / 16 * 16; + vocAligned = (voc + 15) / 16 * 16; + } + topkIdxAligned = (topk + 3) / 4 * 4; + + // Set Gm + pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc); + topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk); + topkIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(topkIdxAddr), topk); + resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1); + + // Global input and output + pipe.InitBuffer(pQue, 1, vocAligned * sizeof(T)); + pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T)); + pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t)); + pipe.InitBuffer(resQue, 1, 32);// 32 bytes for aligned + + pipe.InitBuffer(softMaxBuf1, blockSize); + pipe.InitBuffer(softMaxBuf2, blockSize); + pipe.InitBuffer(softMaxBuf3, blockSize); + pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T)); + + pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + // Softmax + __aicore__ inline void SoftMax(LocalTensor &valIn, + LocalTensor &topkValIn, + LocalTensor &softMaxOut) { + int32_t repeatTimes = vocAligned * sizeof(T) / blockSize; + int32_t remainder = vocAligned * sizeof(T) % blockSize / sizeof(T); + int32_t tileLength = blockSize / sizeof(T); + float negMax = -static_cast(topkValIn(0)); + float invTemperature = 1.0f / temperature; + float sum = 0.f; + float sum_s = 0.f; + LocalTensor tmpBuffer = softMaxBuf1.Get(); + LocalTensor tmpBuffer2 = softMaxBuf2.Get(); + LocalTensor tmpBuffer3 = softMaxBuf3.Get(); + for (int32_t i = 0; i < repeatTimes; i++) { + Adds(tmpBuffer, valIn[i * tileLength], static_cast(negMax), tileLength); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), tileLength); + Exp(tmpBuffer3, tmpBuffer2, tileLength); + sum_s = 0.f; + for (int j = 0; j < tileLength; ++j) { + sum_s += static_cast(tmpBuffer3(j)); + } + sum += sum_s; + } + if (remainder != 0) { + Adds(tmpBuffer, valIn[repeatTimes * tileLength], static_cast(negMax), remainder); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), remainder); + Exp(tmpBuffer3, tmpBuffer2, remainder); + sum_s = 0.f; + for (int i = 0; i < remainder; ++i) { + sum_s += static_cast(tmpBuffer3(i)); + } + sum += sum_s; + } + float invSum = 1.0f / sum; + Adds(tmpBuffer, topkValIn, static_cast(negMax), topk); + Muls(tmpBuffer2, tmpBuffer, static_cast(invTemperature), topk); + Exp(tmpBuffer3, tmpBuffer2, topk); + Muls(softMaxOut, tmpBuffer3, static_cast(invSum), topk); + } + + // Cumsum + __aicore__ inline void InclusiveSum(LocalTensor &topkValIn, + LocalTensor &topkValOut) { + static constexpr CumSumConfig cumSumConfig{true, false, false}; + LocalTensor lastRowLocal; + CumSum(topkValOut, lastRowLocal, topkValIn, + {1, static_cast(topkAligned)}); + } + + // Random sample + __aicore__ inline void RandomSample(LocalTensor &valIn, + LocalTensor &Index, + LocalTensor &result) { + int end = 0; + for (end = 0; end < topk; end++) { + if (static_cast(valIn(end)) >= topp) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + auto randomVal = random * static_cast(valIn(end - 1)); + for (int i = 0; i < end; i++) { + if (randomVal < static_cast(valIn(i))) { + result(0) = Index(i); + return; + } + } + result(0) = Index(end - 1); + } + + __aicore__ inline void CopyIn() { + LocalTensor pLocal = pQue.AllocTensor(); + 
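For readers less familiar with the tiled AscendC style, the arithmetic carried out by the SoftMax and InclusiveSum stages above can be summarized by a plain host-side reference: the softmax denominator is accumulated over the whole vocabulary, only the top-k numerators are kept, and an inclusive cumulative sum is formed for the later top-p draw. A sketch of that computation in ordinary C++ (float only, illustrative names, not part of the kernel):

```C++
#include <cmath>
#include <cstddef>
#include <vector>

// Hedged reference: temperature softmax with a full-vocabulary denominator,
// evaluated only for the top-k logits, followed by the inclusive cumulative sum.
// `logits` is the whole vocabulary; `topk_vals` holds the k largest logits in
// descending order, so topk_vals[0] is the global maximum subtracted for stability.
std::vector<float> topkSoftmaxCumsum(const std::vector<float> &logits,
                                     const std::vector<float> &topk_vals,
                                     float temperature) {
    const float max_logit = topk_vals[0];
    float denom = 0.f;
    for (float v : logits) {
        denom += std::exp((v - max_logit) / temperature);
    }
    std::vector<float> cum(topk_vals.size());
    float running = 0.f;
    for (std::size_t i = 0; i < topk_vals.size(); ++i) {
        running += std::exp((topk_vals[i] - max_logit) / temperature) / denom;
        cum[i] = running;
    }
    return cum;
}
```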
LocalTensor topkValLocal = topkQue.AllocTensor(); + LocalTensor topkIdxLocal = topkIdxQue.AllocTensor(); + + DataCopy(pLocal, pGm, vocAligned); + DataCopy(topkValLocal, topkGm, topkAligned); + DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned); + + pQue.EnQue(pLocal); + topkQue.EnQue(topkValLocal); + topkIdxQue.EnQue(topkIdxLocal); + } + + __aicore__ inline void Compute() { + // Get input data + LocalTensor pLocal = pQue.DeQue(); + LocalTensor topkValLocal = topkQue.DeQue(); + + // SoftMax + LocalTensor softMaxOutLocal = softMaxOutBuf.Get(); + SoftMax(pLocal, topkValLocal, softMaxOutLocal); + + // InclusiveSum + LocalTensor inclusiveOutLocal = inclusiveSumOutBuf.Get(); + InclusiveSum(softMaxOutLocal, inclusiveOutLocal); + + // randomSample + LocalTensor topkIdxLocal = topkIdxQue.DeQue(); + LocalTensor resultLocal = resQue.AllocTensor(); + RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal); + + pQue.FreeTensor(pLocal); + topkQue.FreeTensor(topkValLocal); + topkIdxQue.FreeTensor(topkIdxLocal); + resQue.EnQue(resultLocal); + } + __aicore__ inline void CopyOut() { + LocalTensor resLocal = resQue.DeQue(); + DataCopy(resGm, resLocal, 32 / sizeof(int64_t)); + resQue.FreeTensor(resLocal); + } + +private: + GlobalTensor pGm; + GlobalTensor topkGm; + GlobalTensor topkIdxGm; + GlobalTensor resGm; + + TPipe pipe; + + TQue pQue; + TQue topkQue; + TQue topkIdxQue; + TQue resQue; + + TBuf softMaxBuf1; + TBuf softMaxBuf2; + TBuf softMaxBuf3; + TBuf softMaxOutBuf; + + TBuf inclusiveSumOutBuf; + + // Kernel params + int32_t topk; + int32_t voc; + float topp; + float temperature; + float random; + + int32_t topkAligned; + int32_t topkIdxAligned; + int32_t vocAligned; + int32_t blockSize; +}; + +extern "C" __global__ __aicore__ void +random_sample_kernel_f16(GM_ADDR p, GM_ADDR res, GM_ADDR topkAddr, + GM_ADDR topkIdxAddr, int32_t topk_, int32_t voc_, + float topp_, float temper_, float random_) { + KernelRandomSample op; + op.Init(p, res, topkAddr, topkIdxAddr, topk_, voc_, topp_, temper_, random_); + op.Process(); +} + +extern "C" infiniopStatus_t +random_sample_do(void *p, void *res, void *topkAddr, void *topkIdxAddr, + int32_t topk, int32_t voc, float topp, float temper, + float random, int dtype, void *stream) { + + switch (dtype) { + case 0: + return STATUS_SUCCESS; + case 1: + random_sample_kernel_f16<<<1, nullptr, stream>>>( + p, res, topkAddr, topkIdxAddr, topk, voc, topp, temper, random); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.cc b/src/ops/random_sample/bang/random_sample_bang.cc new file mode 100644 index 00000000..ed1945da --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.cc @@ -0,0 +1,39 @@ +#include "random_sample_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, + RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(probs->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleBangDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t 
bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)) + sizeof(desc->dtype); + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyRandomSampleDescriptor(RandomSampleBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/bang/random_sample_bang.h b/src/ops/random_sample/bang/random_sample_bang.h new file mode 100644 index 00000000..de830fbf --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.h @@ -0,0 +1,39 @@ +#ifndef __BANG_RANDOM_SAMPLE_H__ +#define __BANG_RANDOM_SAMPLE_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct RandomSampleBangDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleBangDescriptor *RandomSampleBangDescriptor_t; + +infiniopStatus_t bangCreateRandomSampleDescriptor(BangHandle_t handle, + RandomSampleBangDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t bangDestroyRandomSampleDescriptor(RandomSampleBangDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/bang/random_sample_bang.mlu b/src/ops/random_sample/bang/random_sample_bang.mlu new file mode 100644 index 00000000..eb6f636f --- /dev/null +++ b/src/ops/random_sample/bang/random_sample_bang.mlu @@ -0,0 +1,512 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "random_sample_bang.h" +#include "../../../devices/bang/common_bang.h" +#include + +const int SRC_MAX_SIZE = 1024 * 32; +__nram__ char nram_buffer[NRAM_MAX_SIZE]; +template +__mlu_global__ void random_sampleX(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + int wSize = 128 / sizeof(T); + int segNum = maxNum / wSize; + + T temInv = 1.0 / static_cast(temperature); + + int remainT = voc % taskDim; + int stepEasy = (voc - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? 
taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + taskDim * topk) * sizeof(T); + uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum],必须要求maxNum >= max{step, topk} + uint64_t *indGlobal = srcInd + maxNum;//[taskDim * topk] + + __sync_all(); + + T *src = (T *)nram_buffer;//[maxNum],必须要求maxNum >= max{step, topk} + T *destSum = src + maxNum;//[maxNum] + T *destSumFinal = destSum + maxNum;//[wSize] + T *srcGlobal = destSumFinal + wSize;//[taskDim * topk] + __bang_write_value(src, maxNum, -INFINITY); + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + + + + if(step){ + for(int i = 0; i < step; i++){ + srcInd[i] = indStart + i; + } + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + if(step >= topk){ + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < step; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + } + } + else{ + for(int i = step; i < topk; i++){ + src[i] = -INFINITY; + srcInd[i] = -1; + } + } + __memcpy(globalTopk + taskId * topk, src, topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram + taskId * topk, srcInd, topk * sizeof(uint64_t), NRAM2GDRAM); + __sync_all(); + } + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); + __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < taskDim * topk; j++){ + if(srcGlobal[i] < srcGlobal[j]){ + T tmpg = srcGlobal[i]; + srcGlobal[i] = srcGlobal[j]; + srcGlobal[j] = tmpg; + + uint64_t indexTmpg = indGlobal[i]; + indGlobal[i] = indGlobal[j]; + indGlobal[j] = indexTmpg; + } + } + } + __memcpy(globalTopk, srcGlobal, taskDim * topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram, indGlobal, taskDim * topk * sizeof(uint64_t), NRAM2GDRAM); + } + __sync_all(); + T globalM = globalTopk[0]; + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + if(step){ + __bang_write_value(src, maxNum, globalM); + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, maxNum); + __bang_mul_scalar(src, src, temInv, maxNum); + __bang_active_exp_less_0(src, src, maxNum); + __bang_add(destSum, destSum, src, maxNum); + } + if(maxNum >= wSize){ + for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 + for(int i = 0; i < strip ; i++){ + __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); + } + } + + __bang_reduce_sum(destSumFinal, destSum, wSize); + } + else{ + for(int i = 0; i < maxNum; i++){ + destSumFinal[0] += destSum[i]; + } + } + if(step){ + destSumFinal[0] = destSumFinal[0] - (maxNum - step);//把上面多加的(maxNum - step)减掉 + } + globalSum[0] = 0.0; + + __sync_all(); + __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 + + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 + + + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); + __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); + + __bang_write_zero(destSum, 2 * topk); + destSum[0] = srcGlobal[0]; + for(int i = 1; i < topk; i++){ + destSum[i] = 
destSum[i - 1] + srcGlobal[i]; + } + + int end = 0; + for(end = 0; end < topk; end++){ + if(destSum[end] >= static_cast(topp)){ + break; + } + } + if(end < topk - 1){ + end += 1; + } + else{ + end = topk; + } + + random_val *= destSum[end - 1]; + for(int i = 0; i < end; i++){ + if(random_val < destSum[i]){ + indices[0] = indGdram[i]; + break; + } + } + __memcpy(globalTopk, srcGlobal, topk * sizeof(T), NRAM2GDRAM); + } +} + +template +__mlu_global__ void random_sampleD(T const *source, uint64_t *indices, uint64_t *indGdram, T *globalTopk, T *globalSum, float random_val, float topp, int topk, float temperature, int voc){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + + int wSize = 128 / sizeof(T); + int segNum = maxNum / wSize; + + T temInv = 1.0 / static_cast(temperature); + int taskSize = taskDim * maxNum; + int remain = voc % taskSize; + int repeat = (voc - remain) / taskSize; + + int remainT = remain % taskDim; + int stepEasy = (remain - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + char *nram_bufferInd = nram_buffer + (2 * maxNum + wSize + 2 * topk + taskDim * topk) * sizeof(T); + uint64_t *srcInd = (uint64_t *)nram_bufferInd;//[maxNum] + uint64_t *topkInd = srcInd + maxNum;//[2 * topk] + uint64_t *indGlobal = topkInd + 2 * topk; + __bang_write_zero(topkInd, 2 * topk); + + T *src = (T *)nram_buffer;//[maxNum] + T *srcTopk = src + maxNum;//[2 * topk] + T *destSum = srcTopk + 2 * topk;//[maxNum] + T *destSumFinal = destSum + maxNum;//[wSize] + T *srcGlobal = destSumFinal + wSize;//[taskDim * topk] + for(int i = 0; i < 2 * topk; i++){ + srcTopk[i] = -INFINITY;//不能使用__bang_write_value + } + for(int j = 0; j < maxNum; j++){ + srcInd[j] = taskId * maxNum + j; + } + for(int r = 0; r < repeat; r++){ + if(r > 0){ + __bang_add_scalar(srcInd, srcInd, taskSize, maxNum);//每次都在上一次基础上增加taskSize + } + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < maxNum; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + + } + for(int i = 0; i < topk; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < 2 * topk; j++){ + if(srcTopk[i] < srcTopk[j]){ + T tmpk = srcTopk[i]; + srcTopk[i] = srcTopk[j]; + srcTopk[j] = tmpk; + + uint64_t indexTmpk = topkInd[i]; + topkInd[i] = topkInd[j]; + topkInd[j] = indexTmpk; + } + } + } + + } + if(step){ + for(int j = 0; j < step; j++){ + srcInd[j] = repeat * taskSize + indStart + j; + } + __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); + if(step >= topk){ + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < step; j++){ + if(src[i] < src[j]){ + T tmp = src[i]; + src[i] = src[j]; + src[j] = tmp; + + uint64_t indexTmp = srcInd[i]; + srcInd[i] = srcInd[j]; + srcInd[j] = indexTmp; + } + } + + } + for(int i = 0; i < topk; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + } + else{ + for(int i = 0; i < step; i++){ + srcTopk[topk + i] = src[i]; + topkInd[topk + i] = srcInd[i]; + } + } + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < 2 * topk; j++){ + if(srcTopk[i] < srcTopk[j]){ + T tmpk = srcTopk[i]; + srcTopk[i] = srcTopk[j]; + srcTopk[j] = tmpk; + 
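The selection in random_sampleX and random_sampleD is a two-level reduction: each task keeps a local top-k of its slice with an in-place selection sort in NRAM, and task 0 then merges the taskDim * topk survivors into the global top-k in GDRAM. A compact host-side sketch of the same pattern, with std::partial_sort standing in for the kernel's selection sort and plain loops standing in for MLU tasks:

```C++
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// Hedged sketch of the two-level reduction above, assuming k >= 1.
// A candidate is a (logit, vocabulary index) pair.
using Cand = std::pair<float, uint64_t>;

std::vector<Cand> globalTopK(const std::vector<float> &logits, int task_dim, std::size_t k) {
    std::vector<Cand> merged;
    const std::size_t chunk = (logits.size() + task_dim - 1) / task_dim;
    for (int t = 0; t < task_dim; ++t) {
        // "per-task" phase: local top-k of this task's slice
        const std::size_t begin = t * chunk;
        const std::size_t end = std::min(logits.size(), begin + chunk);
        std::vector<Cand> local;
        for (std::size_t i = begin; i < end; ++i) {
            local.push_back({logits[i], i});
        }
        const std::size_t kept = std::min(k, local.size());
        std::partial_sort(local.begin(), local.begin() + kept, local.end(), std::greater<Cand>());
        merged.insert(merged.end(), local.begin(), local.begin() + kept);
    }
    // "task 0" phase: merge all local winners into the global top-k
    const std::size_t kept = std::min(k, merged.size());
    std::partial_sort(merged.begin(), merged.begin() + kept, merged.end(), std::greater<Cand>());
    merged.resize(kept);
    return merged;
}
```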
+ uint64_t indexTmpk = topkInd[i]; + topkInd[i] = topkInd[j]; + topkInd[j] = indexTmpk; + } + } + } + } + + __memcpy(globalTopk + taskId * topk, srcTopk, topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram + taskId * topk, topkInd, topk * sizeof(uint64_t), NRAM2GDRAM); + __sync_all(); + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, taskDim * topk * sizeof(T), GDRAM2NRAM); + __memcpy(indGlobal, indGdram, taskDim * topk * sizeof(uint64_t), GDRAM2NRAM); + for(int i = 0; i < topk; i++){ + for(int j = i + 1; j < taskDim * topk; j++){ + if(srcGlobal[i] < srcGlobal[j]){ + T tmpg = srcGlobal[i]; + srcGlobal[i] = srcGlobal[j]; + srcGlobal[j] = tmpg; + + uint64_t indexTmpg = indGlobal[i]; + indGlobal[i] = indGlobal[j]; + indGlobal[j] = indexTmpg; + } + } + } + __memcpy(globalTopk, srcGlobal, taskDim * topk * sizeof(T), NRAM2GDRAM); + __memcpy(indGdram, indGlobal, taskDim * topk * sizeof(uint64_t), NRAM2GDRAM); + } + __sync_all(); + //下面开始做类似于softmax变换 + T globalM = globalTopk[0]; + __bang_write_zero(destSum, maxNum); + __bang_write_zero(destSumFinal, wSize); + for(int r = 0; r < repeat; r++){ + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, maxNum); + __bang_mul_scalar(src, src, temInv, maxNum); + __bang_active_exp_less_0(src, src, maxNum); + __bang_add(destSum, destSum, src, maxNum); + } + if(step){ + __bang_write_zero(src, maxNum); + __memcpy(src, source + repeat * taskSize + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_sub_scalar(src, src, globalM, step); + __bang_mul_scalar(src, src, temInv, step); + __bang_active_exp_less_0(src, src, step); + __bang_add(destSum, destSum, src, maxNum); + } + if(maxNum >= wSize){ + for(int strip = segNum/2; strip > 0; strip = strip / 2){//segNum要求是2的幂次即maxNum必须选取2的幂次 + for(int i = 0; i < strip ; i++){ + __bang_add(destSum + i * wSize, destSum + i * wSize, destSum + (i + strip) * wSize, wSize); + } + } + for(int i = 0; i < wSize; i++){ + + destSumFinal[0] += destSum[i];//__bang_reduce_sum失效,只能手动reduce + } + } + + else{ + for(int i = 0; i < maxNum; i++){ + + destSumFinal[0] += destSum[i]; + } + + } + + globalSum[0] = 0.0; + + __sync_all(); + __bang_atomic_add(destSumFinal, globalSum, destSumFinal, 1);//globalSum[0]必须初始化为0 + + T globalSumInv = 1.0 / globalSum[0];//计算出全局数值和 + + if(taskId == 0){ + __memcpy(srcGlobal, globalTopk, topk * sizeof(T), GDRAM2NRAM);//前topk个元素就是前k个最大值 + + + __bang_sub_scalar(srcGlobal, srcGlobal, globalM, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, temInv, topk); + __bang_active_exp_less_0(srcGlobal, srcGlobal, topk); + __bang_mul_scalar(srcGlobal, srcGlobal, globalSumInv, topk); + + __bang_write_zero(srcTopk, 2 * topk); + srcTopk[0] = srcGlobal[0]; + for(int i = 1; i < topk; i++){ + srcTopk[i] = srcTopk[i - 1] + srcGlobal[i]; + } + + int end = 0; + for(end = 0; end < topk; end++){ + if(srcTopk[end] >= static_cast(topp)){ + break; + } + } + if(end < topk - 1){ + end += 1; + } + else{ + end = topk; + } + + random_val *= srcTopk[end - 1]; + for(int i = 0; i < end; i++){ + if(random_val < srcTopk[i]){ + indices[0] = indGdram[i]; + break; + } + } + __memcpy(globalTopk, srcGlobal, topk * sizeof(T), NRAM2GDRAM); + } +} +template +__mlu_global__ void random_sample(T const *source, uint64_t *indices, uint64_t *indGdram, int voc){ + const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T); + + uint64_t taskSize = taskDim * maxNum; + uint64_t remain = voc % taskSize; + uint64_t repeat = (voc - remain) / taskSize; + + uint64_t remainT = remain % taskDim; + uint64_t 
stepEasy = (remain - remainT) / taskDim; + uint64_t stepHard = stepEasy + 1; + uint64_t step = (taskId < remainT ? stepHard : stepEasy); + uint64_t indStart = repeat * taskSize + (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + T *src = (T *)nram_buffer; + T *srcMax = src + maxNum; + uint64_t index = 0; + + T newMax = -INFINITY; + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(src, source + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); + if(newMax < srcMax[0]){ + newMax = srcMax[0]; + index = r * taskSize + taskId * maxNum + *((int64_t*)&srcMax[1]); + } + + } + if(step){ + __bang_write_value(src, maxNum, -INFINITY); + __memcpy(src, source + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_argmax(srcMax, src, maxNum); + if(newMax < srcMax[0]){ + newMax = srcMax[0]; + index = indStart + *((int64_t*)&srcMax[1]); + } + + } + + indGdram[taskId] = index; + __sync_all(); + if(taskId == 0){ + uint64_t globalInd = indGdram[0]; + T globalM = source[globalInd]; + for(uint64_t id = 0; id < taskDim; id++){ + if(globalM < source[indGdram[id]]){ + globalM = source[indGdram[id]]; + globalInd = indGdram[id]; + } + } + indices[0] = globalInd; + } +} +template +void random_sampleUnion(cnrtQueue_t queue, void *workspace, void const *source, void *indices, float random_val, float topp, int topk, float temperature, int voc) { + auto logits_ = reinterpret_cast(source); + auto index_ = reinterpret_cast(indices); + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + + int taskNum = k_dim.x * k_dim.y * k_dim.z; + if(topp > 0 && topk > 1){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + char *origin = reinterpret_cast(workspace); + char *indTmp = origin + taskNum * topk * sizeof(uint64_t); + uint64_t *indGdram = (uint64_t *)origin; + T *globalTopk = (T *)indTmp; + T *globalSum = globalTopk + taskNum * topk; + + if(voc >= taskNum * maxNum){ + random_sampleD<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); + } + else{ + random_sampleX<<>>(logits_, index_, indGdram, globalTopk, globalSum, random_val, topp, topk, temperature, voc); + } + } + else{ + uint64_t *indGdram = reinterpret_cast(workspace); + random_sample<<>>(logits_, index_, indGdram, voc); + } + cnrtQueueSync(queue); + + +} + +void random_sample_bang_f16(RandomSampleBangDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + auto queue = reinterpret_cast(stream); + int voc = desc->voc; + + random_sampleUnion(queue, workspace, probs, result, random_val, topp, topk, temperature, voc); +} +infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_bang_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cpu/random_sample.cc b/src/ops/random_sample/cpu/random_sample.cc new file mode 100644 index 00000000..28de5b93 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample.cc @@ -0,0 +1,185 @@ 
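Before the CPU implementation that follows, it helps to spell out the sampling rule every backend in this patch applies once the cumulative top-k probabilities are available: keep the shortest prefix of the descending top-k whose cumulative mass reaches topp (or all k tokens if none does), rescale the uniform random value by that prefix's mass, and return the first token whose cumulative probability exceeds the rescaled value. A hedged reference version is below; the final fallback mirrors the Ascend kernel, while the other backends leave that edge case implicit.

```C++
#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged reference: top-p cut-off and weighted draw over the cumulative top-k
// probabilities. `cum` is the inclusive cumulative distribution of the top-k
// tokens in descending probability order (k >= 1), `idx` their vocabulary
// indices, `random_val` a uniform draw in [0, 1).
uint64_t sampleTopP(const std::vector<float> &cum, const std::vector<uint64_t> &idx,
                    float topp, float random_val) {
    std::size_t end = 0;
    while (end < cum.size() && cum[end] < topp) {
        ++end;// first index whose cumulative mass reaches topp
    }
    end = (end < cum.size() - 1) ? end + 1 : cum.size();// include that token, or keep all k
    const float threshold = random_val * cum[end - 1];// rescale the draw to the kept mass
    for (std::size_t i = 0; i < end; ++i) {
        if (threshold < cum[i]) {
            return idx[i];
        }
    }
    return idx[end - 1];// fallback as in the Ascend kernel
}
```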
+#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include "random_sample_cpu.h" +#include + + +infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, + RandomSampleCpuDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + int ndim = probs->ndim; + if (ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(probs->dt, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleCpuDescriptor{ + DevCpu, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRandomSampleDescriptor(RandomSampleCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + + +void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc, + void *workspace, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature) { + int voc = desc->voc; + char *origin = reinterpret_cast(workspace); + //排序得到前k个最大值,按照从大到小顺序存储在logits_前k个位置里面 + char *logitsTmp = origin + voc * sizeof(uint64_t); + uint64_t *indexTmp = (uint64_t *) origin; + uint16_t *logits_ = (uint16_t *) logitsTmp; + + + auto source = reinterpret_cast(probs); + + std::copy(source, source + voc, logits_); + auto index_ = reinterpret_cast(result); + + // 如果k大于voc,调整k为voc + if (topk > voc) { + topk = voc; + } + + for (int i = 0; i < voc; i++) { + indexTmp[i] = i; + } + for (int i = 0; i < topk; i++) { + for (int j = i + 1; j < voc; j++) { + if (f16_to_f32(logits_[i]) < f16_to_f32(logits_[j])) { + float M = f16_to_f32(logits_[i]); + logits_[i] = logits_[j]; + logits_[j] = f32_to_f16(M); + + + int index = indexTmp[i]; + indexTmp[i] = indexTmp[j]; + indexTmp[j] = index; + } + } + } + + //做类似于softmax的temperature变换 + float reduceM = f16_to_f32(logits_[0]); + float reduceS = 0.0f; + for (int i = 0; i < voc; i++) { + reduceS += std::exp((f16_to_f32(logits_[i]) - reduceM) / temperature); + } + for (int i = 0; i < voc; i++) { + logits_[i] = f32_to_f16(std::exp((f16_to_f32(logits_[i]) - reduceM) / temperature) / reduceS); + } + //在前k个元素里面利用topp选取不超过topp的元素作为数据集 + float tmp = 0.0f; + int end = 0; + for (end = 0; end < topk; end++) { + tmp += f16_to_f32(logits_[end]); + if (tmp >= topp) { + break; + } + } + //printf("%d\n", end); + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + //利用随机数随机输出满足同时满足topk,topp的某个元素在原始向量的index + + float sum_s = 0.0f; + for (int i = 0; i < end; i++) { + sum_s += f16_to_f32(logits_[i]); + } + random_val *= sum_s; + + sum_s = 0.0f; + for (int i = 0; i < end; i++) { + sum_s += f16_to_f32(logits_[i]); + if (random_val < sum_s) { + index_[0] = indexTmp[i]; + break; + } + } +} +void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc, + void *workspace, + void *result, + void const *probs) { + int voc = desc->voc; + auto index_ = reinterpret_cast(result); + auto source = reinterpret_cast(probs); + + char *origin = reinterpret_cast(workspace); + uint16_t *logits_ = (uint16_t *) origin; + + std::copy(source, source + voc, logits_); + + float M = f16_to_f32(logits_[0]); + int index = 0; + for (int j = 1; j < voc; j++) { + if (M < 
f16_to_f32(logits_[j])) { + M = f16_to_f32(logits_[j]); + index = j; + } + } + + index_[0] = index; +} + +infiniopStatus_t cpuRandomSample(RandomSampleCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + if (topp > 0 && topk > 1) { + random_sample_cpu_f16(desc, + workspace, + result, + probs, + random_val, + topp, + topk, + temperature); + } else { + random_sample_cpu_f16(desc, + workspace, + result, + probs); + } + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cpu/random_sample_cpu.h b/src/ops/random_sample/cpu/random_sample_cpu.h new file mode 100644 index 00000000..b4b501be --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.h @@ -0,0 +1,34 @@ +#ifndef __CPU_RANDOM_SAMPLE_H__ +#define __CPU_RANDOM_SAMPLE_H__ + +#include "operators.h" +struct RandomSampleCpuDescriptor { + Device device; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleCpuDescriptor *RandomSampleCpuDescriptor_t; + +infiniopStatus_t cpuCreateRandomSampleDescriptor(infiniopHandle_t, + RandomSampleCpuDescriptor_t *, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t cpuGetRandomSampleWorkspaceSize(RandomSampleCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRandomSample(RandomSampleCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t cpuDestroyRandomSampleDescriptor(RandomSampleCpuDescriptor_t desc); + +#endif diff --git a/src/ops/random_sample/cuda/random_sample.cu b/src/ops/random_sample/cuda/random_sample.cu new file mode 100644 index 00000000..12bc03b2 --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample.cu @@ -0,0 +1,180 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "random_sample.cuh" +#include +#include + +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= 
static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, cudaStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + cudaStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, + stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, cudaStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + int block_dim = MAX_THREADS_PER_BLOCK; + int num_blocks = ROUND_UP_DIV(voc, block_dim); + index<<>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (cudaStream_t) stream); + void *workspace_extra; + cudaMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (cudaStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result, + key_out); + } + cudaFree(workspace_extra); +} + +infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/cuda/random_sample.cuh b/src/ops/random_sample/cuda/random_sample.cuh new file mode 100644 index 00000000..d3fff76d --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample.cuh @@ -0,0 +1,38 @@ +#ifndef __CUDA_RANDOM_SAMPLE_H__ +#define __CUDA_RANDOM_SAMPLE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" + +struct RandomSampleCudaDescriptor { + Device device; + int device_id; + DT dtype; 
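A note on the CUB usage above: cub::DeviceRadixSort::SortPairsDescending and cub::DeviceScan::InclusiveSum follow a two-phase convention, where the first call passes a null temporary buffer only to obtain size_radix_sort and size_scan, and the second call does the work with real storage. A self-contained sketch of that convention for the descending key-value sort (illustrative names, error checks omitted):

```C++
#include <cstdint>
#include <cub/cub.cuh>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Hedged sketch of the two-phase CUB convention used above: phase 1 queries the
// temporary-storage size with a null buffer, phase 2 performs the descending
// key-value sort (probabilities are the keys, vocabulary indices the values).
void sortProbsDescending(const half *probs, half *sorted_probs,
                         const uint64_t *idx_in, uint64_t *idx_out,
                         int voc, cudaStream_t stream) {
    size_t temp_bytes = 0;
    cub::DeviceRadixSort::SortPairsDescending(nullptr, temp_bytes,
                                              probs, sorted_probs, idx_in, idx_out,
                                              voc, 0, sizeof(half) * 8, stream);
    void *temp = nullptr;
    cudaMalloc(&temp, temp_bytes);
    cub::DeviceRadixSort::SortPairsDescending(temp, temp_bytes,
                                              probs, sorted_probs, idx_in, idx_out,
                                              voc, 0, sizeof(half) * 8, stream);
    cudaFree(temp);
}
```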
+ int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t; + +infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, + RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t cudaDestroyRandomSampleDescriptor(RandomSampleCudaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cc b/src/ops/random_sample/cuda/random_sample_cuda.cc new file mode 100644 index 00000000..022a113b --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cc @@ -0,0 +1,37 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "random_sample.cuh" + +infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle, + RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleCudaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyRandomSampleDescriptor(RandomSampleCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/maca/random_sample_maca.cc b/src/ops/random_sample/maca/random_sample_maca.cc new file mode 100644 index 00000000..1cb0fe74 --- /dev/null +++ b/src/ops/random_sample/maca/random_sample_maca.cc @@ -0,0 +1,37 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "random_sample_maca.h" + +infiniopStatus_t macaCreateRandomSampleDescriptor(MacaHandle_t handle, + RandomSampleMacaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleMacaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRandomSampleWorkspaceSize(RandomSampleMacaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyRandomSampleDescriptor(RandomSampleMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/maca/random_sample_maca.h b/src/ops/random_sample/maca/random_sample_maca.h new file mode 100644 index 00000000..3cf1ab59 --- /dev/null +++ 
b/src/ops/random_sample/maca/random_sample_maca.h @@ -0,0 +1,38 @@ +#ifndef __MACA_RANDOM_SAMPLE_H__ +#define __MACA_RANDOM_SAMPLE_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RandomSampleMacaDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleMacaDescriptor *RandomSampleMacaDescriptor_t; + +infiniopStatus_t macaCreateRandomSampleDescriptor(MacaHandle_t handle, + RandomSampleMacaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t macaGetRandomSampleWorkspaceSize(RandomSampleMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRandomSample(RandomSampleMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t macaDestroyRandomSampleDescriptor(RandomSampleMacaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/maca/random_sample_maca.maca b/src/ops/random_sample/maca/random_sample_maca.maca new file mode 100644 index 00000000..310343fb --- /dev/null +++ b/src/ops/random_sample/maca/random_sample_maca.maca @@ -0,0 +1,180 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "random_sample_maca.h" +#include +#include + +template +__global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, hcStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + hcStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, + stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, hcStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + 
nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleMacaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + index<<<(voc + 1023) / 1024, 1024, 0, (hcStream_t) stream>>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (hcStream_t) stream); + void *workspace_extra; + hcMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (hcStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + int BLOCK_DIM = 1024; + int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (hcStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (hcStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (hcStream_t) stream>>>((uint64_t *) result, + key_out); + } + hcFree(workspace_extra); +} + +infiniopStatus_t macaRandomSample(RandomSampleMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/musa/random_sample_musa.cc b/src/ops/random_sample/musa/random_sample_musa.cc new file mode 100644 index 00000000..70ff941c --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.cc @@ -0,0 +1,37 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "random_sample_musa.h" + +infiniopStatus_t musaCreateRandomSampleDescriptor(MusaHandle_t handle, + RandomSampleMusaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs) { + if (probs->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!dtype_eq(result->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int voc = probs->shape[0]; + int rLength = result->shape[0]; + if (result->ndim != 1 && rLength != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + *desc_ptr = new RandomSampleMusaDescriptor{ + handle->device, + handle->device_id, + probs->dt, + voc, + result->dt, + rLength}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRandomSampleWorkspaceSize(RandomSampleMusaDescriptor_t desc, uint64_t *size) { + *size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype)); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyRandomSampleDescriptor(RandomSampleMusaDescriptor_t desc) { + delete desc; + return 
STATUS_SUCCESS; +} diff --git a/src/ops/random_sample/musa/random_sample_musa.h b/src/ops/random_sample/musa/random_sample_musa.h new file mode 100644 index 00000000..d8839ff1 --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.h @@ -0,0 +1,38 @@ +#ifndef __MUSA_RANDOM_SAMPLE_H__ +#define __MUSA_RANDOM_SAMPLE_H__ + +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" + +struct RandomSampleMusaDescriptor { + Device device; + int device_id; + DT dtype; + int voc; + DT rDtype; + int rLength; +}; + +typedef struct RandomSampleMusaDescriptor *RandomSampleMusaDescriptor_t; + +infiniopStatus_t musaCreateRandomSampleDescriptor(MusaHandle_t handle, + RandomSampleMusaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, + infiniopTensorDescriptor_t probs); + +infiniopStatus_t musaGetRandomSampleWorkspaceSize(RandomSampleMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRandomSample(RandomSampleMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream); + +infiniopStatus_t musaDestroyRandomSampleDescriptor(RandomSampleMusaDescriptor_t desc); + + +#endif diff --git a/src/ops/random_sample/musa/random_sample_musa.mu b/src/ops/random_sample/musa/random_sample_musa.mu new file mode 100644 index 00000000..55dbdd0a --- /dev/null +++ b/src/ops/random_sample/musa/random_sample_musa.mu @@ -0,0 +1,184 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "random_sample_musa.h" +#include +#include + +template +__global__ void softmax( + T *val_out, + int topk, + float temperature, int voc) { + float sum_s = 0.0f; + for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) { + sum_s += __expf(static_cast(val_out[i] - val_out[0]) / temperature); + } + __shared__ float sum_inverse_total; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + float block_sum = BlockReduce(temp_storage).Reduce(sum_s, cub::Sum()); + if (threadIdx.x == 0) { + sum_inverse_total = __fdividef(1.0F, block_sum);//高精度除法 + } + + __syncthreads(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < topk) { + val_out[tid] = static_cast(__expf(static_cast(val_out[tid] - val_out[0]) / temperature) * sum_inverse_total); + } +} + +__global__ void index(uint64_t *key_in, int voc) { + int ind = threadIdx.x + blockIdx.x * blockDim.x; + if (ind < voc) { + key_in[ind] = static_cast(ind); + } +} +template +__global__ void random_sample_kernel(uint64_t *result, + T *val_out, + float random_val, + float topp, + int topk, + uint64_t *key_out) { + int end = 0; + for (end = 0; end < topk; end++) { + if (val_out[end] >= static_cast(topp)) { + break; + } + } + if (end < topk - 1) { + end += 1; + } else { + end = topk; + } + + random_val *= static_cast(val_out[end - 1]); + for (int i = 0; i < end; i++) { + if (random_val < static_cast(val_out[i])) { + result[0] = key_out[i]; + break; + } + } +} +template +void sort_pairs_descending( + void *workspace, size_t &size_radix_sort, + T const *val_in, T *val_out, + I *key_in, I *key_out, + int voc, musaStream_t stream) { + cub::DeviceRadixSort::SortPairsDescending( + workspace, size_radix_sort, + val_in, val_out, + key_in, key_out, + voc, 0, sizeof(T) * 8, stream); +} +template +void inclusive_sum( + void *workspace, size_t &size_scan, + T *data, int voc, + musaStream_t stream) { + cub::DeviceScan::InclusiveSum( + workspace, size_scan, + data, data, voc, 
+ stream); +} +template +void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan, + int voc, musaStream_t stream) { + + + sort_pairs_descending(nullptr, size_radix_sort, + nullptr, nullptr, + nullptr, nullptr, + voc, stream); + + inclusive_sum( + nullptr, size_scan, + nullptr, voc, + stream); +} +__global__ void random_sample_kernel(uint64_t *result, + uint64_t *key_out) { + result[0] = key_out[0]; +} +void random_sample_nv_gpu_f16(RandomSampleMusaDescriptor_t desc, void *workspace, void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int voc = desc->voc; + //下面这段代码在排序 + char *origin = reinterpret_cast(workspace); + char *keyTmp = origin + voc * sizeof(half); + half *val_out = (half *) origin; + + uint64_t *key_in = (uint64_t *) keyTmp; + uint64_t *key_out = key_in + voc; + + index<<<(voc + 1023) / 1024, 1024, 0, (musaStream_t) stream>>>(key_in, voc); + //下面开始计算workspace空间 + size_t size_radix_sort; + size_t size_scan; + random_sample_workspace(size_radix_sort, size_scan, + voc, (musaStream_t) stream); + void *workspace_extra; + musaMalloc(&workspace_extra, size_radix_sort + size_scan); + sort_pairs_descending( + workspace_extra, size_radix_sort, + (half *) probs, val_out, + key_in, key_out, + voc, (musaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上 + //排序结束,然后开始做softmax变换 + if (topp > 0 && topk > 1) { + int BLOCK_DIM = 1024; + int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM; + softmax<<>>(val_out, topk, + temperature, voc); + + + inclusive_sum( + workspace_extra, size_scan, + val_out, voc, + (musaStream_t) stream);//该函数会实现scan功能不断累加结果 + random_sample_kernel<<<1, 1, 0, (musaStream_t) stream>>>((uint64_t *) result, + val_out, + random_val, + topp, + topk, + key_out); + + } else { + random_sample_kernel<<<1, 1, 0, (musaStream_t) stream>>>((uint64_t *) result, + key_out); + } + musaFree(workspace_extra); +} + +infiniopStatus_t musaRandomSample(RandomSampleMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/random_sample/operator.cc b/src/ops/random_sample/operator.cc new file mode 100644 index 00000000..40a8ec03 --- /dev/null +++ b/src/ops/random_sample/operator.cc @@ -0,0 +1,172 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/random_sample/random_sample.h" + +#ifdef ENABLE_CPU +#include "cpu/random_sample_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "cuda/random_sample.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "bang/random_sample_bang.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/random_sample.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/random_sample_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/random_sample_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) { + switch (handle->device) { 
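+        // Dispatch on the device stored in the handle; each backend below is compiled in
+        // only when its ENABLE_* option is enabled in xmake.lua.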
+#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRandomSampleDescriptor((BangHandle_t) handle, + (RandomSampleBangDescriptor_t *) desc_ptr, result, + probs); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle, + (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRandomSampleDescriptor((MacaHandle_t) handle, + (RandomSampleMacaDescriptor_t *) desc_ptr, result, + probs); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs); +#endif + } + return STATUS_BAD_DEVICE; +}; + +__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size); + // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *result, + void const *probs, + float random_val, + float topp, + int topk, + float temperature, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); + } 
+#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc); +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc); +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.cc b/src/ops/rearrange/ascend/rearrange_aclnn.cc new file mode 100644 index 00000000..f1db82cd --- /dev/null +++ b/src/ops/rearrange/ascend/rearrange_aclnn.cc @@ -0,0 +1,113 @@ +#include "rearrange_aclnn.h" +#include "../../utils.h" + +RearrangeAclnnDescriptor::RearrangeAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + dstDesc = new aclnnTensorDescriptor(); + srcDesc = new aclnnTensorDescriptor(); + workspaceSize = 0; + workspaceAddr = nullptr; +} + +infiniopStatus_t aclnnCreateRearrangeDescriptor(AscendHandle_t handle, + RearrangeAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + *desc_ptr = new RearrangeAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + + auto &dstDesc = (*desc_ptr)->dstDesc; + auto &srcDesc = (*desc_ptr)->srcDesc; + + CHECK_STATUS(dstDesc->fromInfiniOpTensorDescriptor(dst), STATUS_SUCCESS); + CHECK_STATUS(srcDesc->fromInfiniOpTensorDescriptor(src), STATUS_SUCCESS); + + // CHECK_STATUS(dstDesc->createTensor(), STATUS_SUCCESS); + // CHECK_STATUS(srcDesc->createTensor(), STATUS_SUCCESS); + + // aclTensor *td = dstDesc->t; + // aclTensor *ts = srcDesc->t; + + // auto &workspaceSize = (*desc_ptr)->workspaceSize; + // auto &executor = (*desc_ptr)->executor; + + // auto ret = aclnnInplaceCopyGetWorkspaceSize(td, + // ts, + // &workspaceSize, + // &executor); + // aclSetAclOpExecutorRepeatable(executor); + // CHECK_RET(ret == ACL_SUCCESS, + // LOG_PRINT("aclnnInplaceCopyGetWorkspaceSize failed. 
ERROR: %d\n", ret); + // return STATUS_EXECUTION_FAILED); + + // (*desc_ptr)->workspaceAddr = mallocWorkspace(workspaceSize); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + // Set runing on handle device + aclrtSetDevice(desc->device_id); + + /// TODO: something is wrong with aclSetTensorAddr, do all the preparation here for now + desc->dstDesc->t = aclCreateTensor(desc->dstDesc->shape.data(), + desc->dstDesc->ndim, + desc->dstDesc->dataType, + desc->dstDesc->strides.data(), + desc->dstDesc->offset, + desc->dstDesc->format, + desc->dstDesc->storageShape.data(), + desc->dstDesc->storageNdim, + dst); + desc->srcDesc->t = aclCreateTensor(desc->srcDesc->shape.data(), + desc->srcDesc->ndim, + desc->srcDesc->dataType, + desc->srcDesc->strides.data(), + desc->srcDesc->offset, + desc->srcDesc->format, + desc->srcDesc->storageShape.data(), + desc->srcDesc->storageNdim, + (void *) src); + + aclTensor *td = desc->dstDesc->t; + aclTensor *ts = desc->srcDesc->t; + aclOpExecutor *executor; + uint64_t workspaceSize; + aclnnInplaceCopyGetWorkspaceSize(td, + ts, + &workspaceSize, + &executor); + CHECK_STATUS(mallocWorkspace(&(desc->workspaceAddr), workspaceSize), STATUS_SUCCESS); + + + // AclSetTensorAddr(executor, 0, td, dst); + // AclSetTensorAddr(executor, 1, ts, (void *) src); + auto ret = aclnnInplaceCopy(desc->workspaceAddr, + desc->workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnInplaceCopy failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + desc->dstDesc->destroyTensor(); + desc->srcDesc->destroyTensor(); + CHECK_STATUS(freeWorkspace(desc->workspaceAddr), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyRearrangeDescriptor(RearrangeAclnnDescriptor_t desc) { + delete desc->srcDesc; + delete desc->dstDesc; + /// TODO: this aclDestroyAclOpExecutor seems to trigger a double free error + // aclDestroyAclOpExecutor(desc->executor); + // freeWorkspace(desc->workspaceAddr); + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/ascend/rearrange_aclnn.h b/src/ops/rearrange/ascend/rearrange_aclnn.h new file mode 100644 index 00000000..4b60e4e7 --- /dev/null +++ b/src/ops/rearrange/ascend/rearrange_aclnn.h @@ -0,0 +1,36 @@ +#ifndef __ACLNN_REARRANGE_H__ +#define __ACLNN_REARRANGE_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "operators.h" +#include +#include +#include + +struct RearrangeAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclnnTensorDescriptor_t dstDesc, srcDesc; + uint64_t workspaceSize; + void *workspaceAddr; + + RearrangeAclnnDescriptor(Device device); +}; + +typedef struct RearrangeAclnnDescriptor *RearrangeAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateRearrangeDescriptor(AscendHandle_t handle, + RearrangeAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t aclnnRearrange(RearrangeAclnnDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t aclnnDestroyRearrangeDescriptor(RearrangeAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/rearrange/bang/rearrange_bang.cc b/src/ops/rearrange/bang/rearrange_bang.cc new file mode 100644 index 00000000..e846f2d1 --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.cc @@ -0,0 +1,89 @@ +#include "rearrange_bang.h" +#include 
"../../../devices/bang/common_bang.h" +#include "../../utils.h" +#include + +infiniopStatus_t bangCreateRearrangeDescriptor(BangHandle_t handle, + RearrangeBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (decltype(ndim) i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + unsigned int r = 0; + std::vector shape_; + std::vector dst_strides, src_strides; + switch (ndim) { + case 1: + shape_.push_back(dst->shape[0]); + dst_strides.push_back(0); + src_strides.push_back(0); + r = 1; + break; + case 2: + r = dst->shape[0]; + break; + case 3: + r = dst->shape[0] * dst->shape[1]; + break; + default: { + for (size_t i = ndim - 3; i >= 1; --i) { + if (static_cast(dst->shape[i]) * static_cast(dst->strides[i]) != static_cast(dst->strides[i - 1]) || + static_cast(src->shape[i]) * static_cast(src->strides[i]) != static_cast(src->strides[i - 1])) { + return STATUS_BAD_TENSOR_STRIDES; + } + } + r = std::accumulate(dst->shape, dst->shape + ndim - 1, 1, std::multiplies()); + break; + } + } + + for (decltype(ndim) i = 0; i < ndim; ++i) { + shape_.push_back(dst->shape[i]); + dst_strides.push_back(dst->strides[i]); + src_strides.push_back(src->strides[i]); + } + + char *tmpDevice; + CNRT_CHECK(cnrtMalloc((void **) &tmpDevice, ndim * sizeof(uint64_t) + 2 * ndim * sizeof(int64_t))); + char *mlu_stride = tmpDevice + ndim * sizeof(uint64_t); + uint64_t *mlu_shape = (uint64_t *) tmpDevice; + + int64_t *mlu_strides_dst = (int64_t *) mlu_stride; + int64_t *mlu_strides_src = mlu_strides_dst + ndim; + + CNRT_CHECK(cnrtMemcpy(mlu_shape, shape_.data(), ndim * sizeof(uint64_t), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_strides_dst, dst_strides.data(), ndim * sizeof(int64_t), cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpy(mlu_strides_src, src_strides.data(), ndim * sizeof(int64_t), cnrtMemcpyHostToDev)); + *desc_ptr = new RearrangeBangDescriptor{ + handle->device, + handle->device_id, + dst->dt, + r, + ndim, + mlu_shape, + mlu_strides_dst, + mlu_strides_src}; + return STATUS_SUCCESS; +} +infiniopStatus_t bangDestroyRearrangeDescriptor(RearrangeBangDescriptor_t desc) { + cnrtFree(desc->mlu_shape); + + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/bang/rearrange_bang.h b/src/ops/rearrange/bang/rearrange_bang.h new file mode 100644 index 00000000..dc64f76a --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.h @@ -0,0 +1,34 @@ +#ifndef __BANG_REARRANGE_H__ +#define __BANG_REARRANGE_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "operators.h" + +struct RearrangeBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t r; + uint64_t ndim; + uint64_t *mlu_shape; + int64_t + *mlu_strides_dst, + *mlu_strides_src; +}; + +typedef struct RearrangeBangDescriptor *RearrangeBangDescriptor_t; + +infiniopStatus_t bangCreateRearrangeDescriptor(BangHandle_t handle, + RearrangeBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t bangRearrange(RearrangeBangDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t bangDestroyRearrangeDescriptor(RearrangeBangDescriptor_t 
desc); + + +#endif// __BANG_REARRANGE_H__ diff --git a/src/ops/rearrange/bang/rearrange_bang.mlu b/src/ops/rearrange/bang/rearrange_bang.mlu new file mode 100644 index 00000000..5c14a516 --- /dev/null +++ b/src/ops/rearrange/bang/rearrange_bang.mlu @@ -0,0 +1,104 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "rearrange_bang.h" +#include "../../../devices/bang/common_bang.h" +#include + +const int SRC_MAX_SIZE = 1024 * 1024 * 128; + +__mlu_global__ void rearrange( + char *dst, + char const *src, + uint64_t *mlu_shape, + int64_t *mlu_strides_dst, + int64_t *mlu_strides_src, + int r, + int ndim, int byteSize){ + const int maxNum = SRC_MAX_SIZE/byteSize; + + int remainT = r % taskDim; + int stepEasy = (r - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + + int dimsize = mlu_shape[ndim - 1]; + if(dimsize < maxNum){ + for(int i = indStart; i < indStart + step; i++){ + int tidS = 0; + int tidD = 0; + int indi = i; + for(int j = ndim - 2; j >= 0; --j){ + tidS += (indi % mlu_shape[j]) * mlu_strides_src[j]; + tidD += (indi % mlu_shape[j]) * mlu_strides_dst[j]; + indi /= mlu_shape[j]; + } + __memcpy(dst + tidD * byteSize, src + tidS * byteSize, dimsize * byteSize, GDRAM2GDRAM); + } + + } + else{ + int remain = dimsize % maxNum; + int repeat = (dimsize - remain) / maxNum; + for(int i = indStart; i < indStart + step; i++){ + int tidS = 0; + int tidD = 0; + int indi = i; + for(int j = ndim - 2; j >= 0; --j){ + tidS += (indi % mlu_shape[j]) * mlu_strides_src[j]; + tidD += (indi % mlu_shape[j]) * mlu_strides_dst[j]; + indi /= mlu_shape[j]; + } + for(int index = 0; index < repeat; index++){ + __memcpy(dst + (tidD + index * maxNum) * byteSize, src + (tidS + index * maxNum) * byteSize, maxNum * byteSize, GDRAM2GDRAM); + } + if(remain){ + __memcpy(dst + (tidD + repeat * maxNum) * byteSize, src + (tidS + repeat * maxNum) * byteSize, remain * byteSize, GDRAM2GDRAM); + } + } + + } +} + +void rearrangeUnion(cnrtQueue_t queue, void *destination, void const *source, + uint64_t *mlu_shape, + int64_t *mlu_strides_dst, + int64_t *mlu_strides_src, + int r, + int ndim, int byteSize) { + auto dst = reinterpret_cast< char *>(destination); + auto src = reinterpret_cast(source); + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + + rearrange<<>>(dst, src, mlu_shape, mlu_strides_dst, mlu_strides_src, r, ndim, byteSize); + + cnrtQueueSync(queue); +} + +void rearrange_bang(RearrangeBangDescriptor_t desc, void *dst, + void const *src, + void *stream) { + auto queue = reinterpret_cast(stream); + int r = desc->r; + int ndim = desc->ndim; + int byteSize = desc->dtype.size; + rearrangeUnion(queue, dst, src, desc->mlu_shape, desc->mlu_strides_dst, desc->mlu_strides_src, r, ndim, byteSize); +} +infiniopStatus_t bangRearrange(RearrangeBangDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_bang(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cc b/src/ops/rearrange/cpu/rearrange_cpu.cc new file mode 100644 index 00000000..a5540727 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cc @@ -0,0 +1,100 @@ +#include "rearrange_cpu.h" +#include "../../utils.h" +#include +#include 
+#include + +infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t, + RearrangeCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + if (!dtype_eq(dst->dt, src->dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + std::vector + shape(dst->shape, dst->shape + ndim); + std::vector + strides_dst(dst->strides, dst->strides + ndim), + strides_src(src->strides, src->strides + ndim); + + unsigned int r = 0; + switch (ndim) { + case 1: + ndim = 2; + strides_dst.insert(strides_dst.begin(), shape[0]); + strides_src.insert(strides_src.begin(), shape[0]); + shape.insert(shape.begin(), 1); + case 2: + r = shape[0]; + break; + case 3: + r = shape[0] * shape[1]; + break; + default: + for (int i = ndim - 3; i >= 1; --i) { + if (shape[i] * strides_dst[i] != strides_dst[i - 1] || shape[i] * strides_src[i] != strides_src[i - 1]) { + return STATUS_BAD_TENSOR_STRIDES; + } + } + r = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies{}); + break; + } + *desc_ptr = new RearrangeCpuDescriptor{ + DevCpu, + dst->dt, + r, + shape, + strides_dst, + strides_src, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRearrangeDescriptor(RearrangeCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +inline int indices(uint64_t i, uint64_t ndim, std::vector strides, std::vector shape) { + uint64_t ans = 0; + for (int j = ndim - 2; j >= 0; --j) { + ans += (i % shape[j]) * strides[j]; + i /= shape[j]; + } + return ans; +} + +void reform_cpu(RearrangeCpuDescriptor_t desc, void *dst, void const *src) { + auto dst_ptr = reinterpret_cast(dst); + auto src_ptr = reinterpret_cast(src); + auto ndim = desc->shape.size(); + int bytes_size = desc->shape[ndim - 1] * desc->dt.size; +#pragma omp parallel for + for (uint64_t i = 0; i < desc->r; ++i) { + auto dst_offset = indices(i, ndim, desc->strides_dst, desc->shape); + auto src_offset = indices(i, ndim, desc->strides_src, desc->shape); + std::memcpy(dst_ptr + dst_offset * desc->dt.size, src_ptr + src_offset * desc->dt.size, bytes_size); + } +} + +infiniopStatus_t cpuRearrange(RearrangeCpuDescriptor_t desc, + void *dst, + void const *src, + void *stream) { + reform_cpu(desc, dst, src); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cpu/rearrange_cpu.h b/src/ops/rearrange/cpu/rearrange_cpu.h new file mode 100644 index 00000000..99cc62e6 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_REARRANGE_H__ +#define __CPU_REARRANGE_H__ + +#include "operators.h" +#include +struct RearrangeCpuDescriptor { + Device device; + DataLayout dt; + uint64_t r; + std::vector shape; + std::vector strides_dst; + std::vector strides_src; +}; + +typedef struct RearrangeCpuDescriptor *RearrangeCpuDescriptor_t; + +infiniopStatus_t cpuCreateRearrangeDescriptor(infiniopHandle_t handle, + RearrangeCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t cpuRearrange(RearrangeCpuDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cpuDestroyRearrangeDescriptor(RearrangeCpuDescriptor_t desc); + +void reform_cpu(RearrangeCpuDescriptor_t desc, void *y, void const 
*x); + +#endif diff --git a/src/ops/rearrange/cuda/rearrange.cc b/src/ops/rearrange/cuda/rearrange.cc new file mode 100644 index 00000000..da23489b --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cc @@ -0,0 +1,70 @@ +#include "rearrange.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include + +infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, + RearrangeCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeCudaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cuda/rearrange.cu b/src/ops/rearrange/cuda/rearrange.cu new file mode 100644 index 00000000..8f90924c --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cu @@ -0,0 +1,77 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "rearrange.cuh" +#include "../../utils.h" + +template +static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x, + warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} + +void rearrange_nv_gpu(RearrangeCudaDescriptor_t desc, void *y, void const *x, void *stream) { + auto cuda_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + cudaMemcpyAsync(y, x, unit, cudaMemcpyDeviceToDevice, cuda_stream); + return; + } + + auto warps = MAX_THREADS_PER_BLOCK / WARP_SIZE; + auto grid = dim3(ROUND_UP_DIV(c, warps), r); + auto block = dim3(WARP_SIZE, ROUND_UP_DIV(c, grid.x)); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, 
src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t cudaRearrange(RearrangeCudaDescriptor_t desc, + void *dst, void const *src, void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_nv_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/cuda/rearrange.cuh b/src/ops/rearrange/cuda/rearrange.cuh new file mode 100644 index 00000000..f31f74b3 --- /dev/null +++ b/src/ops/rearrange/cuda/rearrange.cuh @@ -0,0 +1,29 @@ +#ifndef __CUDA_REARRANGE_H__ +#define __CUDA_REARRANGE_H__ + +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" + +struct RearrangeCudaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeCudaDescriptor *RearrangeCudaDescriptor_t; + +infiniopStatus_t cudaCreateRearrangeDescriptor(CudaHandle_t handle, + RearrangeCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t cudaRearrange(RearrangeCudaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cudaDestroyRearrangeDescriptor(RearrangeCudaDescriptor_t desc); + +void rearrange_nv_gpu(RearrangeCudaDescriptor_t, void *y, void const *x, void *stream); +#endif// __CUDA_REARRANGE_H__ diff --git a/src/ops/rearrange/maca/rearrange_maca.cc b/src/ops/rearrange/maca/rearrange_maca.cc new file mode 100644 index 00000000..ac33fe06 --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.cc @@ -0,0 +1,70 @@ +#include "rearrange_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include + +infiniopStatus_t macaCreateRearrangeDescriptor(MacaHandle_t handle, + RearrangeMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeMacaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t macaDestroyRearrangeDescriptor(RearrangeMacaDescriptor_t desc) { + delete desc; + return 
STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/maca/rearrange_maca.h b/src/ops/rearrange/maca/rearrange_maca.h new file mode 100644 index 00000000..701f55bb --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.h @@ -0,0 +1,29 @@ +#ifndef __MACA_REARRANGE_H__ +#define __MACA_REARRANGE_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RearrangeMacaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeMacaDescriptor *RearrangeMacaDescriptor_t; + +infiniopStatus_t macaCreateRearrangeDescriptor(MacaHandle_t handle, + RearrangeMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t macaRearrange(RearrangeMacaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t macaDestroyRearrangeDescriptor(RearrangeMacaDescriptor_t desc); + +void rearrange_mc_gpu(RearrangeMacaDescriptor_t, void *y, void const *x, void *stream); +#endif// __MACA_REARRANGE_H__ diff --git a/src/ops/rearrange/maca/rearrange_maca.maca b/src/ops/rearrange/maca/rearrange_maca.maca new file mode 100644 index 00000000..b5152c15 --- /dev/null +++ b/src/ops/rearrange/maca/rearrange_maca.maca @@ -0,0 +1,76 @@ +#include "../../../devices/maca/common_maca.h" +#include "rearrange_maca.h" + +template +static __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x; + auto warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} + +void rearrange_mc_gpu(RearrangeMacaDescriptor_t desc, void *y, void const *x, void *stream) { + auto maca_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + hcMemcpyAsync(y, x, unit, hcMemcpyDeviceToDevice, maca_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t macaRearrange(RearrangeMacaDescriptor_t desc, + void *dst, void const *src, void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_mc_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/musa/rearrange_musa.cc b/src/ops/rearrange/musa/rearrange_musa.cc new file mode 100644 index 00000000..5fa2e768 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.cc @@ -0,0 +1,70 @@ 
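+// MUSA (Moore Threads GPU) backend: this file only creates and destroys the rearrange
+// descriptor; the copy kernel itself lives in rearrange_musa.mu.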
+#include "rearrange_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include + +infiniopStatus_t musaCreateRearrangeDescriptor(MusaHandle_t handle, + RearrangeMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + auto dt = dst->dt; + if (!dtype_eq(src->dt, dt)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + auto ndim = dst->ndim; + if (src->ndim != ndim || ndim == 0) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < ndim; ++i) { + if (dst->shape[i] != src->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (dst->strides[ndim - 1] != 1 || src->strides[ndim - 1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + switch (ndim) { + case 1: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[0], + 1, 1, + 0, 0, + 0, 0}; + break; + case 2: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[1], + 1, dst->shape[0], + 0, dst->strides[0], + 0, src->strides[0]}; + break; + case 3: + *desc_ptr = new RearrangeMusaDescriptor{ + handle->device, + handle->device_id, + dt.size * dst->shape[2], + dst->shape[0], dst->shape[1], + dst->strides[0], dst->strides[1], + src->strides[0], src->strides[1]}; + break; + default: + return STATUS_BAD_TENSOR_SHAPE; + } + + (*desc_ptr)->dst_rs *= dt.size; + (*desc_ptr)->dst_cs *= dt.size; + (*desc_ptr)->src_rs *= dt.size; + (*desc_ptr)->src_cs *= dt.size; + + return STATUS_SUCCESS; +} +infiniopStatus_t musaDestroyRearrangeDescriptor(RearrangeMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/musa/rearrange_musa.h b/src/ops/rearrange/musa/rearrange_musa.h new file mode 100644 index 00000000..df6ade12 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.h @@ -0,0 +1,30 @@ +#ifndef __MUSA_REARRANGE_H__ +#define __MUSA_REARRANGE_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct RearrangeMusaDescriptor { + Device device; + int device_id; + uint64_t unit, r, c; + int64_t dst_rs, dst_cs, src_rs, src_cs; +}; + +typedef struct RearrangeMusaDescriptor *RearrangeMusaDescriptor_t; + +infiniopStatus_t musaCreateRearrangeDescriptor(MusaHandle_t handle, + RearrangeMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src); + +infiniopStatus_t musaRearrange(RearrangeMusaDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t musaDestroyRearrangeDescriptor(RearrangeMusaDescriptor_t desc); + +void rearrange_mt_gpu(RearrangeMusaDescriptor *, void *y, void const *x, void *stream); +#endif // __MUSA_REARRANGE_H__ + diff --git a/src/ops/rearrange/musa/rearrange_musa.mu b/src/ops/rearrange/musa/rearrange_musa.mu new file mode 100644 index 00000000..887923b3 --- /dev/null +++ b/src/ops/rearrange/musa/rearrange_musa.mu @@ -0,0 +1,81 @@ +#include "../../../devices/musa/common_musa.h" +#include "rearrange_musa.h" + +template +static __global__ void rearrange( + void *__restrict__ dst, + int const rsa, + int const csa, + void const *__restrict__ src, + int const rsb, + int const csb, + unsigned int const ncols) { + + auto row = blockIdx.y, + col = blockIdx.x * blockDim.y + threadIdx.y; + if (col >= ncols) return; + + auto thread = threadIdx.x, + warp_size = blockDim.x; + auto i = (row * rsa + col * csa) * warp_size + thread; + auto j = (row * rsb + col * csb) * warp_size + thread; + + reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; +} 
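+// Each thread copies one element of the templated memory type: blockIdx.y selects the row,
+// blockIdx.x * blockDim.y + threadIdx.y selects the column, and threadIdx.x indexes the
+// warp lane, so a full warp moves one contiguous unit.
+// rearrange_mt_gpu below chooses the element width from unit / WARP_SIZE and falls back to
+// a single musaMemcpyAsync when the descriptor degenerates to r == 1 && c == 1.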
+ + +void rearrange_mt_gpu(RearrangeMusaDescriptor_t desc, void *y, void const *x, void *stream) { + auto musa_stream = reinterpret_cast(stream); + auto unit = desc->unit, + r = desc->r, c = desc->c; + auto dst_rs = desc->dst_rs, dst_cs = desc->dst_cs, + src_rs = desc->src_rs, src_cs = desc->src_cs; + + if (r == 1 && c == 1) { + musaMemcpyAsync(y, x, unit, musaMemcpyDeviceToDevice, musa_stream); + return; + } + + auto warps = 1024 / WARP_SIZE; + auto grid = dim3((c + warps - 1) / warps, r); + auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x); + dst_rs /= unit; + dst_cs /= unit; + src_rs /= unit; + src_cs /= unit; + + switch (unit / WARP_SIZE) { + case 1: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 2: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 4: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 8: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 16: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + case 32: + rearrange<<>>(y, dst_rs, dst_cs, x, src_rs, src_cs, c); + break; + default: + break; + } +} +infiniopStatus_t musaRearrange(RearrangeMusaDescriptor_t desc, + void *dst, void const *src, void *stream) { + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + rearrange_mt_gpu(desc, dst, src, stream); + return STATUS_SUCCESS; +} diff --git a/src/ops/rearrange/operator.cc b/src/ops/rearrange/operator.cc new file mode 100644 index 00000000..4a922dc7 --- /dev/null +++ b/src/ops/rearrange/operator.cc @@ -0,0 +1,143 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/rearrange/rearrange.h" + +#ifdef ENABLE_CPU +#include "cpu/rearrange_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/rearrange.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +#include "bang/rearrange_bang.h" +//#include "bang/rearrange_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rearrange_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rearrange_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/rearrange_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateRearrangeDescriptor( + infiniopHandle_t handle, + infiniopRearrangeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle, + (RearrangeAclnnDescriptor_t *) desc_ptr, + dst, + src); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRearrangeDescriptor((MusaHandle_t)handle, 
(RearrangeMusaDescriptor_t *) desc_ptr, dst, src); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnRearrange((RearrangeAclnnDescriptor_t) desc, + dst, + src, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reform/bang/reform_bang.h b/src/ops/reform/bang/reform_bang.h deleted file mode 100644 index 2c65d52c..00000000 --- a/src/ops/reform/bang/reform_bang.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __BANG_REFORM_H__ -#define __BANG_REFORM_H__ - -#include "../../utils.h" -#include "cnrt.h" -#include "operators.h" - -struct ReformBangDescriptor { - Device device; -}; - -void reform_bang(Tensor y, Tensor x, void *stream); - -#endif// __BANG_REFORM_H__ diff --git a/src/ops/reform/bang/reform_bang.mlu b/src/ops/reform/bang/reform_bang.mlu deleted file mode 100644 index 130a6847..00000000 --- a/src/ops/reform/bang/reform_bang.mlu +++ /dev/null @@ -1,247 +0,0 @@ -#include -#include -#include "reform_bang.h" -#include "../../../devices/bang/common_bang.h" - -template -__mlu_device__ void reformKernel(T *source, T *destination, int *strideSrc, int *strideDest, int *shape, int n, int dimsize, int nDim){ - - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = nDim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = nDim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } - -} - -template -__mlu_global__ void reformUnion1(T *source, T *destination, int *strideSrc, int *strideDest, int *shape, int n, int dimsize, int ndim){ - - reformKernel(source, destination, strideSrc, strideDest, shape, n, dimsize, ndim); - -} - -void reform(cnrtQueue_t queue, void *y, void *x, int *y_stride, int *x_stride, int *shape, int n, int dimsize, int ndim){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformUnion1<<>>(x_, y_, x_stride, y_stride, shape, n, dimsize, ndim); - // cnrtQueueSync(queue); - -} -template -__mlu_global__ void reformDim_2(T *source, T *destination, int strideS_f, int strideD_f, int n, int dimsize){ - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % n) * strideS_f; - indd += (indi % n) * strideD_f; - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % n) * strideS_f; - indd += (indi % n) * strideD_f; - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } -} -void reformUnionDim_2(cnrtQueue_t queue, void *y, void *x , int strideS_f, int strideD_f, int n, int dimsize){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformDim_2<<>>(x_, y_, strideS_f, strideD_f, n, dimsize); - // cnrtQueueSync(queue); - -} -template -__mlu_global__ void reformDim_3(T *source, T *destination, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int dimsize){ - int startDim = n / middle; - if (dimsize * sizeof(T) > GDRAM_MAX_SIZE){ - int maxNum = GDRAM_MAX_SIZE / sizeof(T); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - for (int s = 0; s < repeat; s++){ - __memcpy(destination + indd + s * maxNum, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2GDRAM); - } - if (remain){ - __memcpy(destination + indd + repeat * maxNum, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2GDRAM); - } - } - } - else { - int remainT = n % taskDim; - int stepEasy = (n - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(destination + indd, source + inds, dimsize * sizeof(T), GDRAM2GDRAM); - } - } -} -void reformUnionDim_3(cnrtQueue_t queue, void *y, void *x, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int dimsize){ - - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - - cnrtDim3_t dim = {16, 1, 1}; - cnrtFunctionType_t ktype = CNRT_FUNC_TYPE_UNION1; - - reformDim_3<<>>(x_, y_, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, dimsize); - // cnrtQueueSync(queue); - -} -void reform_bang(Tensor y, Tensor x, void *stream) { - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - int ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - - int x_stride[ndim], y_stride[ndim], shape[ndim]; - int n = 1; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(x.layout->strides[i])/y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i])/y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - n *= shape[i]; - } - int dimsize = shape[ndim - 1]; - n /= dimsize; - auto queue = reinterpret_cast(stream); - if(ndim == 2){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - reformUnionDim_2(queue, y.data, x.data, strideS_f, strideD_f, n, dimsize); - } - else if(ndim == 3){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - reformUnionDim_3(queue, y.data, x.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, dimsize); - } - else{ - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - - reform(queue, y.data, x.data, mlu_strideY, mlu_strideX, mlu_shape, n, dimsize, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); - } - -} diff --git a/src/ops/reform/cpu/reform_cpu.cc b/src/ops/reform/cpu/reform_cpu.cc deleted file mode 100644 index 7296e414..00000000 --- a/src/ops/reform/cpu/reform_cpu.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "reform_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" -#include -#include - -inline int indices(int i, int ndim, int64_t *strides, uint64_t *shape) { - int ans = 0; - for (int j = ndim - 2; j >= 0; --j) { - ans += (i % shape[j]) * strides[j]; - i /= shape[j]; - } - return ans; -} - -void copy_contiguous(uint8_t *dst_ptr, uint8_t const *src_ptr, int n, Tensor y, Tensor x) { -#pragma omp parallel for - for (int i = 0; i < n; ++i) { - auto dst_offset = indices(i, y.layout->ndim, y.layout->strides, y.layout->shape); - auto src_offset 
= indices(i, y.layout->ndim, x.layout->strides, x.layout->shape); - std::memcpy(dst_ptr + dst_offset, src_ptr + src_offset, y.layout->shape[y.layout->ndim - 1] * y.layout->dt.size); - } -} - -union DataLayout_ { - DataLayout i; - unsigned short u; -}; - -void reform_cpu(Tensor y, Tensor x) { - DataLayout_ dl_y, dl_x; - dl_y.i = y.layout->dt; - dl_x.i = x.layout->dt; - ASSERT_EQ(dl_y.u, dl_x.u); - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - auto ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - unsigned int r = 0; - if (ndim == 2) { - r = y.layout->shape[0]; - } else if (ndim == 3) { - r = y.layout->shape[0] * y.layout->shape[1]; - } else { - for (int i = ndim - 3; i >= 1; --i) { - ASSERT_EQ(y.layout->shape[i] * y.layout->strides[i], y.layout->strides[i - 1]); - ASSERT_EQ(x.layout->shape[i] * x.layout->strides[i], x.layout->strides[i - 1]); - } - r = std::accumulate(y.layout->shape, y.layout->shape + ndim - 1, 1, std::multiplies()); - } - auto dst_ptr = reinterpret_cast(y.data); - auto src_ptr = reinterpret_cast(x.data); - - copy_contiguous(dst_ptr, src_ptr, r, y, x); -} diff --git a/src/ops/reform/cpu/reform_cpu.h b/src/ops/reform/cpu/reform_cpu.h deleted file mode 100644 index e0194cd5..00000000 --- a/src/ops/reform/cpu/reform_cpu.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __CPU_REFORM_H__ -#define __CPU_REFORM_H__ - -#include "operators.h" - -struct ReformCpuDescriptor { - Device device; -}; - -void reform_cpu(Tensor y, Tensor x); - -#endif// __CPU_REFORM_H__ diff --git a/src/ops/reform/cuda/reform.cu b/src/ops/reform/cuda/reform.cu deleted file mode 100644 index 1a82c8c0..00000000 --- a/src/ops/reform/cuda/reform.cu +++ /dev/null @@ -1,107 +0,0 @@ -#include "../../utils.h" -#include "reform.cuh" -#include - -template -static __global__ void reform( - void *__restrict__ dst, - unsigned int const rsa, - unsigned int const csa, - void const *__restrict__ src, - unsigned int const rsb, - unsigned int const csb, - unsigned int const ncols) { - - auto row = blockIdx.y, - col = blockIdx.x * blockDim.y + threadIdx.y; - if (col >= ncols) return; - - auto thread = threadIdx.x, - warp_size = blockDim.x; - auto i = (row * rsa + col * csa) * warp_size + thread; - auto j = (row * rsb + col * csb) * warp_size + thread; - - reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; -} - -union DataLayout_ { - DataLayout i; - unsigned short u; -}; - -void reform_nv_gpu(Tensor y, Tensor x, void *stream) { - DataLayout_ dl_y, dl_x; - dl_y.i = y.layout->dt; - dl_x.i = x.layout->dt; - ASSERT_EQ(dl_y.u, dl_x.u); - ASSERT_EQ(y.layout->ndim, x.layout->ndim); - auto ndim = y.layout->ndim; - ASSERT(ndim >= 2); - for (int i = 0; i < ndim; ++i) { - ASSERT_EQ(y.layout->shape[i], x.layout->shape[i]); - } - ASSERT_EQ(y.layout->strides[ndim - 1], y.layout->dt.size); - ASSERT_EQ(x.layout->strides[ndim - 1], x.layout->dt.size); - unsigned int r = 0, c = 0, b = 0; - unsigned int rsa = 0, csa = 0, rsb = 0, csb = 0; - if (ndim == 2) { - c = y.layout->shape[0]; - b = y.layout->shape[1]; - csa = y.layout->strides[0] / y.layout->dt.size; - csb = x.layout->strides[0] / x.layout->dt.size; - } else if (ndim == 3) { - r = y.layout->shape[0]; - c = y.layout->shape[1]; - b = y.layout->shape[2]; - csa = y.layout->strides[1] / y.layout->dt.size; - csb = x.layout->strides[1] / x.layout->dt.size; - rsa = y.layout->strides[0] / 
y.layout->dt.size; - rsb = x.layout->strides[0] / x.layout->dt.size; - } else { - for (int i = ndim - 3; i >= 1; --i) { - ASSERT_EQ(y.layout->shape[i] * y.layout->strides[i], y.layout->strides[i - 1]); - ASSERT_EQ(x.layout->shape[i] * x.layout->strides[i], x.layout->strides[i - 1]); - } - r = std::accumulate(y.layout->shape, y.layout->shape + ndim - 2, 1, std::multiplies()); - c = y.layout->shape[ndim - 2]; - b = y.layout->shape[ndim - 1]; - csa = y.layout->strides[ndim - 2] / y.layout->dt.size; - csb = x.layout->strides[ndim - 2] / x.layout->dt.size; - rsa = y.layout->strides[ndim - 3] / y.layout->dt.size; - rsb = x.layout->strides[ndim - 3] / x.layout->dt.size; - } - auto contiguous_bytes = b * y.layout->dt.size; - ASSERT_EQ(contiguous_bytes % WARP_SIZE, 0); - auto bytes_per_thread = contiguous_bytes / WARP_SIZE; - ASSERT(bytes_per_thread > 0 && bytes_per_thread <= 32 && (bytes_per_thread & (bytes_per_thread - 1)) == 0); - - auto dst_ptr = static_cast(reinterpret_cast(y.data)); - rsa /= b; - csa /= b; - auto src_ptr = static_cast(reinterpret_cast(x.data)); - rsb /= b; - csb /= b; - auto cuda_stream = reinterpret_cast(stream); - dim3 grid_dims = dim3((c + MAX_WARP_PER_BLOCK - 1) / MAX_WARP_PER_BLOCK, r); - dim3 block_dims = dim3(WARP_SIZE, (c + grid_dims.x - 1) / grid_dims.x); - switch (bytes_per_thread) { - case 1: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 2: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 4: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 8: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 16: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - case 32: - reform<<>>(dst_ptr, rsa, csa, src_ptr, rsb, csb, c); - break; - } -} diff --git a/src/ops/reform/cuda/reform.cuh b/src/ops/reform/cuda/reform.cuh deleted file mode 100644 index c1f6ebf6..00000000 --- a/src/ops/reform/cuda/reform.cuh +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __NV_GPU_REFORM_H__ -#define __NV_GPU_REFORM_H__ - -#include "../../../devices/cuda/common_cuda.h" -#include "operators.h" - -struct ReformCudaDescriptor { - Device device; -}; - -void reform_nv_gpu(Tensor y, Tensor x, void *stream); - -#endif// __NV_GPU_REFORM_H__ diff --git a/src/ops/reform/operator.cc b/src/ops/reform/operator.cc deleted file mode 100644 index bce59b04..00000000 --- a/src/ops/reform/operator.cc +++ /dev/null @@ -1,83 +0,0 @@ -#include "../utils.h" -#include "ops/reform/reform.h" - -#ifdef ENABLE_CPU -#include "cpu/reform_cpu.h" -#endif -#ifdef ENABLE_NV_GPU -#include "cuda/reform.cuh" -#endif -#ifdef ENABLE_CAMBRICON_MLU -#include "bang/reform_bang.h" -#endif - -struct ReformDescriptor { - Device device; -}; - -__C ReformDescriptor *createReformDescriptor(Device device, void *config) { - switch (device) { -#ifdef ENABLE_CPU - case DevCpu: - return (ReformDescriptor *) (new ReformCpuDescriptor{device}); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return (ReformDescriptor *) (new ReformCudaDescriptor{device}); - } -#endif -#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - return (ReformDescriptor *) (new ReformBangDescriptor{device}); - } -#endif - default: - PANIC(UnsupportedDevice); - } - return nullptr; -} - -__C void destroyReformDescriptor(ReformDescriptor *descriptor) { - switch (descriptor->device) { -#ifdef ENABLE_CPU - case DevCpu: - delete (ReformCpuDescriptor *) (descriptor); - break; -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (ReformCudaDescriptor *) (descriptor); - break; -#endif 
-#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - delete (ReformBangDescriptor *) (descriptor); - break; - } -#endif - default: - PANIC(UnsupportedDevice); - } -} - -__C void reform(ReformDescriptor *descriptor, Tensor y, Tensor x, void *stream) { - switch (descriptor->device) { -#ifdef ENABLE_CPU - case DevCpu: - reform_cpu(y, x); - break; -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: - reform_nv_gpu(y, x, stream); - break; -#endif -#ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - reform_bang(y, x, stream); - break; -#endif - default: - PANIC(UnsupportedDevice); - } -}; diff --git a/src/ops/relu/cpu/relu_cpu.cc b/src/ops/relu/cpu/relu_cpu.cc new file mode 100644 index 00000000..2ac7d324 --- /dev/null +++ b/src/ops/relu/cpu/relu_cpu.cc @@ -0,0 +1,72 @@ +#include "relu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t, + ReluCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluCpuDescriptor{ + DevCpu, + y->dt, + data_size, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t relu_cpu(ReluCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + +#pragma omp parallel for + for (uint64_t i = 0; i < desc->data_size; ++i) { + if constexpr (std::is_same::value) { + float x_f32 = f16_to_f32(x_[i]); + y_[i] = f32_to_f16(x_f32 < 0 ? 0 : x_f32); + } else { + Tdata x_val = x_[i]; + y_[i] = x_val < 0 ? 
0 : x_val; + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc, + void *y, void const *x, + void *stream) { + if (desc->dtype == F16) { + return relu_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return relu_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/cpu/relu_cpu.h b/src/ops/relu/cpu/relu_cpu.h new file mode 100644 index 00000000..e4e51532 --- /dev/null +++ b/src/ops/relu/cpu/relu_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CPU_RELU_H__ +#define __CPU_RELU_H__ + +#include "operators.h" +#include + +struct ReluCpuDescriptor { + Device device; + DT dtype; + uint64_t data_size; +}; + +typedef struct ReluCpuDescriptor *ReluCpuDescriptor_t; + +infiniopStatus_t cpuCreateReluDescriptor(infiniopHandle_t, + ReluCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cpuRelu(ReluCpuDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cpuDestroyReluDescriptor(ReluCpuDescriptor_t desc); + +#endif diff --git a/src/ops/relu/cuda/relu.cc b/src/ops/relu/cuda/relu.cc new file mode 100644 index 00000000..3dfadd8a --- /dev/null +++ b/src/ops/relu/cuda/relu.cc @@ -0,0 +1,45 @@ +#include "relu.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateReluDescriptor(CudaHandle_t handle, + ReluCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyReluDescriptor(ReluCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/relu/cuda/relu.cu b/src/ops/relu/cuda/relu.cu new file mode 100644 index 00000000..7c9884e6 --- /dev/null +++ b/src/ops/relu/cuda/relu.cu @@ -0,0 +1,111 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "relu.cuh" + +/** + * @brief A templated vector struct that supports applying relu on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. sizeof(T) should + * be >= sizeof(TComp) + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + // Constructor that initializes the data array with type TComp + __device__ __forceinline__ constexpr vecN(const TComp &val) { + const auto data_ = reinterpret_cast(data); + const auto size = N * pack_size; +#pragma unroll + for (size_t i = 0; i < size; ++i) { + data_[i] = 0; + } + } + + // Assignment operator with relu assignment logic + __device__ __forceinline__ vecN &operator=(const vecN &other) { + if constexpr (std::is_same::value) { +#pragma unroll + for (int i = 0; i < N; ++i) { + data[i] = other.data[i] < TComp(0) ? TComp(0) : other.data[i]; + } + } else { + auto *data_this = reinterpret_cast *>(data); + auto *data_other = reinterpret_cast *>(other.data); +#pragma unroll + for (int i = 0; i < N; ++i) { + data_this[i] = data_other[i]; + } + } + return *this; + } + + // Always returns false since the actual relu logic is in the assignment process + __device__ __forceinline__ bool operator<(const vecN &other) const { + return false; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void relu( + Tdata *y, + const Tdata *x, + uint64_t data_size, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + y[idx] = x[idx] < Tdata(0) ? Tdata(0) : x[idx]; + } +} + +template +void relu_nv_gpu(ReluCudaDescriptor_t desc, Tdata *y, Tdata const *x, uint64_t data_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + relu<<>>(y, x, offset + data_size, offset + i); + } +} + +template +infiniopStatus_t relu_nv_gpu(ReluCudaDescriptor_t desc, void *y, void const *x, void *stream, uint64_t pack_size) { + const auto data_size = desc->data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + relu_nv_gpu(desc, y_vec, x_vec, data_size, 0, stream); + + const auto remainder = desc->data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + relu_nv_gpu(desc, y_, x_, remainder, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaRelu(ReluCudaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return relu_nv_gpu, half>(desc, y, x, stream, 4); + } + if (desc->dtype == F32) { + return relu_nv_gpu, float>(desc, y, x, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/cuda/relu.cuh b/src/ops/relu/cuda/relu.cuh new file mode 100644 index 00000000..82020eb6 --- /dev/null +++ b/src/ops/relu/cuda/relu.cuh @@ -0,0 +1,32 @@ +#ifndef __CUDA_RELU_H__ +#define __CUDA_RELU_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ReluCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; +}; + +typedef struct ReluCudaDescriptor *ReluCudaDescriptor_t; + +infiniopStatus_t cudaCreateReluDescriptor(CudaHandle_t, + ReluCudaDescriptor_t *, 
+ infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t cudaRelu(ReluCudaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyReluDescriptor(ReluCudaDescriptor_t desc); + +#endif diff --git a/src/ops/relu/musa/relu_musa.cc b/src/ops/relu/musa/relu_musa.cc new file mode 100644 index 00000000..6baaef18 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.cc @@ -0,0 +1,45 @@ +#include "relu_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateReluDescriptor(MusaHandle_t handle, + ReluMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + uint64_t ndim = y->ndim; + if (ndim != x->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (y->shape[i] != x->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + *desc_ptr = new ReluMusaDescriptor{ + DevMthreadsGpu, + y->dt, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyReluDescriptor(ReluMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/relu/musa/relu_musa.h b/src/ops/relu/musa/relu_musa.h new file mode 100644 index 00000000..84276369 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.h @@ -0,0 +1,32 @@ +#ifndef __MUSA_RELU_H__ +#define __MUSA_RELU_H__ + +#include "../../../devices/musa/common_musa.h" +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" +#include +#include + +struct ReluMusaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; +}; + +typedef struct ReluMusaDescriptor *ReluMusaDescriptor_t; + +infiniopStatus_t musaCreateReluDescriptor(MusaHandle_t, + ReluMusaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +infiniopStatus_t musaRelu(ReluMusaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t musaDestroyReluDescriptor(ReluMusaDescriptor_t desc); + +#endif diff --git a/src/ops/relu/musa/relu_musa.mu b/src/ops/relu/musa/relu_musa.mu new file mode 100644 index 00000000..3d91b4e2 --- /dev/null +++ b/src/ops/relu/musa/relu_musa.mu @@ -0,0 +1,111 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "relu_musa.h" + +/** + * @brief A templated vector struct that supports applying relu on arrays. + * + * @tparam T - The access data type for elements in the vector. + * @tparam TComp - The computation data type used for arithmetic operations. sizeof(T) should + * be >= sizeof(TComp) + * @tparam N - The number of elements of type T in the vector for a single access. 
+ */ +template +struct vecN { + T data[N]; + constexpr static size_t pack_size = sizeof(T) / sizeof(TComp); + + // Constructor that initializes the data array with type TComp + __device__ __forceinline__ constexpr vecN(const TComp &val) { + const auto data_ = reinterpret_cast(data); + const auto size = N * pack_size; +#pragma unroll + for (size_t i = 0; i < size; ++i) { + data_[i] = 0; + } + } + + // Assignment operator with relu assignment logic + __device__ __forceinline__ vecN &operator=(const vecN &other) { + if constexpr (std::is_same::value) { +#pragma unroll + for (int i = 0; i < N; ++i) { + data[i] = other.data[i] < TComp(0) ? TComp(0) : other.data[i]; + } + } else { + auto *data_this = reinterpret_cast *>(data); + auto *data_other = reinterpret_cast *>(other.data); +#pragma unroll + for (int i = 0; i < N; ++i) { + data_this[i] = data_other[i]; + } + } + return *this; + } + + // Always returns false since the actual relu logic is in the assignment process + __device__ __forceinline__ bool operator<(const vecN &other) const { + return false; + } + + __device__ __forceinline__ const T &operator[](size_t i) const { + return data[i]; + } +}; + +template +__global__ void relu( + Tdata *y, + const Tdata *x, + uint64_t data_size, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if (idx < data_size) { + y[idx] = x[idx] < Tdata(0) ? Tdata(0) : x[idx]; + } +} + +template +void relu_mt_gpu(ReluMusaDescriptor_t desc, Tdata *y, Tdata const *x, uint64_t data_size, uint64_t offset, void *stream) { + if (data_size == 0) { + return; + } + dim3 blockDims = dim3(std::min(static_cast(256), data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + musaStream_t musa_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < data_size; i += step) { + relu<<>>(y, x, offset + data_size, offset + i); + } +} + +template +infiniopStatus_t relu_mt_gpu(ReluMusaDescriptor_t desc, void *y, void const *x, void *stream, uint64_t pack_size) { + const auto data_size = desc->data_size / pack_size; + const auto x_vec = reinterpret_cast(x); + const auto y_vec = reinterpret_cast(y); + relu_mt_gpu(desc, y_vec, x_vec, data_size, 0, stream); + + const auto remainder = desc->data_size % pack_size; + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + relu_mt_gpu(desc, y_, x_, remainder, data_size * pack_size, stream); + return STATUS_SUCCESS; +} + +infiniopStatus_t musaRelu(ReluMusaDescriptor_t desc, + void *y, void const *x, + void *stream) { + checkMusaError(musaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return relu_mt_gpu, half>(desc, y, x, stream, 4); + } + if (desc->dtype == F32) { + return relu_mt_gpu, float>(desc, y, x, stream, 4); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/relu/operator.cc b/src/ops/relu/operator.cc new file mode 100644 index 00000000..7a3a2e2f --- /dev/null +++ b/src/ops/relu/operator.cc @@ -0,0 +1,91 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/relu/relu.h" + +#ifdef ENABLE_CPU +#include "cpu/relu_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/relu.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/relu_musa.h" +#endif + + +__C infiniopStatus_t infiniopCreateReluDescriptor( + infiniopHandle_t handle, + infiniopReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x) { + 
switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReluDescriptor(handle, (ReluCpuDescriptor_t *) desc_ptr, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateReluDescriptor((CudaHandle_t) handle, (ReluCudaDescriptor_t *) desc_ptr, y, x); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateReluDescriptor((MusaHandle_t) handle, (ReluMusaDescriptor_t *) desc_ptr, y, x); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopRelu(infiniopReluDescriptor_t desc, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuRelu((ReluCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaRelu((ReluCudaDescriptor_t) desc, y, x, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRelu((ReluMusaDescriptor_t) desc, y, x, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReluDescriptor((ReluCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyReluDescriptor((ReluCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyReluDescriptor((ReluMusaDescriptor_t) desc); + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.cc b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc new file mode 100644 index 00000000..d264be39 --- /dev/null +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.cc @@ -0,0 +1,215 @@ +#include "rms_norm_aclnn.h" + +RMSNormAclnnDescriptor::RMSNormAclnnDescriptor(Device _device) { + device = _device; + device_id = 0; + executor = nullptr; + castExecutor = nullptr; + workspaceSize = 0; + castWorkspaceSize = 0; + yDesc = new aclnnTensorDescriptor(); + xDesc = new aclnnTensorDescriptor(); + wDesc = new aclnnTensorDescriptor(); + rstdDesc = new aclnnTensorDescriptor(); + castDesc = nullptr; + epsilon = 1e-5; +} + + +infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, + RMSNormAclnnDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + float eps) { + *desc_ptr = new RMSNormAclnnDescriptor(handle->device); + (*desc_ptr)->device_id = handle->device_id; + (*desc_ptr)->epsilon = static_cast(eps); + + auto &yDesc = (*desc_ptr)->yDesc; + auto &xDesc = (*desc_ptr)->xDesc; + auto &wDesc = (*desc_ptr)->wDesc; + auto &castDesc = (*desc_ptr)->castDesc; + auto &rstdDesc = (*desc_ptr)->rstdDesc; + + CHECK_STATUS(yDesc->fromInfiniOpTensorDescriptor(y), STATUS_SUCCESS); + CHECK_STATUS(xDesc->fromInfiniOpTensorDescriptor(x), STATUS_SUCCESS); + CHECK_STATUS(wDesc->fromInfiniOpTensorDescriptor(w), STATUS_SUCCESS); + + // Set rstdDesc + // See: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnRmsNorm.md + // rstdTensor cannot set nullptr in aclnn + int64_t wsize = 1; + for (uint64_t i = 0; i < wDesc->ndim; ++i) { + wsize *= (wDesc->shape)[i]; + } + int64_t xsize = 1; + uint64_t rstd_dim = xDesc->ndim - 1; + for (int64_t i = xDesc->ndim - 1; i >= 0; --i) { + xsize *= (xDesc->shape)[i]; + rstd_dim = 
static_cast(i); + if (xsize == wsize) { + break; + } + } + + auto rstd_shape = std::vector(xDesc->ndim, 1); + auto rstd_strides = std::vector(xDesc->ndim, 1); + + for (uint64_t i = 0; i < rstd_dim; ++i) { + rstd_shape[i] = (xDesc->shape)[i]; + } + for (int64_t i = xDesc->ndim - 2; i >= 0; --i) { + rstd_strides[i] = rstd_strides[i + 1] * rstd_shape[i + 1]; + } + CHECK_STATUS(rstdDesc->setDescriptor(toAclDataType(F32), rstd_shape, rstd_strides), STATUS_SUCCESS); + + if (wDesc->dataType != xDesc->dataType) { + castDesc = new aclnnTensorDescriptor(); + CHECK_STATUS(castDesc->fromInfiniOpTensorDescriptor(w), STATUS_SUCCESS); + castDesc->dataType = xDesc->dataType; + CHECK_STATUS(castDesc->createTensor(), STATUS_SUCCESS); + } + + CHECK_STATUS(yDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(xDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(wDesc->createTensor(), STATUS_SUCCESS); + CHECK_STATUS(rstdDesc->createTensor(), STATUS_SUCCESS); + + // Get Tensor + aclTensor *ty = yDesc->t; + aclTensor *tx = xDesc->t; + aclTensor *tw = wDesc->t; + aclTensor *trstd = rstdDesc->t; + + // Get workspaceSize and set executor + auto &workspaceSize = (*desc_ptr)->workspaceSize; + auto &executor = (*desc_ptr)->executor; + auto ret = aclnnRmsNormGetWorkspaceSize(tx, + castDesc == nullptr ? tw + : castDesc->t, + (*desc_ptr)->epsilon, + ty, + trstd, + &workspaceSize, + &executor); + aclSetAclOpExecutorRepeatable(executor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnRmsNormGetWorkspaceSize failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + // Get Cast workspaceSize and set castExecutor + if (castDesc != nullptr) { + auto &castExecutor = (*desc_ptr)->castExecutor; + auto &castWorkspaceSize = (*desc_ptr)->castWorkspaceSize; + aclTensor *tcast = castDesc->t; + ret = aclnnCastGetWorkspaceSize(tw, + castDesc->dataType, + tcast, + &castWorkspaceSize, + &castExecutor); + aclSetAclOpExecutorRepeatable(castExecutor); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnCastGetWorkspaceSize failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnGetRMSNormWorkspaceSize(RMSNormAclnnDescriptor_t desc, + uint64_t *size) { + auto &rstdDesc = desc->rstdDesc; + auto &castDesc = desc->castDesc; + + *size = desc->workspaceSize + + numElements(rstdDesc->shape.data(), rstdDesc->ndim) * aclDataTypeSize(rstdDesc->dataType); + + if (castDesc != nullptr) { + *size += desc->castWorkspaceSize; + *size += numElements(castDesc->shape.data(), castDesc->ndim) * aclDataTypeSize(castDesc->dataType); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnRMSNorm(RMSNormAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w, + void *stream) { + auto &yDesc = desc->yDesc; + auto &xDesc = desc->xDesc; + auto &wDesc = desc->wDesc; + auto &rstdDesc = desc->rstdDesc; + auto &castDesc = desc->castDesc; + + // Get Tensor + aclTensor *ty = yDesc->t; + aclTensor *tx = xDesc->t; + aclTensor *tw = wDesc->t; + aclTensor *trstd = rstdDesc->t; + + auto &executor = desc->executor; + auto &castExecutor = desc->castExecutor; + auto &workspaceSize = desc->workspaceSize; + auto &castWorkspaceSize = desc->castWorkspaceSize; + + auto rstd = (void *) ((uint8_t *) workspace + workspaceSize); + + // Set device + aclrtSetDevice(desc->device_id); + aclnnStatus ret; + + void *castPtr = nullptr; + + // Cast w + if (castDesc != nullptr) { + aclTensor *tcast = castDesc->t; + castPtr = (void *) ((float *) rstd + numElements(rstdDesc->shape.data(), rstdDesc->ndim)); + + AclSetTensorAddr(castExecutor, 0, tw, (void *) w); + AclSetTensorAddr(castExecutor, 1, tcast, castPtr); + ret = aclnnCast(nullptr, castWorkspaceSize, castExecutor, stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnCast failed. ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + } + + // Do RmsNorm calc + AclSetTensorAddr(executor, 0, tx, (void *) x); + if (castDesc != nullptr) { + AclSetTensorAddr(executor, 1, castDesc->t, castPtr); + } else { + AclSetTensorAddr(executor, 1, tw, (void *) w); + } + AclSetTensorAddr(executor, 2, ty, y); + AclSetTensorAddr(executor, 3, trstd, rstd); + + ret = aclnnRmsNorm(workspace, + workspaceSize, + executor, + stream); + CHECK_RET(ret == ACL_SUCCESS, + LOG_PRINT("aclnnRmsNorm failed. 
ERROR: %d\n", ret); + return STATUS_EXECUTION_FAILED); + + return STATUS_SUCCESS; +} + +infiniopStatus_t aclnnDestroyRMSNormDescriptor(RMSNormAclnnDescriptor_t desc) { + delete desc->yDesc; + delete desc->wDesc; + delete desc->xDesc; + delete desc->rstdDesc; + aclDestroyAclOpExecutor(desc->executor); + if (desc->castDesc != nullptr) { + delete desc->castDesc; + aclDestroyAclOpExecutor(desc->castExecutor); + } + delete desc; + + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/ascend/rms_norm_aclnn.h b/src/ops/rms_norm/ascend/rms_norm_aclnn.h new file mode 100644 index 00000000..2999fefd --- /dev/null +++ b/src/ops/rms_norm/ascend/rms_norm_aclnn.h @@ -0,0 +1,49 @@ +#ifndef __ACLNN_RMS_NORM_H__ +#define __ACLNN_RMS_NORM_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include +#include +#include +#include +#include + +struct RMSNormAclnnDescriptor { + Device device; + int device_id; + aclOpExecutor *executor; + aclOpExecutor *castExecutor; + aclnnTensorDescriptor_t yDesc, xDesc, wDesc, rstdDesc, castDesc; + uint64_t workspaceSize; + uint64_t castWorkspaceSize; + double epsilon; + + RMSNormAclnnDescriptor(Device device); +}; + +typedef RMSNormAclnnDescriptor *RMSNormAclnnDescriptor_t; + +infiniopStatus_t aclnnCreateRMSNormDescriptor(AscendHandle_t handle, + RMSNormAclnnDescriptor_t *desc, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t w, + float eps); + +infiniopStatus_t aclnnGetRMSNormWorkspaceSize(RMSNormAclnnDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t aclnnRMSNorm(RMSNormAclnnDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, + void const *x, + void const *w, + void *stream); + +infiniopStatus_t aclnnDestroyRMSNormDescriptor(RMSNormAclnnDescriptor_t desc); + +#endif diff --git a/src/ops/rms_norm/bang/rms_norm_bang.cc b/src/ops/rms_norm/bang/rms_norm_bang.cc new file mode 100644 index 00000000..fbf7f689 --- /dev/null +++ b/src/ops/rms_norm/bang/rms_norm_bang.cc @@ -0,0 +1,44 @@ +#include "rms_norm_bang.h" +#include "../../utils.h" +infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, RMSNormBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormBangDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyRMSNormDescriptor(RMSNormBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/bang/rms_norm_bang.h b/src/ops/rms_norm/bang/rms_norm_bang.h index 26187c97..bfd94158 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.h +++ b/src/ops/rms_norm/bang/rms_norm_bang.h @@ -1,10 +1,39 @@ #ifndef __BANG_RMS_NORM_H__ #define __BANG_RMS_NORM_H__ +#include 
"../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" #include "operators.h" -void rms_norm_bang_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +struct RMSNormBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormBangDescriptor *RMSNormBangDescriptor_t; + +infiniopStatus_t bangCreateRMSNormDescriptor(BangHandle_t handle, + RMSNormBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t bangGetRMSNormWorkspaceSize(RMSNormBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t bangDestroyRMSNormDescriptor(RMSNormBangDescriptor_t desc); #endif// __BANG_RMS_NORM_H__ diff --git a/src/ops/rms_norm/bang/rms_norm_bang.mlu b/src/ops/rms_norm/bang/rms_norm_bang.mlu index 6b4dcfc3..755e1e3c 100644 --- a/src/ops/rms_norm/bang/rms_norm_bang.mlu +++ b/src/ops/rms_norm/bang/rms_norm_bang.mlu @@ -1,143 +1,148 @@ #include "bang.h" -#include "bang_device_functions.h" #include "cnrt.h" #include "rms_norm_bang.h" #include "../../../devices/bang/common_bang.h" -const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 +const int SRC_MAX_SIZE = 1024 * 64;//尽量取大一些 __nram__ char nram_buffer[NRAM_MAX_SIZE]; -const int wSize = 64; -template -__mlu_device__ void rmsNormKernel(T *destination, T const *source, T const *weight, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, float eps, int ndim) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); +template +__mlu_global__ void rms_norm(T *destination, T const *source, float const *weight, int stride_y, int stride_x, float eps, int othersize, int dimsize, int dimS){ + const int maxNum = SRC_MAX_SIZE/sizeof(float); + int wSize = 128 / sizeof(T); + + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); + if(dimsize >= maxNum){ - + + char *nram_buffer1 = nram_buffer + (2 * maxNum + 3 * wSize) * sizeof(T); T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] + T *wet = src + maxNum;//[maxNum] + T *destSumFinal = wet + maxNum;//[wSize] T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - + T *srcTmp = destSum + wSize;//[wSize] + __bang_write_zero(srcTmp, wSize); + float *wetTmp = (float *)nram_buffer1; + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int segNum = maxNum / wSize;//准备数值求和 - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ int inds = 0; int indd = 0; int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __bang_write_zero(destSumFinal, wSize); + __bang_write_zero(destSum, wSize); for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, maxNum * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize); + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - if(remain){ - tidS = inds + repeat * maxNum; __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __bang_write_zero(destSum, wSize); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, remain * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize); + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); + destSumFinal[0] += eps; + destSumFinal[0] = pow(destSumFinal[0], 0.5); T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - - __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(wetTmp, weight + s * maxNum, maxNum * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, maxNum); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, 
globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); + __memcpy(wetTmp, weight + repeat * maxNum, remain * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, maxNum); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } } } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - + else{ + char *nram_buffer1 = nram_buffer + (2 * dimsize + 2 * wSize + dimS) * sizeof(T); + T *src = (T *)nram_buffer;//[dimsize] + T *wet = src + dimsize;//[dimsize] + T *destSumFinal = wet + dimsize;//[wSize] + T *destSum = destSumFinal + wSize;//[dimS] + T *srcTmp = destSum + dimS; + __bang_write_zero(srcTmp, wSize); + float *wetTmp = (float *)nram_buffer1; + + + int segNum = dimS / wSize; + for(int i = indStart; i < indStart + step; i++){ + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, wSize); int inds = 0; int indd = 0; - int indi = i ; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * strideSrc[j]; - indd += (indi % shape[j]) * strideDest[j]; - indi /= shape[j]; - } + int indi = i; + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); + if(dimS >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSumFinal, srcTmp, wSize); } - __bang_reduce_sum(destSumFinal, destSum, wSize); destSumFinal[0] /= dimsize; destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); + destSumFinal[0] = pow(destSumFinal[0], 0.5); + T globalSumInv = 1.0 / destSumFinal[0]; + __memcpy(wetTmp, weight, dimsize * sizeof(float), GDRAM2NRAM); + __bang_float2half_dn(wet, wetTmp, dimsize); + __bang_mul(src, src, wet, dimsize);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, dimsize); __memcpy(destination + indd, src, 
dimsize * sizeof(T), NRAM2GDRAM); } @@ -145,336 +150,136 @@ __mlu_device__ void rmsNormKernel(T *destination, T const *source, T const *weig } template -__mlu_global__ void rmsNormUnion1(T *mlu_destination, T const *mlu_src, T const *mlu_weight, int *strideSrc, int *strideDest, int *shape, int othersize, int dimsize, int dimS, float eps, int ndim) { - - rmsNormKernel(mlu_destination, mlu_src, mlu_weight, strideSrc, strideDest, shape, othersize, dimsize, dimS, eps, ndim); -} - -template -void rmsNorm(cnrtQueue_t queue, void *y, void const *x, void const *w, int *strideSrc, int *strideDest, int *shape, int n, int d, float eps, int ndim) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); - - int dimS; - float mi = log2(d); - if (floor(mi) == mi) { - dimS = d; - } else { - dimS = pow(2, floor(mi) + 1); - } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; +__mlu_global__ void rms_norm(T *destination, T const *source, T const *weight, int stride_y, int stride_x, float eps, int othersize, int dimsize, int dimS){ + const int maxNum = SRC_MAX_SIZE/sizeof(T); + int wSize = 128 / sizeof(T); - rmsNormUnion1<<>>(y_, x_, w_, strideSrc, strideDest, shape, n, d, dimS, eps, ndim); - // cnrtQueueSync(queue); -} + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); -void rmsNorm_fp16(cnrtQueue_t queue, void *y, void const *x, void const *w, int *strideSrc, int *strideDest, int *shape, int n, int d, float eps, int ndim) { - rmsNorm(queue, y, x, w, strideSrc, strideDest, shape, n, d, eps, ndim); -} -template -__mlu_global__ void rmsNormDim_2(T *destination, T const *source, T const *weight, int strideS_f, int strideD_f, int othersize, int dimsize, int dimS, float eps) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); if(dimsize >= maxNum){ - + T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] + T *wet = src + maxNum;//[maxNum] + T *destSumFinal = wet + maxNum;//[wSize] T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - + T *srcTmp = destSum + wSize;//[wSize] + __bang_write_zero(srcTmp, wSize); + int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int segNum = maxNum / wSize;//准备数值求和 - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ int inds = 0; int indd = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; __bang_write_zero(destSumFinal, wSize); + __bang_write_zero(destSum, wSize); for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, maxNum * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - if(remain){ - tidS = inds + repeat * maxNum; __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __bang_write_zero(destSum, wSize); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + if(maxNum >= wSize){ + for(int strip = segNum / 2; strip > 0; strip = strip / 2){ + for(int j = 0; j < strip; j++){ + __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); + } } + __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); + } + else{ + __memcpy(srcTmp, src, remain * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSum, srcTmp, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 + __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); + destSumFinal[0] += eps; + destSumFinal[0] = pow(destSumFinal[0], 0.5); T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - + __memcpy(src, source + inds + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + 
s * maxNum, src, maxNum * sizeof(T), NRAM2GDRAM); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); + __memcpy(src, source + inds + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); __bang_mul(src, src, wet, maxNum);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); + __memcpy(destination + indd + repeat * maxNum, src, remain * sizeof(T), NRAM2GDRAM); } } } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i ; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); - } - } - __bang_reduce_sum(destSumFinal, destSum, wSize); - destSumFinal[0] /= dimsize; - destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); - __bang_mul_scalar(src, src, globalSumInv, dimsize); - __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - } -} - - - -template -void rmsNormUnionDim_2(cnrtQueue_t queue, void *y, void const *x, void const *w, int strideS_f, int strideD_f, int n, int d, float eps) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); + else{ - int dimS; - float mi = log2(d); - if (floor(mi) == mi) { - dimS = d; - } else { - dimS = pow(2, floor(mi) + 1); - } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; + T *src = (T *)nram_buffer;//[dimsize] + T *wet = src + dimsize;//[dimsize] + T *destSumFinal = wet + dimsize;//[wSize] + T *destSum = destSumFinal + wSize;//[dimS] + T *srcTmp = destSum + dimS;//[wSize] - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - rmsNormDim_2<<>>(y_, x_, w_, strideS_f, strideD_f, n, d, dimS, eps); - // cnrtQueueSync(queue); -} -template -__mlu_global__ void rmsNormDim_3(T *destination, T const *source, T const *weight, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize, int dimS, float eps) {//axis=-1 - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - int startDim = othersize / middle; - if(dimsize >= maxNum){ - - T *src = (T *)nram_buffer;//[maxNum] - T *destSumFinal = src + maxNum;//[wSize] - T *destSum = destSumFinal + wSize;//[wSize] - T *wet = destSum + wSize;//[maxNum] - - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / 
maxNum; - int tidS; - int tidD; + int segNum = dimS / wSize; - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - for(int i = indStart; i < indStart + step; i++){ + __bang_write_zero(destSum, dimS); + __bang_write_zero(destSumFinal, wSize); int inds = 0; int indd = 0; int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __bang_write_zero(destSumFinal, wSize); - for(int s = 0; s < repeat; s++){ - __bang_write_zero(destSum, wSize); - tidS = inds + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j + strip) * wSize, wSize); - } - } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); - } - - if(remain){ - tidS = inds + repeat * maxNum; - __bang_write_zero(src, maxNum); - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, src, maxNum);//src = src * src - int segNum = maxNum / wSize;//准备数值求和 + inds += (indi % othersize) * stride_x; + indd += (indi % othersize) * stride_y; + __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); + __bang_mul(destSum, src, src, dimsize);//src = src * src + if(dimS >= wSize){ for(int strip = segNum / 2; strip > 0; strip = strip / 2){ for(int j = 0; j < strip; j++){ - __bang_add(src + j * wSize, src + j * wSize, src + (j+ strip) * wSize, wSize); + __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); } } - __bang_reduce_sum(destSum, src, wSize);//此时destSum[0]保存的就是当前maxNum长度数据的数值和 - __bang_add(destSumFinal, destSumFinal, destSum, wSize); + __bang_reduce_sum(destSumFinal, destSum, wSize); } - - destSumFinal[0] += eps; - destSumFinal[0] /= dimsize; - destSum[0] = pow(destSum[0], 0.5); - T globalSumInv = 1.0 / destSumFinal[0]; - - // 写回 global memory - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, source + tidS, maxNum * sizeof(T), GDRAM2NRAM); - - __memcpy(wet, weight + s * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, wet, maxNum);//src = src * wet - __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, source + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(wet, weight + repeat * maxNum, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, wet, maxNum);//src = src * wet - __bang_mul_scalar(src, src, globalSumInv, maxNum); - __memcpy(destination + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{//dimsize < maxNum - - T *src = (T *)nram_buffer; - T *wet = src + dimsize; - T *destSum = wet + dimsize; - T *destSumFinal = destSum + dimS; - - __bang_write_zero(destSum, dimS); - __bang_write_zero(destSumFinal, dimS); - __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); - - int remainT = othersize % taskDim; - int 
stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i ; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(src, source + inds, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(destSum, src, src, dimsize);//src = src * src - int segNum = dimS / wSize; - for(int strip = segNum / 2; strip > 0; strip = strip / 2){ - for(int j = 0; j < strip; j++){ - __bang_add(destSum + j * wSize, destSum + j * wSize, destSum + (j + strip) * wSize, wSize); - } + else{ + __memcpy(srcTmp, destSum, dimsize * sizeof(T), NRAM2NRAM); + __bang_reduce_sum(destSumFinal, srcTmp, wSize); + } - __bang_reduce_sum(destSumFinal, destSum, wSize); destSumFinal[0] /= dimsize; destSumFinal[0] += eps; - T globalSum = pow(destSumFinal[0], 0.5); - T globalSumInv = 1.0 / globalSum; - __bang_mul(src, src, wet, dimsize); + destSumFinal[0] = pow(destSumFinal[0], 0.5); + T globalSumInv = 1.0 / destSumFinal[0]; + __memcpy(wet, weight, dimsize * sizeof(T), GDRAM2NRAM); + __bang_mul(src, src, wet, dimsize);//src = src * wet __bang_mul_scalar(src, src, globalSumInv, dimsize); __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); } @@ -482,14 +287,15 @@ __mlu_global__ void rmsNormDim_3(T *destination, T const *source, T const *weigh } +template +void rms_normUnion(cnrtQueue_t queue, T *y, T const *x, Tw const *w, int stride_y, int stride_x, float epsilon, int n, int d){ + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; -template -void rmsNormUnionDim_3(cnrtQueue_t queue, void *y, void const *x, void const *w, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int n, int middle, int d, float eps) { - const int wSize = 128 / sizeof(T); - auto y_ = reinterpret_cast(y); - auto x_ = reinterpret_cast(x); - auto w_ = reinterpret_cast(w); - + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; int dimS; float mi = log2(d); if (floor(mi) == mi) { @@ -497,74 +303,45 @@ void rmsNormUnionDim_3(cnrtQueue_t queue, void *y, void const *x, void const *w, } else { dimS = pow(2, floor(mi) + 1); } - if (dimS < wSize) { - dimS = wSize; - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; + rms_norm<<>>(y, x, w, stride_y, stride_x, epsilon, n, d, dimS); + cnrtQueueSync(queue); - rmsNormDim_3<<>>(y_, x_, w_, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, dimS, eps); - // cnrtQueueSync(queue); } - -void rms_norm_bang_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - int num = 1; - int ndim = y.layout->ndim; - int x_stride[ndim], y_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - x_stride[i] = static_cast(x.layout->strides[i]) / y.layout->dt.size; - y_stride[i] = static_cast(y.layout->strides[i]) / y.layout->dt.size; - shape[i] = static_cast(y.layout->shape[i]); - num *= shape[i]; - } +void rms_norm_bang_f16(RMSNormBangDescriptor_t desc, void *y, void const *x, void const *w, + void *stream){ auto queue = reinterpret_cast(stream); - if(ndim == 2){ - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = 
y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); + int n = static_cast(desc->n); + int d = static_cast(desc->d); + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon;//float - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - rmsNormUnionDim_2(queue, y.data, x.data, w.data, strideS_f, strideD_f, n, d, epsilon); - } - else if(ndim == 3){ - int strideS_f = x_stride[0]; - int strideD_f = y_stride[0]; - int strideS_m = x_stride[1]; - int strideD_m = y_stride[1]; - int middle = shape[1]; - int d = shape[ndim - 1]; - int n = num / d; - rmsNormUnionDim_3(queue, y.data, x.data, w.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d, epsilon); + // Get strides in terms of elements + int stride_y = static_cast(desc->stride_y); + int stride_x = static_cast(desc->stride_x); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); } else{ - int d = shape[ndim - 1]; - int n = num / d; - int *mlu_strideX, *mlu_strideY, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideX, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_strideY, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_strideX, x_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_strideY, y_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - rmsNorm_fp16(queue, y.data, x.data, w.data, mlu_strideX, mlu_strideY, mlu_shape, n, d, epsilon, ndim); - cnrtFree(mlu_strideX); - cnrtFree(mlu_strideY); - cnrtFree(mlu_shape); + auto w_ = reinterpret_cast(w); + rms_normUnion(queue, y_, x_, w_, stride_y, stride_x, epsilon, n, d); + } + +} +infiniopStatus_t bangRMSNorm(RMSNormBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream){ + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)){ + rms_norm_bang_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; } - -} + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.cc b/src/ops/rms_norm/bang/rms_norm_cnnl.cc deleted file mode 100644 index 9e80918d..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "rms_norm_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RMSNormBangDescriptor::RMSNormBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - cnnlTensorDescriptor_t yDesc, xDesc, wDesc; - cnnlCreateTensorDescriptor(&yDesc); - cnnlCreateTensorDescriptor(&xDesc); - cnnlCreateTensorDescriptor(&wDesc); - setCnnlTensor(yDesc, y.layout); - setCnnlTensor(xDesc, x.layout); - setCnnlTensor(wDesc, w.layout); - - cnnlFuseNormDescriptor_t opDesc; - cnnlCreateFuseNormDescriptor(&opDesc); 
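For reference while reviewing the tiled NRAM reduction in the BANG kernel above, here is a minimal scalar sketch (not part of the patch) of the per-row quantity it accumulates: sum of squares over the row, divided by the row length, plus epsilon, then the inverse square root used to scale input times weight. Plain float is assumed for clarity, whereas the kernel runs on half data with F16 or F32 weights.

```C
#include <math.h>

/* Scalar reference for one row of length d:
 * out[j] = in[j] * w[j] / sqrt(mean(in^2) + eps),
 * i.e. the value the tiled __bang_mul / tree-add / __bang_reduce_sum path computes. */
static void rms_norm_row_ref(float *out, const float *in, const float *w,
                             int d, float eps) {
    float sum_sq = 0.0f;
    for (int j = 0; j < d; ++j) {
        sum_sq += in[j] * in[j];          /* squared sum, as src = src * src then reduce */
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float) d + eps);
    for (int j = 0; j < d; ++j) {
        out[j] = in[j] * w[j] * inv_rms;  /* scale by weight and by 1/RMS */
    }
}
```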
- cnnlSetFuseNormDescriptor(opDesc, epsilon, 1.0, true, - false, false, false, false, - CNNL_DTYPE_HALF, CNNL_TRANSFORMER_RMSNORM); - - void *workspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t wsSize; - cnnlGetFuseNormWorkspaceSize(handle, opDesc, xDesc, &wsSize); - cnrtMalloc(&workspace, wsSize); - cnnlFuseNorm(handle, opDesc, xDesc, x.data, - wDesc, w.data, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace, wsSize, yDesc, y.data, nullptr, nullptr); - }); - - cnrtFree(workspace); - cnnlDestroyFuseNormDescriptor(opDesc); - cnnlDestroyTensorDescriptor(xDesc); - cnnlDestroyTensorDescriptor(yDesc); - cnnlDestroyTensorDescriptor(wDesc); -} diff --git a/src/ops/rms_norm/bang/rms_norm_cnnl.h b/src/ops/rms_norm/bang/rms_norm_cnnl.h deleted file mode 100644 index ab0972ce..00000000 --- a/src/ops/rms_norm/bang/rms_norm_cnnl.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CNNL_RMS_NORM_H__ -#define __CNNL_RMS_NORM_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RMSNormBangDescriptor { - Device device; - RMSNormBangDescriptor(Device device); -}; - -void rms_norm_cnnl_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); - -#endif// __CNNL_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cc b/src/ops/rms_norm/cpu/rms_norm_cpu.cc index 38e4581f..3152b5b9 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.cc +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cc @@ -3,25 +3,66 @@ #include "../../utils.h" #include -void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); +infiniopStatus_t cpuCreateRMSNormDescriptor(infiniopHandle_t, RMSNormCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, infiniopTensorDescriptor_t w_desc, float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; - auto n = y.layout->shape[0], - d = y.layout->shape[1]; + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; - auto stride_y = y.layout->strides[0]; - auto stride_x = x.layout->strides[0]; + *desc_ptr = new RMSNormCpuDescriptor{ + DevCpu, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetRMSNormWorkspaceSize(RMSNormCpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyRMSNormDescriptor(RMSNormCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +void rms_norm_cpu_f16(RMSNormCpuDescriptor_t desc, void *y, void const *x, void const *w) { + auto n = desc->n, d = desc->d; + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + auto epsilon = desc->epsilon; + + auto y_ptr = reinterpret_cast(y); + auto x_ptr = reinterpret_cast(x); + void const *w_ptr = w; + void const *w_ = nullptr; + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + w_ = reinterpret_cast(w_ptr); + } else { + w_ = reinterpret_cast(w_ptr); + } for (size_t i = 0; i < n; ++i) { - auto y_ = reinterpret_cast(reinterpret_cast(y.data) + i * 
stride_y); - auto x_ = reinterpret_cast(reinterpret_cast(x.data) + i * stride_x); - auto w_ = reinterpret_cast(w.data); + auto y_ = reinterpret_cast(y_ptr + i * stride_y); + auto x_ = reinterpret_cast(x_ptr + i * stride_x); auto sum_sq = 0.0f; for (size_t j = 0; j < d; ++j) { @@ -32,8 +73,27 @@ void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon) { auto k = std::pow(sum_sq / d + epsilon, -.5); for (size_t j = 0; j < d; ++j) { auto x__ = f16_to_f32(x_[j]); - auto w__ = f16_to_f32(w_[j]); + float w__ = 0.0f; + if (dtype_eq(w_datatype, F16)) { + w__ = f16_to_f32(static_cast(w_)[j]); + } else { + w__ = static_cast(w_)[j]; + } + y_[j] = f32_to_f16(k * x__ * w__); } } } + +infiniopStatus_t cpuRMSNorm(RMSNormCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + rms_norm_cpu_f16(desc, y, x, w); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.h b/src/ops/rms_norm/cpu/rms_norm_cpu.h index 9f598c55..ddf1de66 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.h +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.h @@ -5,8 +5,30 @@ struct RMSNormCpuDescriptor { Device device; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; }; -void rms_norm_cpu_f16(Tensor y, Tensor x, Tensor w, float epsilon); +typedef struct RMSNormCpuDescriptor *RMSNormCpuDescriptor_t; + +infiniopStatus_t cpuCreateRMSNormDescriptor(infiniopHandle_t handle, RMSNormCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, float epsilon); + +infiniopStatus_t cpuGetRMSNormWorkspaceSize(RMSNormCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRMSNorm(RMSNormCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cpuDestroyRMSNormDescriptor(RMSNormCpuDescriptor_t desc); #endif// __CPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/cuda/rms_norm.cc b/src/ops/rms_norm/cuda/rms_norm.cc new file mode 100644 index 00000000..92d34a99 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm.cc @@ -0,0 +1,46 @@ +#include "rms_norm.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, RMSNormCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormCudaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyRMSNormDescriptor(RMSNormCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/cuda/rms_norm.cu 
b/src/ops/rms_norm/cuda/rms_norm.cu index 88608baf..aa36f2f0 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cu +++ b/src/ops/rms_norm/cuda/rms_norm.cu @@ -5,13 +5,13 @@ #include // assert BLOCK_SIZE >= blockDim.x -template -static __global__ void rms_norm_padding( +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_padding( Tdata *__restrict__ o_, unsigned int const stride_y, Tdata const *__restrict__ x_, unsigned int const stride_x, - Tdata const *__restrict__ w_, + Wdata const *__restrict__ w_, float const epsilon) { auto y = o_ + blockIdx.x * stride_y + threadIdx.x; auto x = x_[blockIdx.x * stride_x + threadIdx.x]; @@ -19,24 +19,27 @@ static __global__ void rms_norm_padding( using BlockOp = cub::BlockReduce; __shared__ typename BlockOp::TempStorage temp_storage; +#ifdef ENABLE_SUGON_DCU + auto acc = BlockOp(temp_storage).Sum(x * x); +#else auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); - +#endif __shared__ Tdata rms; if (threadIdx.x == 0) { rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); } __syncthreads(); - *y = rms * x * w; + *y = rms * x * (Tdata) w; } -template -static __global__ void rms_norm_folding( +template +__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_folding( Tdata *__restrict__ y, unsigned int const stride_y, Tdata const *__restrict__ x, unsigned int const stride_x, - Tdata const *__restrict__ w, + Wdata const *__restrict__ w, float const epsilon, unsigned int const items_size) { y += blockIdx.x * stride_y; @@ -59,7 +62,11 @@ static __global__ void rms_norm_folding( { using BlockOp = cub::BlockReduce; __shared__ typename BlockOp::TempStorage temp_storage; +#ifdef ENABLE_SUGON_DCU + acc = BlockOp(temp_storage).Sum(squared); +#else acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); +#endif } __shared__ Tdata rms; @@ -76,13 +83,13 @@ static __global__ void rms_norm_folding( } } -template +template static __global__ void rms_norm_standard( Tdata *__restrict__ y_, unsigned int const stride_y, Tdata const *__restrict__ x_, unsigned int const stride_x, - Tdata const *__restrict__ w, + Wdata const *__restrict__ w, float const epsilon, unsigned int const d) { auto y = y_ + blockIdx.x * stride_y; @@ -112,41 +119,62 @@ static __global__ void rms_norm_standard( __syncthreads(); for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { - y[i] = rms * x[i] * w[i]; + y[i] = rms * x[i] * (Tdata) w[i]; } } - -void rms_norm_nv_gpu_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - ASSERT_EQ(y.layout->ndim, 2); - ASSERT_EQ(x.layout->ndim, 2); - ASSERT_EQ(w.layout->ndim, 1); - - auto n = y.layout->shape[0], - d = y.layout->shape[1]; - - ASSERT_EQ(x.layout->shape[0], n); - ASSERT_EQ(x.layout->shape[1], d); - ASSERT_EQ(w.layout->shape[0], d); - - auto y_ = reinterpret_cast(y.data); - auto x_ = reinterpret_cast(x.data); - auto w_ = reinterpret_cast(w.data); +void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; // Get strides in terms of elements - auto stride_y = y.layout->strides[0] / sizeof(half); - auto stride_x = x.layout->strides[0] / sizeof(half); + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; auto cuda_stream = reinterpret_cast(stream); unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); - if (items_per_thread == 1) { - rms_norm_padding - <<>>(y_, stride_y, x_, stride_x, 
w_, epsilon); - } else if (items_per_thread <= 16) { - rms_norm_folding - <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } } else { - rms_norm_standard - <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (cudaSetDevice(desc->device_id) != cudaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + rms_norm_nv_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; } + + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/rms_norm/cuda/rms_norm.cuh b/src/ops/rms_norm/cuda/rms_norm.cuh index 0d187c7c..683011f2 100644 --- a/src/ops/rms_norm/cuda/rms_norm.cuh +++ b/src/ops/rms_norm/cuda/rms_norm.cuh @@ -1,12 +1,40 @@ #ifndef __NV_GPU_RMS_NORM_H__ #define __NV_GPU_RMS_NORM_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" struct RMSNormCudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; + DT w_datatype; + float epsilon; }; -void rms_norm_nv_gpu_f16(Tensor y, Tensor x, Tensor w, float epsilon, void *stream); +typedef struct RMSNormCudaDescriptor *RMSNormCudaDescriptor_t; + +infiniopStatus_t cudaCreateRMSNormDescriptor(CudaHandle_t handle, + RMSNormCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t cudaGetRMSNormWorkspaceSize(RMSNormCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRMSNorm(RMSNormCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t cudaDestroyRMSNormDescriptor(RMSNormCudaDescriptor_t desc); + +void rms_norm_nv_gpu_f16(RMSNormCudaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); #endif// __NV_GPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/maca/rms_norm_maca.cc b/src/ops/rms_norm/maca/rms_norm_maca.cc new file mode 100644 index 00000000..054be969 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.cc @@ -0,0 +1,46 @@ +#include "rms_norm_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateRMSNormDescriptor(MacaHandle_t handle, RMSNormMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + 
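Illustrative only: the launch-path selection that rms_norm_nv_gpu_f16 above (and the MACA/MUSA ports later in the patch) applies per row length d. The enum and function names here are hypothetical; ROUND_UP_DIV and MAX_THREADS_PER_BLOCK correspond to the macros used in the CUDA code.

```C
/* Sketch of the kernel-path choice in rms_norm_nv_gpu_f16 (names illustrative). */
enum RmsNormKernelPath { PATH_PADDING, PATH_FOLDING, PATH_STANDARD };

static enum RmsNormKernelPath select_rms_norm_path(unsigned int d,
                                                   unsigned int max_threads_per_block) {
    /* ceil(d / max_threads_per_block), mirroring ROUND_UP_DIV */
    unsigned int items_per_thread =
        (d + max_threads_per_block - 1) / max_threads_per_block;
    if (items_per_thread == 1) {
        return PATH_PADDING;   /* one element per thread, block-wide cub reduce */
    } else if (items_per_thread <= 16) {
        return PATH_FOLDING;   /* cub::BlockLoad, per-thread squares, block reduce */
    }
    return PATH_STANDARD;      /* block-stride loop plus shared-memory tree reduction */
}
```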
return STATUS_BAD_TENSOR_SHAPE; + } + + int64_t stride_y = y_desc->strides[0]; + int64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormMacaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRMSNormWorkspaceSize(RMSNormMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroyRMSNormDescriptor(RMSNormMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/maca/rms_norm_maca.h b/src/ops/rms_norm/maca/rms_norm_maca.h new file mode 100644 index 00000000..f244ce97 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.h @@ -0,0 +1,40 @@ +#ifndef __MACA_RMS_NORM_H__ +#define __MACA_RMS_NORM_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RMSNormMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + int64_t stride_y; + int64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormMacaDescriptor *RMSNormMacaDescriptor_t; + +infiniopStatus_t macaCreateRMSNormDescriptor(MacaHandle_t handle, + RMSNormMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t macaGetRMSNormWorkspaceSize(RMSNormMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRMSNorm(RMSNormMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t macaDestroyRMSNormDescriptor(RMSNormMacaDescriptor_t desc); + +void rms_norm_mc_gpu_f16(RMSNormMacaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); + +#endif// __MACA_RMS_NORM_H__ diff --git a/src/ops/rms_norm/maca/rms_norm_maca.maca b/src/ops/rms_norm/maca/rms_norm_maca.maca new file mode 100644 index 00000000..3becfab6 --- /dev/null +++ b/src/ops/rms_norm/maca/rms_norm_maca.maca @@ -0,0 +1,173 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "rms_norm_maca.h" +#include +#include + +// assert BLOCK_SIZE >= blockDim.x +template +static __global__ void rms_norm_padding( + Tdata *__restrict__ o_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w_, + float const epsilon) { + auto y = o_ + blockIdx.x * stride_y + threadIdx.x; + auto x = x_[blockIdx.x * stride_x + threadIdx.x]; + auto w = w_[threadIdx.x]; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); + } + __syncthreads(); + + *y = rms * x * (Tdata) w; +} + +template +static __global__ void rms_norm_folding( + Tdata *__restrict__ y, + unsigned int const stride_y, + Tdata const *__restrict__ x, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const items_size) { + y += blockIdx.x * stride_y; + x += blockIdx.x * stride_x; + + float thread_data[ITEMS_PER_THREAD]; + { + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage temp_storage; + BlockOp(temp_storage).Load(x, thread_data, items_size, 0.f); + } + + float 
squared[ITEMS_PER_THREAD]; +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + squared[i] = thread_data[i] * thread_data[i]; + } + + float acc; + { + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(items_size) + epsilon)); + } + __syncthreads(); + +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + if (auto j = i + threadIdx.x * ITEMS_PER_THREAD; j < items_size) { + y[j] = Tdata(float(rms) * float(thread_data[i]) * float(w[j])); + } + } +} + +template +static __global__ void rms_norm_standard( + Tdata *__restrict__ y_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const d) { + auto y = y_ + blockIdx.x * stride_y; + auto x = x_ + blockIdx.x * stride_x; + + __shared__ float partial_sum[BLOCK_SIZE]; + + float sum = 0.0f; + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + sum += float(x[i]) * float(x[i]); + } + + partial_sum[threadIdx.x] = sum; + __syncthreads(); + for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + float row_sum = partial_sum[0]; + rms = Tdata(rsqrtf(row_sum / float(d) + epsilon)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + y[i] = rms * x[i] * (Tdata) w[i]; + } +} + +void rms_norm_mc_gpu_f16(RMSNormMacaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; + + // Get strides in terms of elements + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + + auto maca_stream = reinterpret_cast(stream); + unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } else { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t macaRMSNorm(RMSNormMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream) { + if (hcSetDevice(desc->device_id) != hcSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + rms_norm_mc_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/musa/rms_norm_musa.cc b/src/ops/rms_norm/musa/rms_norm_musa.cc new file mode 100644 index 00000000..99c22c6e --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.cc @@ -0,0 +1,46 @@ +#include "rms_norm_musa.h" +#include 
"../../utils.h" +#include "../../../devices/musa/common_musa.h" + +infiniopStatus_t musaCreateRMSNormDescriptor(MusaHandle_t handle, RMSNormMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + if (y_desc->ndim != 2 || x_desc->ndim != 2 || w_desc->ndim != 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto n = y_desc->shape[0], + d = y_desc->shape[1]; + + if (x_desc->shape[0] != n || x_desc->shape[1] != d || w_desc->shape[0] != d) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t stride_y = y_desc->strides[0]; + uint64_t stride_x = x_desc->strides[0]; + auto w_datatype = w_desc->dt; + *desc_ptr = new RMSNormMusaDescriptor{ + handle->device, + handle->device_id, + y_desc->dt, + n, + d, + stride_y, + stride_x, + w_datatype, + epsilon}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRMSNormWorkspaceSize(RMSNormMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroyRMSNormDescriptor(RMSNormMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rms_norm/musa/rms_norm_musa.h b/src/ops/rms_norm/musa/rms_norm_musa.h new file mode 100644 index 00000000..ee8dfb72 --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.h @@ -0,0 +1,40 @@ +#ifndef __MUSA_RMS_NORM_H__ +#define __MUSA_RMS_NORM_H__ + +#include "operators.h" +#include "../../../devices/musa/musa_handle.h" + +struct RMSNormMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t n; + uint64_t d; + uint64_t stride_y; + uint64_t stride_x; + DT w_datatype; + float epsilon; +}; + +typedef struct RMSNormMusaDescriptor *RMSNormMusaDescriptor_t; + +infiniopStatus_t musaCreateRMSNormDescriptor(MusaHandle_t handle, + RMSNormMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon); + +infiniopStatus_t musaGetRMSNormWorkspaceSize(RMSNormMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRMSNorm(RMSNormMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, void const *w, + void *stream); + +infiniopStatus_t musaDestroyRMSNormDescriptor(RMSNormMusaDescriptor_t desc); + +void rms_norm_mt_gpu_f16(RMSNormMusaDescriptor_t desc, void *y, void const *x, void const *w, float epsilon, void *stream); + +#endif// __MT_GPU_RMS_NORM_H__ diff --git a/src/ops/rms_norm/musa/rms_norm_musa.mu b/src/ops/rms_norm/musa/rms_norm_musa.mu new file mode 100644 index 00000000..d80bdac9 --- /dev/null +++ b/src/ops/rms_norm/musa/rms_norm_musa.mu @@ -0,0 +1,177 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "rms_norm_musa.h" +#include +#include + +// assert BLOCK_SIZE >= blockDim.x +template +static __global__ void rms_norm_padding( + Tdata *__restrict__ o_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w_, + float const epsilon) { + auto y = o_ + blockIdx.x * stride_y + threadIdx.x; + auto x = x_[blockIdx.x * stride_x + threadIdx.x]; + auto w = w_[threadIdx.x]; + + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum()); + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon)); + } + __syncthreads(); + + *y = rms * x * (Tdata)w; +} + +template 
+static __global__ void rms_norm_folding( + Tdata *__restrict__ y, + unsigned int const stride_y, + Tdata const *__restrict__ x, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const items_size) { + y += blockIdx.x * stride_y; + x += blockIdx.x * stride_x; + + float thread_data[ITEMS_PER_THREAD]; + { + using BlockOp = cub::BlockLoad; + __shared__ typename BlockOp::TempStorage temp_storage; + BlockOp(temp_storage).Load(x, thread_data, items_size, 0.f); + } + + float squared[ITEMS_PER_THREAD]; +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + squared[i] = thread_data[i] * thread_data[i]; + } + + float acc; + { + using BlockOp = cub::BlockReduce; + __shared__ typename BlockOp::TempStorage temp_storage; + acc = BlockOp(temp_storage).Reduce(squared, cub::Sum()); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + rms = Tdata(rsqrtf(acc / float(items_size) + epsilon)); + } + __syncthreads(); + +#pragma unroll + for (unsigned int i = 0; i < ITEMS_PER_THREAD; ++i) { + if (auto j = i + threadIdx.x * ITEMS_PER_THREAD; j < items_size) { + y[j] = Tdata(float(rms) * float(thread_data[i]) * float(w[j])); + } + } +} + +template +static __global__ void rms_norm_standard( + Tdata *__restrict__ y_, + unsigned int const stride_y, + Tdata const *__restrict__ x_, + unsigned int const stride_x, + Wdata const *__restrict__ w, + float const epsilon, + unsigned int const d) { + auto y = y_ + blockIdx.x * stride_y; + auto x = x_ + blockIdx.x * stride_x; + + __shared__ float partial_sum[BLOCK_SIZE]; + + float sum = 0.0f; + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + sum += float(x[i]) * float(x[i]); + } + + partial_sum[threadIdx.x] = sum; + __syncthreads(); + for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + __shared__ Tdata rms; + if (threadIdx.x == 0) { + float row_sum = partial_sum[0]; + rms = Tdata(rsqrtf(row_sum / float(d) + epsilon)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < d; i += BLOCK_SIZE) { + y[i] = rms * x[i] * (Tdata)w[i]; + } +} + +void rms_norm_mt_gpu_f16(RMSNormMusaDescriptor_t desc, void *y, void const *x, void const *w, void *stream) { + auto n = desc->n, d = desc->d; + auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto epsilon = desc->epsilon; + + // Get strides in terms of elements + auto stride_y = desc->stride_y; + auto stride_x = desc->stride_x; + + auto musa_stream = reinterpret_cast(stream); + unsigned int items_per_thread = ROUND_UP_DIV(d, MAX_THREADS_PER_BLOCK); + auto w_datatype = desc->w_datatype; + if (dtype_eq(w_datatype, F16)) { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } else { + auto w_ = reinterpret_cast(w); + if (items_per_thread == 1) { + rms_norm_padding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon); + } else if (items_per_thread <= 16) { + rms_norm_folding + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } else { + rms_norm_standard + <<>>(y_, stride_y, x_, stride_x, w_, epsilon, d); + } + } +} + +infiniopStatus_t musaRMSNorm(RMSNormMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *y, void const *x, 
void const *w, + void *stream){ + int current_device; + if (musaGetDevice(¤t_device) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (current_device != desc->device_id && musaSetDevice(desc->device_id) != musaSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)){ + rms_norm_mt_gpu_f16(desc, y, x, w, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rms_norm/operator.cc b/src/ops/rms_norm/operator.cc index fae458d9..317e7ef2 100644 --- a/src/ops/rms_norm/operator.cc +++ b/src/ops/rms_norm/operator.cc @@ -1,85 +1,187 @@ #include "../utils.h" +#include "operators.h" #include "ops/rms_norm/rms_norm.h" #ifdef ENABLE_CPU #include "cpu/rms_norm_cpu.h" #endif #ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" #include "cuda/rms_norm.cuh" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/rms_norm_cnnl.h" +#include "../../devices/bang/bang_handle.h" #include "bang/rms_norm_bang.h" #endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rms_norm_aclnn.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rms_norm_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/rms_norm_musa.h" +#endif -struct RMSNormDescriptor { - Device device; -}; +__C infiniopStatus_t infiniopCreateRMSNormDescriptor( + infiniopHandle_t handle, + infiniopRMSNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + float epsilon) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle, + (RMSNormAclnnDescriptor_t *) desc_ptr, + y_desc, + x_desc, + w_desc, + epsilon); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C void *createRMSNormDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - return (RMSNormDescriptor *) (new RMSNormCpuDescriptor{device}); + return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (RMSNormDescriptor *) (new RMSNormCudaDescriptor{device}); + case DevNvGpu: { + return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - return (RMSNormDescriptor *) (new RMSNormBangDescriptor(device)); + return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size); + } +#endif 
+#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; } -__C void destroyRMSNormDescriptor(RMSNormDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void const *w, void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (RMSNormCpuDescriptor *) (descriptor); - break; + return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (RMSNormCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - delete (RMSNormBangDescriptor *) (descriptor); - break; + return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc, + workspace, + workspace_size, + y, + x, + w, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void rmsNorm(RMSNormDescriptor *descriptor, Tensor y, Tensor x, Tensor w, float epsilon, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - rms_norm_cpu_f16(y, x, w, epsilon); - break; + return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - rms_norm_nv_gpu_f16(y, x, w, epsilon, stream); - break; + case DevNvGpu: { + return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // Using BANGC Kernel - rms_norm_bang_f16(y, x, w, epsilon, stream); - // rms_norm_cnnl_f16(y, x, w, epsilon, stream); - break; -#endif - default: - PANIC(UnsupportedDevice); + case DevCambriconMlu: { + return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc); + } +#endif } + return STATUS_BAD_DEVICE; } diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding.cc b/src/ops/rotary_embedding/ascend/rotary_embedding.cc new file mode 100644 index 00000000..5908af2a --- 
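A hypothetical end-to-end driver for the exported RMS norm C API in operator.cc, showing how the create / workspace-query / compute / destroy calls chain together. The handle, tensor descriptors and device buffers are assumed to exist already, the 1e-5 epsilon is an example value, and real code would check every returned status.

```C
#include <stddef.h>
#include "ops/rms_norm/rms_norm.h" /* same public header operator.cc includes */

/* Hypothetical helper, not part of the patch. */
static infiniopStatus_t run_rms_norm(infiniopHandle_t handle,
                                     infiniopTensorDescriptor_t y_desc,
                                     infiniopTensorDescriptor_t x_desc,
                                     infiniopTensorDescriptor_t w_desc,
                                     void *y, void const *x, void const *w,
                                     void *stream) {
    infiniopRMSNormDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateRMSNormDescriptor(
        handle, &desc, y_desc, x_desc, w_desc, 1e-5f /* epsilon, example value */);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    uint64_t workspace_size = 0;
    infiniopGetRMSNormWorkspaceSize(desc, &workspace_size);
    void *workspace = NULL; /* allocate workspace_size bytes on the device if non-zero */

    status = infiniopRMSNorm(desc, workspace, workspace_size, y, x, w, stream);
    infiniopDestroyRMSNormDescriptor(desc);
    return status;
}
```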
/dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding.cc @@ -0,0 +1,99 @@ +#include "rotary_embedding.h" +#include "../../utils.h" + +infiniopStatus_t ascendCreateRoPEDescriptor(AscendHandle_t handle, + RoPEAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + auto seq_len = t->shape[0]; + auto nh = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + auto stride_seq = t->strides[0]; + auto stride_head = t->strides[1]; + + + if (dim % 2 != 0 || dim <= 32) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + aclDataType dt; + if (dtype_eq(t->dt, F16)) { + dt = aclDataType::ACL_FLOAT16; + } else if (dtype_eq(t->dt, F32)) { + dt = aclDataType::ACL_FLOAT; + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEAscendDescriptor{ + handle->device, + handle->device_id, + dt, + seq_len, + nh, + dim, + total_seq_len, + stride_seq, + stride_head}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendGetRoPEWorkspaceSize(RoPEAscendDescriptor_t desc, + uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendRoPE(RoPEAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + auto nt = static_cast(desc->seq_len); + auto nh = static_cast(desc->nhead); + auto dh = static_cast(desc->dim); + auto stt = static_cast(desc->stride_seq); + auto sth = static_cast(desc->stride_head); + + // Set device + aclrtSetDevice(desc->device_id); + + return rope_kernel_do(t, (void *) pos_ids, (void *) sin_table, (void *) cos_table, + nt, nh, dh, stt, sth, desc->dt, stream); +} + +infiniopStatus_t ascendDestroyRoPEDescriptor(RoPEAscendDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding.h b/src/ops/rotary_embedding/ascend/rotary_embedding.h new file mode 100644 index 00000000..679b238a --- /dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding.h @@ -0,0 +1,46 @@ +#ifndef __ASCEND_ROTARY_EMBEDDING_H__ +#define __ASCEND_ROTARY_EMBEDDING_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "operators.h" + +struct RoPEAscendDescriptor { + Device device; + int device_id; + aclDataType dt; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t stride_seq; + int64_t stride_head; +}; + +typedef struct RoPEAscendDescriptor *RoPEAscendDescriptor_t; + +infiniopStatus_t ascendCreateRoPEDescriptor(AscendHandle_t handle, + RoPEAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t ascendGetRoPEWorkspaceSize(RoPEAscendDescriptor_t desc, + uint64_t *size); + 
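For reference while reading the Ascend kernel added below, a scalar sketch (not part of the patch) of the pairwise rotation that rope_kernel_do applies to one (token, head) slice of length dh at position p. Float is used for readability; the kernel itself operates on half data and realizes the same arithmetic with GatherMask over odd/even lanes.

```C
#include <stdint.h>

/* Rotate each adjacent pair (t[k], t[k+1]) by the cached sin/cos row of position p. */
static void rope_head_ref(float *t, const float *sin_table, const float *cos_table,
                          uint64_t p, int dh) {
    const float *sinp = sin_table + p * (uint64_t) dh;
    const float *cosp = cos_table + p * (uint64_t) dh;
    for (int k = 0; k < dh; k += 2) {
        float a = t[k];      /* first element of the pair  */
        float b = t[k + 1];  /* second element of the pair */
        t[k]     = a * cosp[k]     - b * sinp[k];
        t[k + 1] = a * sinp[k + 1] + b * cosp[k + 1];
    }
}
```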
+infiniopStatus_t ascendRoPE(RoPEAscendDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t ascendDestroyRoPEDescriptor(RoPEAscendDescriptor_t desc); + +extern "C" infiniopStatus_t rope_kernel_do(void *t, void *pos, void *sin, void *cos, + int32_t nt, int32_t nh, int32_t dh, int32_t stt, + int32_t sth, int dtype, void *stream); + +#endif diff --git a/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp b/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp new file mode 100644 index 00000000..989b1422 --- /dev/null +++ b/src/ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp @@ -0,0 +1,230 @@ +#include "kernel_operator.h" +#include "../../../../include/status.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; + +template class RoPE { +public: + __aicore__ inline RoPE() {} + // Init op + // pos position vector + // t input tensor + // input tensor shape [nt, nh, dh] + // make block_num = nh, tile_len = dh + __aicore__ inline void Init(GM_ADDR t, GM_ADDR pos, GM_ADDR sin, + GM_ADDR cos, int32_t nt, int32_t nh, + int32_t dh, int32_t stt, int32_t sth); + __aicore__ inline void Process(); + +private: + // Copy a tile into UB + __aicore__ inline void CopyIn(int32_t i); + __aicore__ inline void Compute(int32_t i); + __aicore__ inline void CopyOut(int32_t i); + +private: + TPipe pipe; + TQue inQue; + TQue sinQue; + TQue cosQue; + TQue outQue; + TBuf tmpOddBuf; + TBuf tmpEvenBuf; + TBuf tmpBuf; + TBuf tmp2Buf; + TBuf tmp3Buf; + TBuf tmp4Buf; + TBuf tmpSinBuf; + TBuf tmpCosBuf; + + GlobalTensor xGm; + GlobalTensor pGm; + GlobalTensor sinGm; + GlobalTensor cosGm; + GlobalTensor oGm; + + // TODO: Change to uint64_t + uint32_t _block_idx; + uint32_t _tile_len; + + // t[nt, nh, dh] + // nt num of tokens + // nh num of heads + // dh dimension of each head + int32_t nt; + int32_t nh; + int32_t dh; + int32_t sth; + int32_t stt; +}; + +template +__aicore__ inline void RoPE::Init(GM_ADDR t, GM_ADDR pos, GM_ADDR sin, + GM_ADDR cos, int32_t nt, int32_t nh, + int32_t dh, int32_t stt, int32_t sth) { + this->nt = nt; + this->nh = nh; + this->dh = dh; + this->stt = stt; + this->sth = sth; + + _block_idx = GetBlockIdx(); + _tile_len = dh; + + // Init global buffer + xGm.SetGlobalBuffer((__gm__ T *) t); + pGm.SetGlobalBuffer((__gm__ uint64_t *) pos); + sinGm.SetGlobalBuffer((__gm__ float *) sin); + cosGm.SetGlobalBuffer((__gm__ float *) cos); + oGm.SetGlobalBuffer((__gm__ T *) t); + + // Init Queue buffer + pipe.InitBuffer(inQue, BUFFER_NUM, _tile_len * sizeof(T)); + pipe.InitBuffer(outQue, BUFFER_NUM, _tile_len * sizeof(T)); + pipe.InitBuffer(sinQue, BUFFER_NUM, _tile_len * sizeof(float)); + pipe.InitBuffer(cosQue, BUFFER_NUM, _tile_len * sizeof(float)); + pipe.InitBuffer(tmpOddBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpEvenBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpBuf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp2Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp3Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmp4Buf, _tile_len / 2 * sizeof(T)); + pipe.InitBuffer(tmpSinBuf, _tile_len * sizeof(T)); + pipe.InitBuffer(tmpCosBuf, _tile_len * sizeof(T)); +} + +template +__aicore__ inline void RoPE::CopyIn(int32_t i) { + LocalTensor inputUb = inQue.AllocTensor(); + LocalTensor sinUb = sinQue.AllocTensor(); + LocalTensor cosUb = cosQue.AllocTensor(); + // Get idx of current tile in total input + auto idx = i * stt + 
_block_idx * sth; + // Copy tile current tile into UB + DataCopy(inputUb, xGm[idx], _tile_len); + // Copy sin cos tile + auto pos_idx = pGm(i); + // Cast sin cos to T type + DataCopy(sinUb, sinGm[pos_idx * dh], _tile_len); + DataCopy(cosUb, cosGm[pos_idx * dh], _tile_len); + // Push in operands + inQue.EnQue(inputUb); + sinQue.EnQue(sinUb); + cosQue.EnQue(cosUb); +} + +template +__aicore__ inline void RoPE::Compute(int32_t i) { + LocalTensor inputUb = inQue.DeQue(); + LocalTensor sinUb = sinQue.DeQue(); + LocalTensor cosUb = cosQue.DeQue(); + LocalTensor outUb = outQue.AllocTensor(); + + // Choose odd and even position + LocalTensor tmpOdd = tmpOddBuf.Get(); + LocalTensor tmpEven = tmpEvenBuf.Get(); + LocalTensor tmpUb = tmpBuf.Get(); + LocalTensor tmp2Ub = tmp2Buf.Get(); + LocalTensor tmp3Ub = tmp3Buf.Get(); + LocalTensor tmp4Ub = tmp4Buf.Get(); + LocalTensor tmpSinUb = tmpSinBuf.Get(); + LocalTensor tmpCosUb = tmpCosBuf.Get(); + + // Cast from float to T + Cast(tmpSinUb, sinUb, RoundMode::CAST_FLOOR, _tile_len); + Cast(tmpCosUb, cosUb, RoundMode::CAST_FLOOR, _tile_len); + PipeBarrier(); + + // Select odd & even numbers + uint64_t rsvdCnt = 0; + GatherMaskParams gMaskParams = { + 1, + static_cast((_tile_len * sizeof(T) + 255) / 256), + 8, + 8, + }; + GatherMask(tmpOdd, inputUb, 1, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmpEven, inputUb, 2, false, 0, gMaskParams, rsvdCnt); + + // Calc odd position + GatherMask(tmpUb, tmpCosUb, 1, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmp2Ub, tmpSinUb, 1, false, 0, gMaskParams, rsvdCnt); + PipeBarrier(); + tmpUb = tmpOdd * tmpUb; + tmp2Ub = tmpEven * tmp2Ub; + PipeBarrier(); + tmpUb = tmpUb - tmp2Ub; + + // Calc even position + GatherMask(tmp3Ub, tmpSinUb, 2, false, 0, gMaskParams, rsvdCnt); + GatherMask(tmp4Ub, tmpCosUb, 2, false, 0, gMaskParams, rsvdCnt); + PipeBarrier(); + tmp3Ub = tmpOdd * tmp3Ub; + tmp4Ub = tmpEven * tmp4Ub; + PipeBarrier(); + tmp3Ub = tmp3Ub + tmp4Ub; + + // Scatter + // Scatter(outUb, tmpUb, tmpOffsetUb, (uint32_t)sizeof(T), tile_len / 2); + for (uint32_t i = 0; i < _tile_len / 2; i += 1) { + outUb(i * 2 + 1) = tmp3Ub(i); + outUb(i * 2) = tmpUb(i); + } + + outQue.EnQue(outUb); + inQue.FreeTensor(inputUb); + sinQue.FreeTensor(sinUb); + cosQue.FreeTensor(cosUb); +} + +template +__aicore__ inline void RoPE::CopyOut(int32_t i) { + LocalTensor outUb = outQue.DeQue(); + auto idx = i * stt + _block_idx * sth; + // DataCopy(oGm[idx], outUb, _tile_len); + DataCopyExtParams dcep = { + 1, + static_cast(_tile_len * sizeof(T)), + 0, 0, 0}; + DataCopyPad(oGm[idx], outUb, dcep); + outQue.FreeTensor(outUb); +} + +template __aicore__ inline void RoPE::Process() { + + for (int32_t i = 0; i < nt; ++i) { + CopyIn(i); + Compute(i); + CopyOut(i); + } +} + +// Kernel func +__global__ __aicore__ void rope_kernel_fp16(GM_ADDR t, GM_ADDR pos, + GM_ADDR sin, GM_ADDR cos, + int32_t nt, int32_t nh, + int32_t dh, int32_t stt, + int32_t sth) { + RoPE op; + op.Init(t, pos, sin, cos, nt, nh, dh, stt, sth); + op.Process(); +} + +extern "C" infiniopStatus_t rope_kernel_do(void *t, void *pos, void *sin, void *cos, + int32_t nt, int32_t nh, int32_t dh, + int32_t stt, int32_t sth, + int dtype, void *stream) { + switch (dtype) { + case 0:// ACL_FLOAT32 + // TODO: + break; + case 1:// ACL_FLOAT16 + rope_kernel_fp16<<>>(t, pos, sin, cos, nt, nh, dh, stt, sth); + return STATUS_SUCCESS; + default: + break; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc 
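Illustrative host-side picture (not part of the patch) of how the kernel above partitions work: one AI core per head, each core iterating over all tokens of its head and addressing its tile with the same index arithmetic as CopyIn/CopyOut. The serial loops and names are for exposition only.

```C
#include <stdint.h>

/* Layout sketch: block_idx plays the role of GetBlockIdx(), the inner loop the
 * role of Process(); each tile is dh contiguous elements at t + tile_offset. */
static void rope_layout_sketch(int nt, int nh, int dh, int stt, int sth,
                               const uint64_t *pos_ids) {
    for (int block_idx = 0; block_idx < nh; ++block_idx) {
        for (int i = 0; i < nt; ++i) {
            int tile_offset = i * stt + block_idx * sth;            /* CopyIn/CopyOut idx */
            uint64_t table_offset = pos_ids[i] * (uint64_t) dh;     /* sin/cos row start  */
            /* dh elements are staged from t + tile_offset, rotated with the
               sin/cos rows at table_offset, then written back in place. */
            (void) tile_offset;
            (void) table_offset;
        }
    }
}
```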
b/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc new file mode 100644 index 00000000..c5c51449 --- /dev/null +++ b/src/ops/rotary_embedding/bang/rotary_embedding_bang.cc @@ -0,0 +1,74 @@ +#include "rotary_embedding_bang.h" +#include "../../utils.h" + + +infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle, + RoPEBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + int stride_0 = static_cast(t->strides[0]); + int stride_1 = static_cast(t->strides[1]); + *desc_ptr = new RoPEBangDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + stride_0, stride_1}; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.h b/src/ops/rotary_embedding/bang/rotary_embedding_bang.h new file mode 100644 index 00000000..4ede6d33 --- /dev/null +++ b/src/ops/rotary_embedding/bang/rotary_embedding_bang.h @@ -0,0 +1,44 @@ +#ifndef __BANG_ROTARY_EMBEDDING_H__ +#define __BANG_ROTARY_EMBEDDING_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct RoPEBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int stride_0; + int stride_1; +}; + + +typedef struct RoPEBangDescriptor *RoPEBangDescriptor_t; + +infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle, + RoPEBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size); + +infiniopStatus_t bangRoPE(RoPEBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc); + + +#endif// __BANG_RMS_NORM_H__ diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu b/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu new file mode 100644 index 00000000..b7d3658e --- /dev/null +++ 
b/src/ops/rotary_embedding/bang/rotary_embedding_bang.mlu @@ -0,0 +1,451 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "rotary_embedding_bang.h" +#include "../../../devices/bang/common_bang.h" +#include "../../utils.h" + +const int SRC_MAX_SIZE = 1024 * 8;//8 = 256/32 +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void RoPE(T *destination, uint64_t const *pos_ids, float const *sin_table, float const *cos_table, int stride_0, int stride_1, int nt, int nh, int dimsize) {//axis=-1 + + const int maxNum = SRC_MAX_SIZE/sizeof(float); + + int othersize = nt * nh; + + int segsize = sizeof(T); + int srcStrideL = 2 * sizeof(T); + int destStrideL = 1 * sizeof(T); + + int srcStrideW = 1 * sizeof(T); + int destStrideW = 2 * sizeof(T); + + int segsize_table = sizeof(float); + int srcStrideL_table = 2 * sizeof(float); + int destStrideL_table = 1 * sizeof(float); + + + int remainT = othersize % taskDim; + int stepEasy = (othersize - remainT) / taskDim; + int stepHard = stepEasy + 1; + int step = (taskId < remainT ? stepHard : stepEasy); + int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); + + if(nt < maxNum){ + char *nram_buffer1 = nram_buffer + nt * sizeof(uint64_t); + uint64_t *srcP = (uint64_t *)nram_buffer;//[nt] + + __memcpy(srcP, pos_ids, nt * sizeof(uint64_t), GDRAM2NRAM); + + if(dimsize >= maxNum){ + int dSize = 2 * maxNum; + char *nram_buffer2 = nram_buffer1 + (2 * dSize + 14 * maxNum) * sizeof(float); + float *srcSin = (float *)nram_buffer1;//[dSize] + float *srcCos = srcSin + dSize;//[dSize] + float *sin0 = srcCos + dSize;//[3 * maxNum] + float *cos0 = sin0 + 3 * maxNum;//[3 * maxNum] + float *sin1 = cos0 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *cos1 = sin1 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *tmpa = cos1 + 3 * maxNum;//[maxNum] + float *tmpb = tmpa + maxNum;//[maxNum] + + + T *srca = (T *)nram_buffer2;//[maxNum] + T *srcb = srca + maxNum;//[3 * maxNum] + T *src = srcb + 3 * maxNum;//[dSize] + + + int segnum = 2 * maxNum; + + int remain = dimsize % dSize; + int repeat = (dimsize - remain) / dSize; + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + int index = srcP[(indi % nt)] * dimsize; + for(int s = 0; s < repeat; s++){ + __memcpy(srcSin, sin_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + s * dSize, dSize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, 
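/* srca/srcb hold the even- and odd-indexed halves of the tile (gathered by the
   stride-2 __memcpy calls above) and are promoted to float in tmpa/tmpb. The
   rotation being completed here mirrors the CPU reference rotary_embedding_cpu_f16:
       out[2k]   = x[2k] * cos[2k]   - x[2k+1] * sin[2k]
       out[2k+1] = x[2k] * sin[2k+1] + x[2k+1] * cos[2k+1]
   with the even results accumulated in cos0 and the odd results in cos1, before
   both are cast back to half and re-interleaved into src for the copy to GDRAM. */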
sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd + s * dSize, src, dSize * sizeof(T), NRAM2GDRAM); + + + } + if(remain){ + __memcpy(srcSin, sin_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + repeat * dSize, remain * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(destination + indd + repeat * dSize, src, remain * sizeof(T), NRAM2GDRAM); + + + } + } + + } + else{ + + int segnum = dimsize; + int dh = dimsize / 2; + + char *nram_buffer2 = nram_buffer1 + (2 * dimsize + 14 * dh) * sizeof(float); + float *srcSin = (float *)nram_buffer1;//[dimsize] + float *srcCos = srcSin + dimsize;//[dimsize] + float *sin0 = srcCos + dimsize;//[dh] + float *cos0 = sin0 + 3 * dh;//[dh] + float *sin1 = cos0 + 3 * dh;//[dh] + float *cos1 = sin1 + 3 * dh;//[dh] + float *tmpa = cos1 + 3 * dh;//[dh] + float *tmpb = tmpa + dh;//[dh] + + T *srca = (T *)nram_buffer2;//[dh] + T *srcb = srca + dh;//[dh] + T *src = srcb + 3 * dh;//[dimsize] + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + + int index = srcP[(indi % nt)] * dimsize; + + __memcpy(srcSin, sin_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(srcCos, cos_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(src, destination + indd, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + + __bang_half2float(tmpa, srca, dh); + __bang_half2float(tmpb, srcb, dh); + + + + __bang_mul(cos0, tmpa, cos0, dh); + __bang_mul(sin0, tmpb, sin0, dh); + __bang_sub(cos0, cos0, sin0, 
dh);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, dh); + __bang_mul(cos1, tmpb, cos1, dh); + __bang_add(cos1, sin1, cos1, dh); + + __bang_float2half_dn(srca, cos0, dh); + __bang_float2half_dn(srcb, cos1, dh); + + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + + + + } + + } + } + else{ + + if(dimsize >= maxNum){ + int dSize = 2 * maxNum; + char *nram_buffer1 = nram_buffer + (2 * dSize + 14 * maxNum) * sizeof(float); + float *srcSin = (float *)nram_buffer;//[dSize] + float *srcCos = srcSin + dSize;//[dSize] + float *sin0 = srcCos + dSize;//[3 *maxNum] + float *cos0 = sin0 + 3 * maxNum;//[3 * maxNum] + float *sin1 = cos0 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *cos1 = sin1 + 3 * maxNum;//[3 * maxNum],需要多申请内存,方便后面数据移动 + float *tmpa = cos1 + 3 * maxNum;//[maxNum] + float *tmpb = tmpa + maxNum;//[maxNum] + + + T *srca = (T *)nram_buffer1;//[maxNum] + T *srcb = srca + maxNum;//[3 * maxNum] + T *src = srcb + 3 * maxNum;//[dSize] + + + int segnum = 2 * maxNum; + + int remain = dimsize % dSize; + int repeat = (dimsize - remain) / dSize; + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + int index = pos_ids[(indi % nt)] * dimsize; + for(int s = 0; s < repeat; s++){ + __memcpy(srcSin, sin_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + s * dSize, dSize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + s * dSize, dSize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd + s * dSize, src, dSize * sizeof(T), NRAM2GDRAM); + + + } + if(remain){ + __memcpy(srcSin, sin_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(srcCos, cos_table + index + repeat * dSize, remain * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + __memcpy(src, destination + indd + repeat * dSize, remain * sizeof(T), 
GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, remain); + + __bang_half2float(tmpa, srca, maxNum); + __bang_half2float(tmpb, srcb, maxNum); + + __bang_mul(cos0, tmpa, cos0, maxNum); + __bang_mul(sin0, tmpb, sin0, maxNum); + __bang_sub(cos0, cos0, sin0, maxNum);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, maxNum); + __bang_mul(cos1, tmpb, cos1, maxNum); + __bang_add(cos1, sin1, cos1, maxNum); + + __bang_float2half_dn(srca, cos0, maxNum); + __bang_float2half_dn(srcb, cos1, maxNum); + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, remain); + __memcpy(destination + indd + repeat * dSize, src, remain * sizeof(T), NRAM2GDRAM); + + + } + } + + } + else{ + + int segnum = dimsize; + int dh = dimsize / 2; + + char *nram_buffer1 = nram_buffer + (2 * dimsize + 14 * dh) * sizeof(float); + float *srcSin = (float *)nram_buffer;//[dimsize] + float *srcCos = srcSin + dimsize;//[dimsize] + float *sin0 = srcCos + dimsize;//[dh] + float *cos0 = sin0 + 3 * dh;//[dh] + float *sin1 = cos0 + 3 * dh;//[dh] + float *cos1 = sin1 + 3 * dh;//[dh] + float *tmpa = cos1 + 3 * dh;//[dh] + float *tmpb = tmpa + dh;//[dh] + + T *srca = (T *)nram_buffer1;//[dh] + T *srcb = srca + dh;//[dh] + T *src = srcb + 3 * dh;//[dimsize] + + for(int i = indStart; i < indStart + step; i++){ + int indd = 0; + int indi = i; + indd += (indi % nh) * stride_1; + indi /= nh; + indd += (indi % nt) * stride_0; + + int index = pos_ids[(indi % nt)] * dimsize; + + __memcpy(srcSin, sin_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(sin0, srcSin, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(sin1, srcSin + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(srcCos, cos_table + index, dimsize * sizeof(float), GDRAM2NRAM); + __memcpy(cos0, srcCos, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + __memcpy(cos1, srcCos + 1, segsize_table, NRAM2NRAM, destStrideL_table, srcStrideL_table, segnum); + + + + __memcpy(src, destination + indd, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(srca, src, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + __memcpy(srcb, src + 1, segsize, NRAM2NRAM, destStrideL, srcStrideL, segnum); + + + __bang_half2float(tmpa, srca, dh); + __bang_half2float(tmpb, srcb, dh); + + + + __bang_mul(cos0, tmpa, cos0, dh); + __bang_mul(sin0, tmpb, sin0, dh); + __bang_sub(cos0, cos0, sin0, dh);//结果临时存储在cos0上 + + __bang_mul(sin1, tmpa, sin1, dh); + __bang_mul(cos1, tmpb, cos1, dh); + __bang_add(cos1, sin1, cos1, dh); + + __bang_float2half_dn(srca, cos0, dh); + __bang_float2half_dn(srcb, cos1, dh); + + + __memcpy(src, srca, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(src + 1, srcb, segsize, NRAM2NRAM, destStrideW, srcStrideW, segnum); + __memcpy(destination + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + + + + } + + } + } + +} + +template +void RoPEUnion(cnrtQueue_t queue, void *destination, void const *pos_ids, void const *sin_table, void const *cos_table, int stride_0, int stride_1, int nt, int nh, int dimsize) { + + auto pos_ = reinterpret_cast(pos_ids); + auto sin_ = reinterpret_cast(sin_table); + auto cos_ = reinterpret_cast(cos_table); + auto t_ = reinterpret_cast(destination); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = 
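/* Fixed launch configuration: k_dim = {4, 1, 1} with a UNION1 task type, i.e. the
   kernel is dispatched as 4 parallel tasks. Inside the kernel, each task handles a
   contiguous chunk of the othersize = nt * nh (token, head) rows, with the chunk
   bounds derived from taskId and taskDim. */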
CNRT_FUNC_TYPE_UNION1; + + + RoPE<<>>(t_, pos_, sin_, cos_, stride_0, stride_1, nt, nh, dimsize); + cnrtQueueSync(queue); + + + +} + + +void RoPE_bang_f16(RoPEBangDescriptor_t desc, void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, void *stream) { + auto queue = reinterpret_cast(stream); + int nt = static_cast(desc->seq_len); + int nh = static_cast(desc->nhead); + int dimsize = static_cast(desc->dim); + auto stride_0 = desc->stride_0; + auto stride_1 = desc->stride_1; + + RoPEUnion(queue, t, pos_ids, sin_table, cos_table, stride_0, stride_1, nt, nh, dimsize); + +} + +infiniopStatus_t bangRoPE(RoPEBangDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + RoPE_bang_f16(desc, t, + pos_ids, + sin_table, + cos_table, stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc b/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc deleted file mode 100644 index c6d66faa..00000000 --- a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include "rotary_embedding_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -RotaryEmbeddingBangDescriptor::RotaryEmbeddingBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void rotary_embedding_cnnl_f16(RotaryEmbeddingBangDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); - ASSERT_EQ(pos.layout->shape[0], t.layout->shape[0]); - - auto nt = static_cast(t.layout->shape[0]), - nh = static_cast(t.layout->shape[1]), - dh = static_cast(t.layout->shape[2]); - - int inDim[4] = {nt, 1, nh, dh}; - int inDimStride[4] = {static_cast(t.layout->strides[0] / t.layout->dt.size), - 0, - static_cast(t.layout->strides[1] / t.layout->dt.size), - static_cast(t.layout->strides[2] / t.layout->dt.size)}; - int posDim[2] = {nt, 1}; - int thetaDim[2] = {1, dh / 2}; - int freqDim[2] = {nt, dh / 2}; - int freqConcatDim[2] = {nt, dh}; - int scalerDim[1] = {1}; - - cnnlTensorDescriptor_t inDesc, posDesc, thetaDesc, freqDesc, freqConcatDesc, scalerDesc; - cnnlCreateTensorDescriptor(&inDesc); - cnnlCreateTensorDescriptor(&posDesc); - cnnlCreateTensorDescriptor(&thetaDesc); - cnnlCreateTensorDescriptor(&freqDesc); - cnnlCreateTensorDescriptor(&freqConcatDesc); - cnnlCreateTensorDescriptor(&scalerDesc); - - cnnlSetTensorDescriptor(posDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32, 2, posDim); - cnnlSetTensorDescriptorEx(inDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, 4, inDim, inDimStride); - cnnlSetTensorDescriptor(thetaDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, thetaDim); - cnnlSetTensorDescriptor(freqDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, freqDim); - cnnlSetTensorDescriptor(freqConcatDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 2, freqConcatDim); - cnnlSetTensorDescriptor(scalerDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_FLOAT, 1, scalerDim); - - void *thetaData, *freqData, *freqConcatData, *scalerData; - cnrtMalloc(&thetaData, dh / 2 * sizeof(float) + nt * dh / 2 * 
sizeof(float) + nt * dh * sizeof(float) + sizeof(float)); - freqData = static_cast(thetaData) + dh / 2 * sizeof(float); - freqConcatData = static_cast(freqData) + nt * dh / 2 * sizeof(float); - scalerData = static_cast(freqConcatData) + nt * dh * sizeof(float); - - void *powWorkspace, *outerWorkspace, *concatWorkspace; - float zero = 0.0f, one = 1.0f; - float scaler = -2.0f / dh; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - cnrtMemcpy(scalerData, &scaler, sizeof(float), cnrtMemcpyHostToDev); - - void *workspace; - size_t workspaceSize = 0; - size_t powWorkspaceSize; - cnnlGetPowWorkspaceSize(handle, scalerDesc, thetaDesc, - thetaDesc, &powWorkspaceSize); - workspaceSize += powWorkspaceSize; - - // Use Broadcast Mul to calc t * theta_n - size_t outerWorkspaceSize; - cnnlGetOpTensorWorkspaceSize_v2(handle, descriptor->outerDesc, &one, - posDesc, pos.data, - &one, thetaDesc, thetaData, - &zero, freqDesc, freqData, - &outerWorkspaceSize); - workspaceSize += outerWorkspaceSize; - - // Concat two freqs to get [freq, freq] - size_t concatWorkspaceSize; - cnnlGetConcatWorkspaceSize(handle, 2, &concatWorkspaceSize); - workspaceSize += concatWorkspaceSize; - - cnrtMalloc(&workspace, workspaceSize); - powWorkspace = workspace; - outerWorkspace = static_cast(powWorkspace) + powWorkspaceSize; - concatWorkspace = static_cast(outerWorkspace) + outerWorkspaceSize; - - // Use Arange to get [0, 1, 2, ..., dh / 2] - cnnlArange_v2(handle, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &zero, - &scaler, thetaDesc, thetaData); - - // Use PowR to calc ((theta)^(-2/d))^n - cnrtMemcpy(scalerData, &theta, sizeof(float), cnrtMemcpyHostToDev); - - - cnnlPow(handle, CNNL_COMPUTATION_ULTRAHIGH_PRECISION, - scalerDesc, scalerData, thetaDesc, thetaData, - powWorkspace, powWorkspaceSize, thetaDesc, thetaData); - - - cnnlOpTensor(handle, descriptor->outerDesc, &one, - posDesc, pos.data, - &one, thetaDesc, thetaData, - outerWorkspace, outerWorkspaceSize, - &zero, freqDesc, freqData); - - - cnnlTensorDescriptor_t concatDescs[2] = {freqDesc, freqDesc}; - void *const concatData[2] = {freqData, freqData}; - - cnnlConcat(handle, 2, -1, concatDescs, concatData, - concatWorkspace, concatWorkspaceSize, - freqConcatDesc, freqConcatData); - - // Do RotaryEmbedding with t(fp16) and [freq, freq](fp32) - cnnlRotaryEmbedding_v2(handle, descriptor->ropeDesc, inDesc, t.data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - freqConcatDesc, freqConcatData, - nullptr, nullptr, nullptr, 0, - inDesc, t.data, nullptr, nullptr); - }); - - cnrtFree(thetaData); - cnrtFree(powWorkspace); - - cnnlDestroyTensorDescriptor(inDesc); - cnnlDestroyTensorDescriptor(posDesc); - cnnlDestroyTensorDescriptor(thetaDesc); - cnnlDestroyTensorDescriptor(freqDesc); - cnnlDestroyTensorDescriptor(freqConcatDesc); - cnnlDestroyTensorDescriptor(scalerDesc); -} diff --git a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h b/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h deleted file mode 100644 index a83a525d..00000000 --- a/src/ops/rotary_embedding/bang/rotary_embedding_cnnl.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __CNNL_ROTARY_EMBEDDING_H__ -#define __CNNL_ROTARY_EMBEDDING_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct RotaryEmbeddingBangDescriptor { - Device device; - cnnlOpTensorDescriptor_t outerDesc; - cnnlRotaryEmbeddingDescriptor_t ropeDesc; - - RotaryEmbeddingBangDescriptor(Device device); - void createCnnlDescriptors() { - cnnlCreateOpTensorDescriptor(&outerDesc); - 
cnnlCreateRotaryEmbeddingDescriptor(&ropeDesc); - cnnlSetOpTensorDescriptor(outerDesc, CNNL_OP_TENSOR_MUL, - CNNL_DTYPE_FLOAT, CNNL_NOT_PROPAGATE_NAN); - cnnlSetRotaryEmbeddingDescriptor_v2(ropeDesc, false, true, - false, false, CNNL_SEQDATA_TNBC); - } - void destroyCnnlDescriptors() { - cnnlDestroyOpTensorDescriptor(outerDesc); - cnnlDestroyRotaryEmbeddingDescriptor(ropeDesc); - } -}; - -void rotary_embedding_cnnl_f16(RotaryEmbeddingBangDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream); - -#endif// __CNNL_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc index 31c26de0..f433ed20 100644 --- a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc +++ b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.cc @@ -3,33 +3,136 @@ #include "../../utils.h" #include -void rotary_embedding_cpu_f16(Tensor t, Tensor pos, float theta) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); +struct RoPECpuDescriptor { + Device device; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; - auto nt = t.layout->shape[0], - nh = t.layout->shape[1], - dh = t.layout->shape[2] / 2; +void rotary_embedding_cpu_f16(RoPECpuDescriptor_t desc, + void *t, + uint64_t const *pos_ids, + float const *sin_table, + float const *cos_table) { + auto nt = desc->seq_len, + nh = desc->nhead, + dim = desc->dim, + dk = dim / 2; - ASSERT_EQ(pos.layout->shape[0], nt); - - auto stride_0 = t.layout->strides[0]; - auto stride_1 = t.layout->strides[1]; + auto stride_0 = desc->strides[0]; + auto stride_1 = desc->strides[1]; for (int i = 0; i < nt; ++i) { - auto pos_ = reinterpret_cast(pos.data) + i; + auto sin_ = sin_table + pos_ids[i] * dim; + auto cos_ = cos_table + pos_ids[i] * dim; for (int j = 0; j < nh; ++j) { - auto t_ = reinterpret_cast(reinterpret_cast(t.data) + i * stride_0 + j * stride_1); - for (int k = 0; k < dh; ++k) { + auto t_ = reinterpret_cast(t) + i * stride_0 + j * stride_1; + for (int k = 0; k < dk; ++k) { auto a = f16_to_f32(t_[2 * k]); auto b = f16_to_f32(t_[2 * k + 1]); - auto pos__ = *pos_; - float freq = float(pos__) / powf(theta, float(k) / float(dh)); - float sin = sinf(freq); - float cos = cosf(freq); - t_[2 * k] = f32_to_f16(a * cos - b * sin); - t_[2 * k + 1] = f32_to_f16(a * sin + b * cos); + float sin0 = sin_[k * 2], cos0 = cos_[k * 2]; + float sin1 = sin_[k * 2 + 1], cos1 = cos_[k * 2 + 1]; + t_[2 * k] = f32_to_f16(a * cos0 - b * sin0); + t_[2 * k + 1] = f32_to_f16(a * sin1 + b * cos1); } } } } + + +infiniopStatus_t cpuCreateRoPEDescriptor(CpuHandle_t handle, + RoPECpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) 
+ return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPECpuDescriptor{ + handle->device, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuGetRoPEWorkspaceSize(RoPECpuDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuRoPE(RoPECpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_cpu_f16(desc, t, + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table)); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuDestroyRoPEDescriptor(RoPECpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h index 15a1831a..8957b8c5 100644 --- a/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h +++ b/src/ops/rotary_embedding/cpu/rotary_embedding_cpu.h @@ -2,11 +2,31 @@ #define __CPU_ROTARY_EMBEDDING_H__ #include "operators.h" +#include "../../../devices/cpu/cpu_handle.h" -struct RotaryEmbeddingCpuDescriptor { - Device device; -}; +struct RoPECpuDescriptor; + +typedef struct RoPECpuDescriptor *RoPECpuDescriptor_t; + +infiniopStatus_t cpuCreateRoPEDescriptor(CpuHandle_t handle, + RoPECpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t cpuGetRoPEWorkspaceSize(RoPECpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuRoPE(RoPECpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t cpuDestroyRoPEDescriptor(RoPECpuDescriptor_t desc); -void rotary_embedding_cpu_f16(Tensor t, Tensor pos, float theta); #endif// __CPU_RMS_NORM_H__ diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cc b/src/ops/rotary_embedding/cuda/rotary_embedding.cc new file mode 100644 index 00000000..102eb474 --- /dev/null +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, + RoPECudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + 
sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPECudaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t cudaDestroyRoPEDescriptor(RoPECudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cu b/src/ops/rotary_embedding/cuda/rotary_embedding.cu index 373abcb1..62579c3d 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cu +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cu @@ -2,41 +2,69 @@ #include "rotary_embedding.cuh" #include -static __global__ void padding( - half2 *__restrict__ x_, - unsigned int const *__restrict__ pos_, - float const theta, - unsigned int const leading_dim) { - auto dh = blockDim.x; +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; - auto &x = x_[blockIdx.x * leading_dim + blockIdx.y * dh + k]; - auto pos = float(pos_[blockIdx.x]); + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} - float sin, cos; - sincosf(pos / powf(theta, float(k) / float(dh)), &sin, &cos); - x = x * half2(cos, cos) + half2(-x.y, x.x) * half2(sin, sin); -} +void rotary_embedding_nv_gpu_f16( + RoPECudaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; -constexpr static int - BLOCK_SIZE = 1024; + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; -void rotary_embedding_nv_gpu_f16(Tensor t, Tensor pos, float theta, void *stream) { - ASSERT_EQ(t.layout->ndim, 3); - ASSERT_EQ(pos.layout->ndim, 1); + auto cuda_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, sin_, cos_, stride0, stride1); +} - auto nt = t.layout->shape[0], - nh = t.layout->shape[1], - dh = t.layout->shape[2]; +infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || 
sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; - ASSERT_EQ(pos.layout->shape[0], nt); - ASSERT(dh < BLOCK_SIZE); + checkCudaError(cudaSetDevice(desc->device_id)); - auto t_ptr = reinterpret_cast(t.data); - auto pos_ptr = reinterpret_cast(pos.data); - auto leading_dim = t.layout->strides[0] / 4; + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_nv_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } - auto cuda_stream = reinterpret_cast(stream); - padding<<>>(t_ptr, pos_ptr, theta, leading_dim); + return STATUS_SUCCESS; } diff --git a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh index 83ee010e..36b14194 100644 --- a/src/ops/rotary_embedding/cuda/rotary_embedding.cuh +++ b/src/ops/rotary_embedding/cuda/rotary_embedding.cuh @@ -1,12 +1,40 @@ #ifndef __NV_GPU_ROTARY_EMBEDDING_H__ #define __NV_GPU_ROTARY_EMBEDDING_H__ +#include "../../../devices/cuda/cuda_handle.h" #include "operators.h" -struct RotaryEmbeddingCudaDescriptor { +struct RoPECudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; }; -void rotary_embedding_nv_gpu_f16(Tensor t, Tensor pos, float theta, void *stream); +typedef struct RoPECudaDescriptor *RoPECudaDescriptor_t; + +infiniopStatus_t cudaCreateRoPEDescriptor(CudaHandle_t handle, + RoPECudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t cudaGetRoPEWorkspaceSize(RoPECudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaRoPE(RoPECudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t cudaDestroyRoPEDescriptor(RoPECudaDescriptor_t desc); #endif// __NV_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc b/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc new file mode 100644 index 00000000..171f1c57 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding_maca.h" +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" + +infiniopStatus_t macaCreateRoPEDescriptor(MacaHandle_t handle, + RoPEMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + 
cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEMacaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t macaGetRoPEWorkspaceSize(RoPEMacaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t macaDestroyRoPEDescriptor(RoPEMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.h b/src/ops/rotary_embedding/maca/rotary_embedding_maca.h new file mode 100644 index 00000000..f5de3b14 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.h @@ -0,0 +1,40 @@ +#ifndef __METAX_GPU_ROTARY_EMBEDDING_H__ +#define __METAX_GPU_ROTARY_EMBEDDING_H__ + +#include "../../../devices/maca/maca_handle.h" +#include "operators.h" + +struct RoPEMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; + +typedef struct RoPEMacaDescriptor *RoPEMacaDescriptor_t; + +infiniopStatus_t macaCreateRoPEDescriptor(MacaHandle_t handle, + RoPEMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t macaGetRoPEWorkspaceSize(RoPEMacaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t macaRoPE(RoPEMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t macaDestroyRoPEDescriptor(RoPEMacaDescriptor_t desc); + +#endif// __METAX_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca b/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca new file mode 100644 index 00000000..aaa52250 --- /dev/null +++ b/src/ops/rotary_embedding/maca/rotary_embedding_maca.maca @@ -0,0 +1,70 @@ +#include "../../utils.h" +#include "rotary_embedding_maca.h" +#include + +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; + auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; + + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} + + +void rotary_embedding_mc_gpu_f16( + RoPEMacaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; + + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; + + auto maca_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, 
sin_, cos_, stride0, stride1); +} + +infiniopStatus_t macaRoPE(RoPEMacaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + checkMacaError(hcSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_mc_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc b/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc new file mode 100644 index 00000000..9ba0547d --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.cc @@ -0,0 +1,76 @@ +#include "rotary_embedding_musa.h" +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" + +infiniopStatus_t musaCreateRoPEDescriptor(MusaHandle_t handle, + RoPEMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + if (desc_ptr == nullptr) + return STATUS_MEMORY_NOT_ALLOCATED; + + if (t->ndim != 3 || + pos_ids->ndim != 1 || + sin_table->ndim != 2 || + cos_table->ndim != 2) + return STATUS_BAD_TENSOR_SHAPE; + + auto seq_len = t->shape[0]; + auto nhead = t->shape[1]; + auto dim = t->shape[2]; + auto total_seq_len = sin_table->shape[0]; + + if (dim % 2 != 0) + return STATUS_BAD_TENSOR_SHAPE; + + if (pos_ids->shape[0] != seq_len || + sin_table->shape[1] != dim || + cos_table->shape[1] != dim || + sin_table->shape[0] != cos_table->shape[0]) + return STATUS_BAD_TENSOR_SHAPE; + + // TODO: support larger dim in the future + if (dim / 2 > MAX_THREADS_PER_BLOCK) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (t->strides[2] != 1 || + pos_ids->strides[0] != 1 || + sin_table->strides[1] != 1 || + cos_table->strides[1] != 1) + return STATUS_BAD_TENSOR_STRIDES; + + if (!dtype_eq(t->dt, F16)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32)) + return STATUS_BAD_TENSOR_DTYPE; + + if (!dtype_eq(pos_ids->dt, U64)) + return STATUS_BAD_TENSOR_DTYPE; + + *desc_ptr = new RoPEMusaDescriptor{ + handle->device, + handle->device_id, + t->dt, + seq_len, + nhead, + dim, + total_seq_len, + {t->strides[0], t->strides[1]}}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t musaGetRoPEWorkspaceSize(RoPEMusaDescriptor_t desc, uint64_t *size) { + *size = 0; + return STATUS_SUCCESS; +} + + +infiniopStatus_t musaDestroyRoPEDescriptor(RoPEMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.h b/src/ops/rotary_embedding/musa/rotary_embedding_musa.h new file mode 100644 index 00000000..7a14daea --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.h @@ -0,0 +1,40 @@ +#ifndef __MUSA_ROTARY_EMBEDDING_H__ +#define __MUSA_ROTARY_EMBEDDING_H__ + +#include "../../../devices/musa/musa_handle.h" +#include "operators.h" + +struct RoPEMusaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t nhead; + uint64_t dim; + uint64_t total_seq_len; + int64_t strides[2]; +}; + +typedef struct RoPEMusaDescriptor *RoPEMusaDescriptor_t; + +infiniopStatus_t 
musaCreateRoPEDescriptor(MusaHandle_t handle, + RoPEMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +infiniopStatus_t musaGetRoPEWorkspaceSize(RoPEMusaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t musaRoPE(RoPEMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +infiniopStatus_t musaDestroyRoPEDescriptor(RoPEMusaDescriptor_t desc); + +#endif// __MT_GPU_ROTARY_EMBEDDING_H__ diff --git a/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu b/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu new file mode 100644 index 00000000..bac7ad47 --- /dev/null +++ b/src/ops/rotary_embedding/musa/rotary_embedding_musa.mu @@ -0,0 +1,68 @@ +#include "../../utils.h" +#include "rotary_embedding_musa.h" +#include + +static __global__ void padding_f16( + half *__restrict__ x_, + uint64_t const *__restrict__ pos_, + float const *__restrict__ sin_, + float const *__restrict__ cos_, + long const stride0, + long const stride1) { + auto dk = blockDim.x; + auto k = threadIdx.x; + auto offset = blockIdx.x * stride0 + blockIdx.y * stride1 + k * 2; + auto &x = reinterpret_cast(x_[offset]); + auto pos = pos_[blockIdx.x]; + auto sincos_offset = pos * dk * 2 + k * 2; + + float sin0 = sin_[sincos_offset], cos0 = cos_[sincos_offset], + sin1 = sin_[sincos_offset + 1], cos1 = cos_[sincos_offset + 1]; + float x0 = __half2float(x.x) * cos0 - __half2float(x.y) * sin0; + float x1 = __half2float(x.y) * cos1 + __half2float(x.x) * sin1; + x = half2(x0, x1); +} + + +void rotary_embedding_mt_gpu_f16( + RoPEMusaDescriptor_t desc, + half *t, + uint64_t const *pos, + float const *sin_, float const *cos_, + void *stream) { + auto nt = desc->seq_len, + nh = desc->nhead, + dh = desc->dim; + + // batching 2 half together + auto stride0 = desc->strides[0], + stride1 = desc->strides[1]; + + auto musa_stream = reinterpret_cast(stream); + padding_f16<<>>(t, pos, sin_, cos_, stride0, stride1); +} + +infiniopStatus_t musaRoPE(RoPEMusaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + if (t == nullptr || pos_ids == nullptr || sin_table == nullptr || cos_table == nullptr) + return STATUS_BAD_PARAM; + + if (dtype_eq(desc->dtype, F16)) { + rotary_embedding_mt_gpu_f16(desc, + reinterpret_cast(t), + reinterpret_cast(pos_ids), + reinterpret_cast(sin_table), + reinterpret_cast(cos_table), + stream); + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + return STATUS_SUCCESS; +} diff --git a/src/ops/rotary_embedding/operator.cc b/src/ops/rotary_embedding/operator.cc index dcfd1282..bc2dbc09 100644 --- a/src/ops/rotary_embedding/operator.cc +++ b/src/ops/rotary_embedding/operator.cc @@ -2,85 +2,209 @@ #include "ops/rotary_embedding/rotary_embedding.h" #ifdef ENABLE_CPU +#include "../../devices/cpu/cpu_handle.h" #include "cpu/rotary_embedding_cpu.h" #endif #ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" #include "cuda/rotary_embedding.cuh" #endif #ifdef ENABLE_CAMBRICON_MLU -#include "bang/rotary_embedding_cnnl.h" +#include "bang/rotary_embedding_bang.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/rotary_embedding.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/rotary_embedding_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include 
"musa/rotary_embedding_musa.h" #endif -struct RotaryEmbeddingDescriptor { +struct RoPEDescriptor { Device device; }; -__C void *createRotaryEmbeddingDescriptor(Device device, void *config) { - switch (device) { + +__C infiniopStatus_t infiniopCreateRoPEDescriptor(infiniopHandle_t handle, + infiniopRoPEDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t t, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + switch (handle->device) { #ifdef ENABLE_CPU case DevCpu: - return (RotaryEmbeddingDescriptor *) (new RotaryEmbeddingCpuDescriptor{device}); + return cpuCreateRoPEDescriptor((CpuHandle_t) handle, (RoPECpuDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (RotaryEmbeddingDescriptor *) (new RotaryEmbeddingCudaDescriptor{device}); + case DevNvGpu: { + return cudaCreateRoPEDescriptor((CudaHandle_t) handle, (RoPECudaDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = new RotaryEmbeddingBangDescriptor(device); - bangDescriptor->createCnnlDescriptors(); - return (RotaryEmbeddingDescriptor *) (bangDescriptor); + return bangCreateRoPEDescriptor((BangHandle_t) handle, (RoPEBangDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendCreateRoPEDescriptor((AscendHandle_t) handle, + (RoPEAscendDescriptor_t *) desc_ptr, + t, + pos_ids, + sin_table, + cos_table); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateRoPEDescriptor((MacaHandle_t) handle, + (RoPEMacaDescriptor_t *) desc_ptr, + t, + pos_ids, + sin_table, + cos_table); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaCreateRoPEDescriptor((MusaHandle_t) handle, (RoPEMusaDescriptor_t *) desc_ptr, t, pos_ids, sin_table, cos_table); } #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; -}; + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetRoPEWorkspaceSize((RoPECudaDescriptor_t) desc, size); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + case DevCambriconMlu: { + return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t) desc, size); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendGetRoPEWorkspaceSize((RoPEAscendDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaGetRoPEWorkspaceSize((RoPEMacaDescriptor_t) desc, + size); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} -__C void destroyRotaryEmbeddingDescriptor(RotaryEmbeddingDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *t, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (RotaryEmbeddingCpuDescriptor *) (descriptor); - break; + return cpuRoPE((RoPECpuDescriptor_t) desc, workspace, workspace_size, t, pos_ids, 
sin_table, cos_table, stream); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - delete (RotaryEmbeddingCudaDescriptor *) (descriptor); - break; + case DevNvGpu: { + return cudaRoPE((RoPECudaDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); + } + #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = (RotaryEmbeddingBangDescriptor *) (descriptor); - bangDescriptor->destroyCnnlDescriptors(); - delete bangDescriptor; - break; + return bangRoPE((RoPEBangDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendRoPE((RoPEAscendDescriptor_t) desc, + workspace, + workspace_size, + t, + pos_ids, + sin_table, + cos_table, + stream); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaRoPE((RoPEMacaDescriptor_t) desc, + workspace, + workspace_size, + t, + pos_ids, + sin_table, + cos_table, + stream); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaRoPE((RoPEMusaDescriptor_t) desc, workspace, workspace_size, t, pos_ids, sin_table, cos_table, stream); } #endif - default: - PANIC(UnsupportedDevice); } + return STATUS_BAD_DEVICE; } -__C void rotaryEmbedding(RotaryEmbeddingDescriptor *descriptor, Tensor t, Tensor pos, float theta, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - rotary_embedding_cpu_f16(t, pos, theta); - break; + return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - rotary_embedding_nv_gpu_f16(t, pos, theta, stream); - break; + case DevNvGpu: { + return cudaDestroyRoPEDescriptor((RoPECudaDescriptor_t) desc); + } + #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - rotary_embedding_cnnl_f16((RotaryEmbeddingBangDescriptor *) (descriptor), t, pos, theta, stream); - break; + case DevCambriconMlu: { + return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: { + return ascendDestroyRoPEDescriptor((RoPEAscendDescriptor_t) desc); + } +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaDestroyRoPEDescriptor((RoPEMacaDescriptor_t) desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t) desc); + } #endif - default: - PANIC(UnsupportedDevice); } -}; + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/swiglu/ascend/swiglu.cc b/src/ops/swiglu/ascend/swiglu.cc new file mode 100644 index 00000000..ff2ee514 --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu.cc @@ -0,0 +1,71 @@ +#include "swiglu.h" + + +infiniopStatus_t ascendCreateSwiGLUDescriptor(AscendHandle_t handle, + SwiGLUAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + uint64_t ndim = c_desc->ndim; + DT dtype = c_desc->dt; + + aclDataType dt; + if (dtype_eq(dtype, F16)) { + dt = aclDataType::ACL_FLOAT16; + } else if (dtype_eq(dtype, F32)) { + dt = aclDataType::ACL_FLOAT; + } else { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + if (c_desc->strides[1] != 1 || a_desc->strides[1] != 1 || b_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + int32_t seq_len = 
static_cast(c_desc->shape[0]), + di = static_cast(c_desc->shape[1]); + + int32_t sta = static_cast(a_desc->strides[0]); + int32_t stb = static_cast(b_desc->strides[0]); + int32_t stc = static_cast(c_desc->strides[0]); + + *desc_ptr = new SwiGLUAscendDescriptor{ + handle->device, + handle->device_id, + dt, + seq_len, + di, + sta, + stb, + stc}; + return STATUS_SUCCESS; +} + +infiniopStatus_t ascendSwiGLU(SwiGLUAscendDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto seq_len = desc->seq_len, + di = desc->di; + + auto sta = desc->sta, + stb = desc->stb, + stc = desc->stc; + + auto dt = desc->dtype; + + // Set device + aclrtSetDevice(desc->device_id); + + return swiglu_kernel_do(c, (void *) a, (void *) b, 1.0, seq_len, di, sta, stb, stc, dt, stream); +} + +infiniopStatus_t ascendDestroySwiGLUDescriptor(SwiGLUAscendDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/ascend/swiglu.h b/src/ops/swiglu/ascend/swiglu.h new file mode 100644 index 00000000..be02a318 --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu.h @@ -0,0 +1,45 @@ +#ifndef __ACLNN_SWIGLU_H__ +#define __ACLNN_SWIGLU_H__ + +#include "../../../devices/ascend/ascend_handle.h" +#include "../../../devices/ascend/tensor_aclnn.h" +#include "../../utils.h" +#include "operators.h" +#include "../../utils.h" +#include +#include + + +struct SwiGLUAscendDescriptor { + Device device; + int device_id; + aclDataType dtype; + int32_t seq_len; + int32_t di; + int32_t sta; + int32_t stb; + int32_t stc; +}; + +typedef struct SwiGLUAscendDescriptor *SwiGLUAscendDescriptor_t; + +infiniopStatus_t ascendCreateSwiGLUDescriptor(AscendHandle_t handle, + SwiGLUAscendDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t ascendSwiGLU(SwiGLUAscendDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t ascendDestroySwiGLUDescriptor(SwiGLUAscendDescriptor_t desc); + +extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void *stream); + +#endif diff --git a/src/ops/swiglu/ascend/swiglu_kernel.cpp b/src/ops/swiglu/ascend/swiglu_kernel.cpp new file mode 100644 index 00000000..3dab674f --- /dev/null +++ b/src/ops/swiglu/ascend/swiglu_kernel.cpp @@ -0,0 +1,181 @@ +#include "../../../../include/status.h" +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BLOCK_NUM = 8; + +template class KernelSwiGLU { +public: + __aicore__ inline KernelSwiGLU() {} + // Init SwiGLU + // c output tensor, support only 2 dim + // a up tensor + // b gate tensor + // formular: b = a x silu(b) + // a, b, c has same tensor shape + __aicore__ inline void Init(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base); + __aicore__ inline void Process(); + +private: + __aicore__ inline void CopyIn(int32_t i); + __aicore__ inline void Compute(int32_t i); + __aicore__ inline void CopyOut(int32_t i); + +private: + TPipe pipe; + TQue aQue; + TQue bQue; + TQue cQue; + // Used in GatherMask + // TBuf outBuf; + + GlobalTensor aGm; + GlobalTensor bGm; + GlobalTensor cGm; + + uint32_t _block_idx; + uint32_t _tile_len; + uint32_t _copy_len; + + // c[nt, dh] + // strides = [stx, 1] + int32_t nt; + int32_t dh; + int32_t sta; + 
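/* sta, stb and stc are the row strides, in elements, of the up tensor a, the gate
   tensor b and the output c (strides[0] of each descriptor, see
   ascendCreateSwiGLUDescriptor). Each of the nt rows is tiled across the launch
   blocks: _tile_len is base = dh / BLOCK_NUM elements, plus one extra element for
   the first dh % BLOCK_NUM blocks. */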
int32_t stb; + int32_t stc; + float beta; +}; + + +template +__aicore__ inline void KernelSwiGLU::Init(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + + this->nt = nt; + this->dh = dh; + this->beta = beta; + this->sta = sta; + this->stb = stb; + this->stc = stc; + + _block_idx = GetBlockIdx(); + _tile_len = _block_idx < remainder ? base + 1 : base; + _copy_len = _tile_len * sizeof(T) % 32 == 0 + ? _tile_len + : (_tile_len * sizeof(T) + 31) / 32 * 32 / sizeof(T); + + // Set global tensor + aGm.SetGlobalBuffer((__gm__ T *) a); + bGm.SetGlobalBuffer((__gm__ T *) b); + cGm.SetGlobalBuffer((__gm__ T *) c); + + // Pipe alloc memory to queue, the unit is bytes + pipe.InitBuffer(aQue, BUFFER_NUM, _copy_len * sizeof(T)); + pipe.InitBuffer(bQue, BUFFER_NUM, _copy_len * sizeof(T)); + pipe.InitBuffer(cQue, BUFFER_NUM, _copy_len * sizeof(T)); +} + +template +__aicore__ inline void KernelSwiGLU::CopyIn(int32_t i) { + // Alloc tensor from queue memory + LocalTensor aUb = aQue.AllocTensor(); + LocalTensor bUb = bQue.AllocTensor(); + // Get idx of current tile + auto idxa = i * sta + _block_idx * _tile_len; + auto idxb = i * stb + _block_idx * _tile_len; + // Copy process_th tile from global tensor to local tensor + // See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/opdevgapi/atlasascendc_api_07_0105.html + // DataCopy cut down if _tile_len * sizeof(T) / 32 != 0 + DataCopy(aUb, aGm[idxa], _copy_len); + DataCopy(bUb, bGm[idxb], _copy_len); + + // Enque input tensor to VECIN queue + aQue.EnQue(aUb); + bQue.EnQue(bUb); +} + +template +__aicore__ inline void KernelSwiGLU::Compute(int32_t i) { + // Deque input tensors from VECIN queue + LocalTensor aUb = aQue.DeQue(); + LocalTensor bUb = bQue.DeQue(); + LocalTensor cUb = cQue.AllocTensor(); + // Call SwiGLU ascend api + SwiGLU(cUb, aUb, bUb, beta); + // Enque result and free input + cQue.EnQue(cUb); + aQue.FreeTensor(aUb); + bQue.FreeTensor(bUb); +} + +template +__aicore__ inline void KernelSwiGLU::CopyOut(int32_t i) { + // Deque output tensor from VECOUT queue + LocalTensor cUb = cQue.DeQue(); + auto idxc = i * stc + _block_idx * _tile_len; + // Copy progress_th tile from local tensor to global tensor + // Use Gather mask if _tile_len * sizeof(T) % 32 != 0 + if (_tile_len * sizeof(T) % 32 != 0) { + DataCopyExtParams dcep = {1, static_cast(_tile_len * sizeof(T)), 0, 0, 0}; + DataCopyPad(cGm[idxc], cUb, dcep); + } + DataCopy(cGm[idxc], cUb, _tile_len); + // Free output Local tensor + cQue.FreeTensor(cUb); +} + +template +__aicore__ inline void KernelSwiGLU::Process() { + for (int32_t i = 0; i < nt; ++i) { + CopyIn(i); + Compute(i); + CopyOut(i); + } +} + +__global__ __aicore__ void swiglu_kernel_f16(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + KernelSwiGLU op; + op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + op.Process(); +} + +__global__ __aicore__ void swiglu_kernel_f32(GM_ADDR c, GM_ADDR a, GM_ADDR b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + uint32_t remainder, uint32_t base) { + KernelSwiGLU op; + op.Init(c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + op.Process(); +} + +extern "C" infiniopStatus_t swiglu_kernel_do(void *c, void *a, void *b, + float beta, int32_t nt, int32_t dh, + int32_t sta, int32_t stb, int32_t stc, + int dtype, void 
*stream) { + + // Tiling params + auto base = static_cast(dh / BLOCK_NUM); + auto remainder = static_cast(dh % BLOCK_NUM); + + switch (dtype) { + case 0: + swiglu_kernel_f32<<>>( + c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + return STATUS_SUCCESS; + case 1: + swiglu_kernel_f16<<>>( + c, a, b, beta, nt, dh, sta, stb, stc, remainder, base); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/bang/swiglu_bang.cc b/src/ops/swiglu/bang/swiglu_bang.cc new file mode 100644 index 00000000..7654bf4f --- /dev/null +++ b/src/ops/swiglu/bang/swiglu_bang.cc @@ -0,0 +1,50 @@ +#include "swiglu_bang.h" +#include "../../utils.h" + +infiniopStatus_t bangCreateSwiGLUDescriptor(BangHandle_t handle, + SwiGLUBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUBangDescriptor{handle->device, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroySwiGLUDescriptor(SwiGLUBangDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/bang/swiglu_bang.h b/src/ops/swiglu/bang/swiglu_bang.h index 7e81ebee..5eabc103 100644 --- a/src/ops/swiglu/bang/swiglu_bang.h +++ b/src/ops/swiglu/bang/swiglu_bang.h @@ -1,10 +1,35 @@ #ifndef __BANG_SWIGLU_H__ #define __BANG_SWIGLU_H__ +#include "../../../devices/bang/bang_handle.h" #include "../../utils.h" -#include "cnrt.h" #include "operators.h" -void swiglu_bang_f16(Tensor gate, Tensor up, void *stream); +struct SwiGLUBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUBangDescriptor *SwiGLUBangDescriptor_t; + +infiniopStatus_t bangCreateSwiGLUDescriptor(BangHandle_t handle, + SwiGLUBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t bangSwiGLU(SwiGLUBangDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t bangDestroySwiGLUDescriptor(SwiGLUBangDescriptor_t desc); #endif// __BANG_SWIGLU_H__ diff --git a/src/ops/swiglu/bang/swiglu_bang.mlu b/src/ops/swiglu/bang/swiglu_bang.mlu index e1323236..b43c5e10 100644 --- a/src/ops/swiglu/bang/swiglu_bang.mlu +++ b/src/ops/swiglu/bang/swiglu_bang.mlu @@ -3,125 +3,20 @@ #include "cnrt.h" #include "swiglu_bang.h" #include "../../../devices/bang/common_bang.h" -const int SRC_MAX_SIZE = 1024 * 64;//至少大于等于128字节 -__nram__ char nram_buffer[NRAM_MAX_SIZE]; +#include "../../utils.h" -template -__mlu_device__ void swigluKernel(T *gate, int *gate_stride, T const 
*up, int *up_stride, int *shape, int othersize, int dimsize, int ndim){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - - if(dimsize >= maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * up_stride[j]; - indd += (indi % shape[j]) * gate_stride[j]; - indi /= shape[j]; - } - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{ - T *src = (T *)nram_buffer;//[dimsize] - T *dest = src + dimsize; //[dimsize] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - for (int j = ndim - 2; j >= 0; --j) { - inds += (indi % shape[j]) * up_stride[j]; - indd += (indi % shape[j]) * gate_stride[j]; - indi /= shape[j]; - } - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate - - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - - } -} -template -__mlu_global__ void swigluUnion1(T *gate, int *gate_stride, T const *up, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { +const int SRC_MAX_SIZE = 1024 * 32;//至少大于等于128字节 +__nram__ char nram_buffer[NRAM_MAX_SIZE]; - swigluKernel(gate, gate_stride, up, up_stride, shape, othersize, dimsize, ndim); -} -template -void swiglu(cnrtQueue_t queue, void *gate, int *gate_stride, void const *up, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - - swigluUnion1<<>>(y_, gate_stride, x_, up_stride, shape, othersize, dimsize, ndim); - // cnrtQueueSync(queue); - -} -void swiglu_fp16(cnrtQueue_t queue, void *gate, void *up, int *gate_stride, int *up_stride, int *shape, int othersize, int dimsize, int ndim) { - - swiglu(queue, gate, gate_stride, up, up_stride, shape, othersize, dimsize, ndim); - -} template -__mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD_f, int othersize, int dimsize){ +__mlu_global__ void swigluDim_2(T const *a_, T const *b_, T *c_, int stride_a, int stride_b, int stride_c, int othersize, int dimsize){ const int maxNum = SRC_MAX_SIZE/sizeof(T); if(dimsize >= maxNum){ T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] + T *dest = src + 3 * maxNum; //[maxNum] int remainT = othersize % taskDim; int stepEasy = (othersize - remainT) / taskDim; int stepHard = stepEasy + 1; @@ -130,33 +25,46 @@ __mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD int remain = dimsize % maxNum; int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; + int tid_a; + int tid_b; + int tid_c; for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; + int ind_a = 0; + int ind_b = 0; + int ind_c = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); + ind_a += (indi % othersize) * stride_a; + ind_b += (indi % othersize) * stride_b; + ind_c += (indi % othersize) * stride_c; + for(int s = 0; s < repeat + 2; s++){ + + if(s < repeat){ + tid_a = ind_a + s * maxNum; + tid_b = ind_b + s * maxNum; + __memcpy_async(src + s % 3 * maxNum, a_ + tid_a, maxNum * sizeof(T), 
GDRAM2NRAM); + __memcpy_async(dest + s % 3 * maxNum, b_ + tid_b, maxNum * sizeof(T), GDRAM2NRAM); + } + if(s > 0 && s < repeat + 1){ + __bang_mul(src + (s - 1) % 3 * maxNum, src + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//a_ = a_ * b_ + __bang_active_sigmoid(dest + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//b_ = sigmoid(b_) + __bang_mul(src + (s - 1) % 3 * maxNum, src + (s - 1) % 3 * maxNum, dest + (s - 1) % 3 * maxNum, maxNum);//a_ = a_ * b_ + } + if(s > 1){ + tid_c = ind_c + (s - 2) * maxNum; + __memcpy_async(c_ + tid_c, src + (s - 2) % 3 * maxNum, maxNum * sizeof(T), NRAM2GDRAM); + } + __sync_all_ipu(); } if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); + tid_a = ind_a + repeat * maxNum; + tid_b = ind_b + repeat * maxNum; + tid_c = ind_c + repeat * maxNum; + __memcpy(src, a_ + tid_a, remain * sizeof(T), GDRAM2NRAM); + __memcpy(dest, b_ + tid_b, remain * sizeof(T), GDRAM2NRAM); + __bang_mul(src, src, dest, remain);//a_ = a_ * b_ + __bang_active_sigmoid(dest, dest, remain);//b_ = sigmoid(b_) + __bang_mul(src, src, dest, remain);//a_ = a_ * b_ + __memcpy(c_ + tid_c, src, remain * sizeof(T), NRAM2GDRAM); } } } @@ -170,29 +78,32 @@ __mlu_global__ void swigluDim_2(T *gate, T const *up, int strideS_f, int strideD int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; + int ind_a = 0; + int ind_b = 0; + int ind_c = 0; int indi = i; - inds += (indi % othersize) * strideS_f; - indd += (indi % othersize) * strideD_f; - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); + ind_a += (indi % othersize) * stride_a; + ind_b += (indi % othersize) * stride_b; + ind_c += (indi % othersize) * stride_c; + __memcpy(src, a_ + ind_a, dimsize * sizeof(T), GDRAM2NRAM); + __memcpy(dest, b_ + ind_b, dimsize * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate + __bang_mul(src, src, dest, dimsize);//a_ = a_ * b_ + __bang_active_sigmoid(dest, dest, dimsize);//b_ = sigmoid(b_) + __bang_mul(src, src, dest, dimsize);//a_ = a_ * b_ - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); + __memcpy(c_ + ind_c, src, dimsize * sizeof(T), NRAM2GDRAM); } } } template -void swigluUnionDim_2(cnrtQueue_t queue, void *gate, void const *up, int strideS_f, int strideD_f, int othersize, int dimsize) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); +void swigluUnionDim_2(cnrtQueue_t queue, void const *a, void const *b, void *c, int stride_a, int stride_b, int stride_c, int othersize, int dimsize) { + auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -201,156 +112,35 @@ void swigluUnionDim_2(cnrtQueue_t queue, void *gate, void const *up, int strideS k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - swigluDim_2<<>>(y_, x_, strideS_f, strideD_f, 
othersize, dimsize); - // cnrtQueueSync(queue); + swigluDim_2<<>>(a_, b_, c_, stride_a, stride_b, stride_c, othersize, dimsize); } -template -__mlu_global__ void swigluDim_3(T *gate, T const *up, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize){ - - const int maxNum = SRC_MAX_SIZE/sizeof(T); - int startDim = othersize / middle; - if(dimsize >= maxNum){ - T *src = (T *)nram_buffer;//[maxNum] - T *dest = src + maxNum; //[maxNum] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - int remain = dimsize % maxNum; - int repeat = (dimsize - remain) / maxNum; - int tidS; - int tidD; - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - for(int s = 0; s < repeat; s++){ - tidS = inds + s * maxNum; - tidD = indd + s * maxNum; - __memcpy(src, up + tidS, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, maxNum * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, maxNum);//up = up * gate - __bang_active_sigmoid(dest, dest, maxNum);//gate = sigmoid(gate) - __bang_mul(src, src, dest, maxNum);//up = up * gate - __memcpy(gate + tidD, src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(remain){ - tidS = inds + repeat * maxNum; - tidD = indd + repeat * maxNum; - __memcpy(src, up + tidS, remain * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + tidD, remain * sizeof(T), GDRAM2NRAM); - __bang_mul(src, src, dest, remain);//up = up * gate - __bang_active_sigmoid(dest, dest, remain);//gate = sigmoid(gate) - __bang_mul(src, src, dest, remain);//up = up * gate - __memcpy(gate + tidD, src, remain * sizeof(T), NRAM2GDRAM); - } - } - } - else{ - T *src = (T *)nram_buffer;//[dimsize] - T *dest = src + dimsize; //[dimsize] - int remainT = othersize % taskDim; - int stepEasy = (othersize - remainT) / taskDim; - int stepHard = stepEasy + 1; - int step = (taskId < remainT ? stepHard : stepEasy); - int indStart = (taskId < remainT ? 
taskId * stepHard : (taskId - remainT) * stepEasy + remainT * stepHard); - - for(int i = indStart; i < indStart + step; i++){ - int inds = 0; - int indd = 0; - int indi = i; - inds += (indi % middle) * strideS_m; - indd += (indi % middle) * strideD_m; - indi /= middle; - inds += (indi % startDim) * strideS_f; - indd += (indi % startDim) * strideD_f; - __memcpy(src, up + inds, dimsize * sizeof(T), GDRAM2NRAM); - __memcpy(dest, gate + indd, dimsize * sizeof(T), GDRAM2NRAM); - - __bang_mul(src, src, dest, dimsize);//up = up * gate - __bang_active_sigmoid(dest, dest, dimsize);//gate = sigmoid(gate) - __bang_mul(src, src, dest, dimsize);//up = up * gate +void swiglu_bang_f16(SwiGLUBangDescriptor_t desc, void const *a, void const *b, void *c, void *stream) { + auto queue = reinterpret_cast(stream); + auto seq_len = desc->seq_len, + di = desc->di; - __memcpy(gate + indd, src, dimsize * sizeof(T), NRAM2GDRAM); - } - - } -} -template -void swigluUnionDim_3(cnrtQueue_t queue, void *gate, void const *up, int strideS_f, int strideS_m, int strideD_f, int strideD_m, int othersize, int middle, int dimsize) { - - auto y_ = reinterpret_cast(gate); - auto x_ = reinterpret_cast(up); - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; - k_dim.x = 4; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - swigluDim_3<<>>(y_, x_, strideS_f, strideS_m, strideD_f, strideD_m, othersize, middle, dimsize); - // cnrtQueueSync(queue); + swigluUnionDim_2(queue, a, b, c, stride_a, stride_b, stride_c, seq_len, di); + } -void swiglu_bang_f16(Tensor gate, Tensor up, void *stream) { - auto queue = reinterpret_cast(stream); - int num = 1; - int ndim = gate.layout->ndim; - int gate_stride[ndim], up_stride[ndim], shape[ndim]; - for (int i = 0; i < ndim; i++) { - gate_stride[i] = gate.layout->strides[i] / gate.layout->dt.size; - up_stride[i] = up.layout->strides[i] / up.layout->dt.size; - shape[i] = gate.layout->shape[i]; - num *= shape[i]; - } - if(ndim == 2){ - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); - auto n = gate.layout->shape[0], - d = gate.layout->shape[1]; - int strideS_f = up_stride[0]; - int strideD_f = gate_stride[0]; - swigluUnionDim_2(queue, gate.data, up.data, strideS_f, strideD_f, n, d); +infiniopStatus_t bangSwiGLU(SwiGLUBangDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream){ + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + swiglu_bang_f16(desc, a, b, c, stream); + return STATUS_SUCCESS; } - else if(ndim == 3){ - int strideS_f = up_stride[0]; - int strideD_f = gate_stride[0]; - int strideS_m = up_stride[1]; - int strideD_m = gate_stride[1]; - int middle = shape[1]; - int d = shape[ndim - 1]; - int n = num / d; - swigluUnionDim_3(queue, gate.data, up.data, strideS_f, strideS_m, strideD_f, strideD_m, n, middle, d); - } - else{ - int d = shape[ndim - 1]; - int n = num / d; - int *mlu_stride_gate, *mlu_stride_up, *mlu_shape; - CNRT_CHECK(cnrtMalloc((void **)&mlu_stride_gate, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_stride_up, ndim * sizeof(int))); - CNRT_CHECK(cnrtMalloc((void **)&mlu_shape, ndim * sizeof(int))); - CNRT_CHECK(cnrtMemcpy(mlu_stride_gate, gate_stride, ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_stride_up, up_stride, 
ndim * sizeof(int), cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpy(mlu_shape, shape, ndim * sizeof(int), cnrtMemcpyHostToDev)); - - - swiglu_fp16(queue, gate.data, up.data, mlu_stride_gate, mlu_stride_up, mlu_shape, n, d, ndim); - - CNRT_CHECK(cnrtFree(mlu_stride_gate)); - CNRT_CHECK(cnrtFree(mlu_stride_up)); - CNRT_CHECK(cnrtFree(mlu_shape)); - } - + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/swiglu/bang/swiglu_cnnl.cc b/src/ops/swiglu/bang/swiglu_cnnl.cc deleted file mode 100644 index 64f062b6..00000000 --- a/src/ops/swiglu/bang/swiglu_cnnl.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include "swiglu_cnnl.h" -#include "../../../devices/bang/common_bang.h" -#include "../../../devices/bang/handle_pool.h" -#include "../../utils.h" -#include "cnrt.h" - -SwigluBangDescriptor::SwigluBangDescriptor(Device device) { - this->device = device; - get_cnnl_pool(); -} - -void swiglu_cnnl_f16(SwigluBangDescriptor *descriptor, Tensor gate, Tensor up, void *stream) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); - - cnnlTensorDescriptor_t gateDesc, inDesc; - cnnlCreateTensorDescriptor(&gateDesc); - cnnlCreateTensorDescriptor(&inDesc); - - setCnnlTensor(gateDesc, gate.layout); - - std::vector dims(gate.layout->ndim); - size_t inputSizeInBytes = 1; - for (uint64_t i = 0; i < gate.layout->ndim; i++) { - dims[i] = static_cast(gate.layout->shape[i]); - inputSizeInBytes *= dims[i]; - } - dims[gate.layout->ndim - 1] *= 2; - inputSizeInBytes *= (2 * sizeof(uint16_t)); - cnnlSetTensorDescriptor(inDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, - dims.size(), dims.data()); - - void *input; - cnrtMalloc(&input, inputSizeInBytes); - - void *concatWorkspace; - - use_cnnl((cnrtQueue_t) stream, - [&](cnnlHandle_t handle) { - size_t concatWorkspaceSize; - cnnlGetConcatWorkspaceSize(handle, 2, &concatWorkspaceSize); - cnrtMalloc(&concatWorkspace, concatWorkspaceSize); - - cnnlTensorDescriptor_t inputsDesc[2] = {gateDesc, gateDesc}; - const void *const inputsData[2] = {gate.data, up.data}; - cnnlConcat(handle, 2, -1, inputsDesc, inputsData, - concatWorkspace, concatWorkspaceSize, inDesc, input); - - cnnlBiasActivationGluForward_v2(handle, descriptor->opDesc, inDesc, input, - nullptr, nullptr, gateDesc, gate.data); - }); - - cnrtFree(concatWorkspace); - cnrtFree(input); - - cnnlDestroyTensorDescriptor(gateDesc); - cnnlDestroyTensorDescriptor(inDesc); -} diff --git a/src/ops/swiglu/bang/swiglu_cnnl.h b/src/ops/swiglu/bang/swiglu_cnnl.h deleted file mode 100644 index f729c425..00000000 --- a/src/ops/swiglu/bang/swiglu_cnnl.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __CNNL_SWIGLU_H__ -#define __CNNL_SWIGLU_H__ - -#include "cnnl.h" -#include "cnnl_extra.h" -#include "operators.h" - -struct SwigluBangDescriptor { - Device device; - cnnlActivationDescriptor_t actDesc; - cnnlBiasActivationGluDescriptor_t opDesc; - - SwigluBangDescriptor(Device device); - void createCnnlDescriptors() { - cnnlCreateActivationDescriptor(&actDesc); - cnnlCreateBiasActivationGluDescriptor(&opDesc); - cnnlSetActivationDescriptor_v6(actDesc, CNNL_ACTIVATION_SILU, - CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, - 0.0, 0, 0.0, 0.0, true, true); - cnnlSetBiasActivationGluDescriptor(opDesc, actDesc, - CNNL_BIAS_ACTIVATION_GLU_ALGO_V2); - } - void destroyCnnlDescriptors() { - cnnlDestroyActivationDescriptor(actDesc); - cnnlDestroyBiasActivationGluDescriptor(opDesc); - } -}; - -void 
swiglu_cnnl_f16(SwigluBangDescriptor *descriptor, Tensor gate, Tensor up, void *stream); - -#endif// __CNNL_SWIGLU_H__ diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cc b/src/ops/swiglu/cpu/swiglu_cpu.cc index 899f0793..4e0fd574 100644 --- a/src/ops/swiglu/cpu/swiglu_cpu.cc +++ b/src/ops/swiglu/cpu/swiglu_cpu.cc @@ -3,30 +3,89 @@ #include "../../utils.h" #include -inline float sigmoid(float x) { - return 1.0f / (1.0f + expf(-x)); + +infiniopStatus_t cpuCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUCpuDescriptor{DevCpu, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; } -void swiglu_cpu_f16(Tensor gate, Tensor up) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); +inline float silu(float x) { + return x / (1.0f + expf(-x)); +} - auto seq_len = gate.layout->shape[0], - di = gate.layout->shape[1]; +void swiglu_cpu_f16(SwiGLUCpuDescriptor_t desc, void *c, void const *a, void const *b) { - auto stride_gate = gate.layout->strides[0], - stride_up = up.layout->strides[0]; + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; for (int i = 0; i < seq_len; ++i) { - auto gate_ = reinterpret_cast(gate.data) + i * stride_gate; - auto up_ = reinterpret_cast(up.data) + i * stride_up; + auto a_ = reinterpret_cast(a) + i * stride_a; + auto b_ = reinterpret_cast(b) + i * stride_b; + auto c_ = reinterpret_cast(c) + i * stride_c; for (int j = 0; j < di; ++j) { - auto x = f16_to_f32(gate_[j]); - auto y = f16_to_f32(up_[j]); + auto a__ = f16_to_f32(a_[j]); + auto b__ = f16_to_f32(b_[j]); - gate_[j] = f32_to_f16(x * sigmoid(x) * y); + c_[j] = f32_to_f16(a__ * silu(b__)); } } } + +infiniopStatus_t cpuSwiGLU(SwiGLUCpuDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + swiglu_cpu_f16(desc, c, a, b); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} + +infiniopStatus_t cpuDestroySwiGLUDescriptor(SwiGLUCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/cpu/swiglu_cpu.h b/src/ops/swiglu/cpu/swiglu_cpu.h index 7fd768e5..a853ccf8 100644 --- a/src/ops/swiglu/cpu/swiglu_cpu.h +++ b/src/ops/swiglu/cpu/swiglu_cpu.h @@ -3,10 +3,30 @@ #include "operators.h" -struct SwigluCpuDescriptor { +struct SwiGLUCpuDescriptor { Device device; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; }; -void 
swiglu_cpu_f16(Tensor gate, Tensor up); +typedef struct SwiGLUCpuDescriptor *SwiGLUCpuDescriptor_t; + +infiniopStatus_t cpuCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t cpuSwiGLU(SwiGLUCpuDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cpuDestroySwiGLUDescriptor(SwiGLUCpuDescriptor_t desc); #endif// __CPU_SWIGLU_H__ diff --git a/src/ops/swiglu/cuda/swiglu.cu b/src/ops/swiglu/cuda/swiglu.cu index aa55e63d..fdd3f16b 100644 --- a/src/ops/swiglu/cuda/swiglu.cu +++ b/src/ops/swiglu/cuda/swiglu.cu @@ -1,9 +1,10 @@ +#include "../../../devices/cuda/common_cuda.h" #include "../../utils.h" #include "swiglu.cuh" #include -static __forceinline__ __device__ float sigmoid(float x) { - return fdividef(1, 1 + expf(-x)); +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); } inline int gcd(int a, int b) { @@ -16,37 +17,54 @@ inline int gcd(int a, int b) { } template -static __global__ void swiglu( - Tdata *__restrict__ gate_, - int const stride_gate, - Tdata const *__restrict__ up_, - int const stride_up) { - auto i = blockIdx.y * stride_gate + blockIdx.x * blockDim.x + threadIdx.x, - j = blockIdx.y * stride_up + blockIdx.x * blockDim.x + threadIdx.x; - auto x = float(gate_[i]), - y = float(up_[j]); - gate_[i] = Tdata(x * sigmoid(x) * y); +static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); } -constexpr static int BLOCK_SIZE = 1024; +void swiglu_nv_gpu_f16(SwiGLUCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { -void swiglu_nv_gpu_f16(Tensor gate, Tensor up, void *stream) { - ASSERT_EQ(gate.layout->ndim, 2); - ASSERT_EQ(up.layout->ndim, 2); - ASSERT_EQ(gate.layout->shape[0], up.layout->shape[0]); - ASSERT_EQ(gate.layout->shape[1], up.layout->shape[1]); + auto seq_len = desc->seq_len, + di = desc->di; - auto seq_len = gate.layout->shape[0], - di = gate.layout->shape[1]; + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; - dim3 block_dims = gcd(BLOCK_SIZE, di); + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); dim3 grid_dims = dim3(di / block_dims.x, seq_len); - auto gate_ptr = reinterpret_cast(gate.data); - auto up_ptr = reinterpret_cast(up.data); + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); auto cuda_stream = reinterpret_cast(stream); swiglu<<>>( - gate_ptr, gate.layout->strides[0] / 2, up_ptr, up.layout->strides[0] / 2); + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + +infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + swiglu_nv_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/ops/swiglu/cuda/swiglu.cuh 
b/src/ops/swiglu/cuda/swiglu.cuh index 617ecff9..9b3bdcb5 100644 --- a/src/ops/swiglu/cuda/swiglu.cuh +++ b/src/ops/swiglu/cuda/swiglu.cuh @@ -1,12 +1,36 @@ -#ifndef __NV_GPU_SWIGLU_H__ -#define __NV_GPU_SWIGLU_H__ - +#ifndef __CUDA_SWIGLU_H__ +#define __CUDA_SWIGLU_H__ +#include "../../../devices/cuda/cuda_handle.h" +#include "../../utils.h" #include "operators.h" -struct SwigluCudaDescriptor { +struct SwiGLUCudaDescriptor { Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; }; -void swiglu_nv_gpu_f16(Tensor gate, Tensor up, void *stream); +typedef struct SwiGLUCudaDescriptor *SwiGLUCudaDescriptor_t; + +infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle, + SwiGLUCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t cudaSwiGLU(SwiGLUCudaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t cudaDestroySwiGLUDescriptor(SwiGLUCudaDescriptor_t desc); + +void swiglu_nv_gpu_f16(SwiGLUCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); #endif// __NV_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/cuda/swiglu_cuda.cc b/src/ops/swiglu/cuda/swiglu_cuda.cc new file mode 100644 index 00000000..16d70503 --- /dev/null +++ b/src/ops/swiglu/cuda/swiglu_cuda.cc @@ -0,0 +1,51 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "swiglu.cuh" + +infiniopStatus_t cudaCreateSwiGLUDescriptor(CudaHandle_t handle, + SwiGLUCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUCudaDescriptor{DevNvGpu, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroySwiGLUDescriptor(SwiGLUCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/maca/swiglu_maca.cc b/src/ops/swiglu/maca/swiglu_maca.cc new file mode 100644 index 00000000..71c2af70 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.cc @@ -0,0 +1,51 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "swiglu_maca.h" + +infiniopStatus_t macaCreateSwiGLUDescriptor(MacaHandle_t handle, + SwiGLUMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || 
b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUMacaDescriptor{DevMetaxGpu, + handle->device_id, + dtype, + seq_len, + di, + stride_a, + stride_b, + stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t macaDestroySwiGLUDescriptor(SwiGLUMacaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/maca/swiglu_maca.h b/src/ops/swiglu/maca/swiglu_maca.h new file mode 100644 index 00000000..3ea7c661 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.h @@ -0,0 +1,36 @@ +#ifndef __MACA_SWIGLU_H__ +#define __MACA_SWIGLU_H__ +#include "../../../devices/maca/maca_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct SwiGLUMacaDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUMacaDescriptor *SwiGLUMacaDescriptor_t; + +infiniopStatus_t macaCreateSwiGLUDescriptor(MacaHandle_t handle, + SwiGLUMacaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t macaSwiGLU(SwiGLUMacaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t macaDestroySwiGLUDescriptor(SwiGLUMacaDescriptor_t desc); + +void swiglu_mc_gpu_f16(SwiGLUMacaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); + +#endif// __MC_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/maca/swiglu_maca.maca b/src/ops/swiglu/maca/swiglu_maca.maca new file mode 100644 index 00000000..68692c04 --- /dev/null +++ b/src/ops/swiglu/maca/swiglu_maca.maca @@ -0,0 +1,70 @@ +#include "../../../devices/maca/common_maca.h" +#include "../../utils.h" +#include "swiglu_maca.h" +#include + +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); +} + +inline int gcd(int a, int b) { + while (b != 0) { + int rem = a % b; + a = b; + b = rem; + } + return a; +} + +template +static __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); +} + +void swiglu_mc_gpu_f16(SwiGLUMacaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; + + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); + dim3 grid_dims = dim3(di / block_dims.x, seq_len); + + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); + + auto maca_stream = reinterpret_cast(stream); + + swiglu<<>>( + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + 
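All of the per-device kernels touched by this patch (CPU, CUDA, BANG, Ascend, MACA, MUSA) sit behind the same create/compute/destroy entry points dispatched in `src/ops/swiglu/operator.cc` further below. The following is a minimal sketch of how a caller drives them, assuming a valid `infiniopHandle_t` already exists, that `a`, `b`, `c` point to `seq_len × di` half-precision buffers on the target device, and that `F16` is the `DataLayout` constant from `data_type.h` (its definition is not shown in this hunk); include paths are the ones used inside the repository.

```c
#include "ops/swiglu/swiglu.h"        /* infiniopCreateSwiGLUDescriptor, infiniopSwiGLU, ... */
#include "tensor/tensor_descriptor.h" /* infiniopCreateTensorDescriptor, ... */
#include <stddef.h>
#include <stdint.h>

infiniopStatus_t run_swiglu(infiniopHandle_t handle,
                            void *c, void const *a, void const *b,
                            uint64_t seq_len, uint64_t di, void *stream) {
    uint64_t shape[2] = {seq_len, di};
    infiniopTensorDescriptor_t a_desc, b_desc, c_desc;

    /* NULL strides ask the library to fill in contiguous row-major strides. */
    infiniopCreateTensorDescriptor(&a_desc, 2, shape, NULL, F16);
    infiniopCreateTensorDescriptor(&b_desc, 2, shape, NULL, F16);
    infiniopCreateTensorDescriptor(&c_desc, 2, shape, NULL, F16);

    infiniopSwiGLUDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateSwiGLUDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status == STATUS_SUCCESS) {
        /* Computes c = a * silu(b); the CPU backend ignores stream. */
        status = infiniopSwiGLU(desc, c, a, b, stream);
        infiniopDestroySwiGLUDescriptor(desc);
    }

    infiniopDestroyTensorDescriptor(a_desc);
    infiniopDestroyTensorDescriptor(b_desc);
    infiniopDestroyTensorDescriptor(c_desc);
    return status;
}
```

The contiguous default strides satisfy the `strides[1] == 1` check that every backend's create function performs, and `strides[0]` becomes the per-row stride stored in the descriptor; for GPU backends `stream` is the device queue or stream handle passed straight through to the kernel launch.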
+infiniopStatus_t macaSwiGLU(SwiGLUMacaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + checkMacaError(hcSetDevice(desc->device_id)); + + if (dtype_eq(desc->dtype, F16)) { + swiglu_mc_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/musa/swiglu.mu b/src/ops/swiglu/musa/swiglu.mu new file mode 100644 index 00000000..259e5c6f --- /dev/null +++ b/src/ops/swiglu/musa/swiglu.mu @@ -0,0 +1,68 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "swiglu_musa.h" +#include + +static __forceinline__ __device__ float silu(float x) { + return x * fdividef(1, 1 + expf(-x)); +} + +inline int gcd(int a, int b) { + while (b != 0) { + int rem = a % b; + a = b; + b = rem; + } + return a; +} + +template +static __global__ void swiglu( + Tdata *__restrict__ c, + int const stride_c, + Tdata const *__restrict__ a, + int const stride_a, + Tdata const *__restrict__ b, + int const stride_b) { + auto i = blockIdx.y * stride_b + blockIdx.x * blockDim.x + threadIdx.x, + j = blockIdx.y * stride_a + blockIdx.x * blockDim.x + threadIdx.x, + k = blockIdx.y * stride_c + blockIdx.x * blockDim.x + threadIdx.x; + auto x = float(b[i]), + y = float(a[j]); + c[k] = Tdata(silu(x) * y); +} + +void swiglu_mt_gpu_f16(SwiGLUMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + + auto seq_len = desc->seq_len, + di = desc->di; + + auto stride_a = desc->stride_a, + stride_b = desc->stride_b, + stride_c = desc->stride_c; + + dim3 block_dims = gcd(MAX_THREADS_PER_BLOCK, di); + dim3 grid_dims = dim3(di / block_dims.x, seq_len); + + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + auto c_ptr = reinterpret_cast(c); + + auto musa_stream = reinterpret_cast(stream); + + swiglu<<>>( + c_ptr, stride_c, a_ptr, stride_a, b_ptr, stride_b); +} + +infiniopStatus_t musaSwiGLU(SwiGLUMusaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + if (dtype_eq(desc->dtype, F16)) { + swiglu_mt_gpu_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/swiglu/musa/swiglu_musa.cc b/src/ops/swiglu/musa/swiglu_musa.cc new file mode 100644 index 00000000..a1d5719b --- /dev/null +++ b/src/ops/swiglu/musa/swiglu_musa.cc @@ -0,0 +1,50 @@ +#include "../../../devices/musa/common_musa.h" +#include "../../utils.h" +#include "swiglu_musa.h" + +infiniopStatus_t musaCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + if (c_desc->ndim != 2 || a_desc->ndim != 2 || b_desc->ndim != 2) { + return STATUS_BAD_TENSOR_SHAPE; + } + + DT dtype = c_desc->dt; + + if (!dtype_eq(dtype, F16)) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (a_desc->strides[1] != 1 || b_desc->strides[1] != 1 || c_desc->strides[1] != 1) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t seq_len = c_desc->shape[0], + di = c_desc->shape[1]; + + uint64_t stride_a = a_desc->strides[0], + stride_b = b_desc->strides[0], + stride_c = c_desc->strides[0]; + + + if (a_desc->shape[0] != seq_len || a_desc->shape[1] != di || !dtype_eq(a_desc->dt, dtype) || + b_desc->shape[0] != seq_len || b_desc->shape[1] != di || !dtype_eq(b_desc->dt, dtype)) { + return STATUS_BAD_PARAM; + } + + *desc_ptr = new SwiGLUMusaDescriptor{DevMthreadsGpu, + dtype, + seq_len, + di, + stride_a, + stride_b, + 
stride_c}; + return STATUS_SUCCESS; +} + +infiniopStatus_t musaDestroySwiGLUDescriptor(SwiGLUMusaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/swiglu/musa/swiglu_musa.h b/src/ops/swiglu/musa/swiglu_musa.h new file mode 100644 index 00000000..00ae1155 --- /dev/null +++ b/src/ops/swiglu/musa/swiglu_musa.h @@ -0,0 +1,34 @@ +#ifndef __MUSA_SWIGLU_H__ +#define __MUSA_SWIGLU_H__ + +#include "operators.h" + +struct SwiGLUMusaDescriptor { + Device device; + DT dtype; + uint64_t seq_len; + uint64_t di; + uint64_t stride_a; + uint64_t stride_b; + uint64_t stride_c; +}; + +typedef struct SwiGLUMusaDescriptor *SwiGLUMusaDescriptor_t; + +infiniopStatus_t musaCreateSwiGLUDescriptor(infiniopHandle_t handle, + SwiGLUMusaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_dec, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +infiniopStatus_t musaSwiGLU(SwiGLUMusaDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +infiniopStatus_t musaDestroySwiGLUDescriptor(SwiGLUMusaDescriptor_t desc); + +void swiglu_mt_gpu_f16(SwiGLUMusaDescriptor_t desc, void *c, void const *a, void const *b, void *stream); + +#endif// __MT_GPU_SWIGLU_H__ diff --git a/src/ops/swiglu/operator.cc b/src/ops/swiglu/operator.cc index 8f351242..3ea0bedc 100644 --- a/src/ops/swiglu/operator.cc +++ b/src/ops/swiglu/operator.cc @@ -1,4 +1,5 @@ #include "../utils.h" +#include "operators.h" #include "ops/swiglu/swiglu.h" #ifdef ENABLE_CPU @@ -9,80 +10,127 @@ #endif #ifdef ENABLE_CAMBRICON_MLU #include "bang/swiglu_bang.h" -#include "bang/swiglu_cnnl.h" +#endif +#ifdef ENABLE_ASCEND_NPU +#include "ascend/swiglu.h" +#endif +#ifdef ENABLE_METAX_GPU +#include "maca/swiglu_maca.h" +#endif +#ifdef ENABLE_MTHREADS_GPU +#include "musa/swiglu_musa.h" #endif -struct SwigluDescriptor { - Device device; -}; - -__C void *createSwigluDescriptor(Device device, void *config) { - switch (device) { +__C infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, + infiniopSwiGLUDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + switch (handle->device) { #ifdef ENABLE_CPU - case DevCpu: - return (SwigluDescriptor *) (new SwigluCpuDescriptor{device}); + case DevCpu: + return cpuCreateSwiGLUDescriptor(handle, (SwiGLUCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif #ifdef ENABLE_NV_GPU - case DevNvGpu: - return (SwigluDescriptor *) (new SwigluCudaDescriptor{device}); + case DevNvGpu: + return cudaCreateSwiGLUDescriptor((CudaHandle_t) handle, (SwiGLUCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: { - auto bangDescriptor = new SwigluBangDescriptor(device); - bangDescriptor->createCnnlDescriptors(); - return (SwigluDescriptor *) (bangDescriptor); - } + case DevCambriconMlu: { + return bangCreateSwiGLUDescriptor((BangHandle_t) handle, + (SwiGLUBangDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendCreateSwiGLUDescriptor((AscendHandle_t) handle, + (SwiGLUAscendDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: { + return macaCreateSwiGLUDescriptor((MacaHandle_t) handle, + (SwiGLUMacaDescriptor_t *) desc_ptr, + c_desc, + a_desc, + b_desc); + } +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaCreateSwiGLUDescriptor(handle, 
(SwiGLUMusaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); #endif - default: - PANIC(UnsupportedDevice); } - return nullptr; + return STATUS_BAD_DEVICE; }; -__C void destroySwigluDescriptor(SwigluDescriptor *descriptor) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - delete (SwigluCpuDescriptor *) (descriptor); - break; + return cpuSwiGLU((SwiGLUCpuDescriptor_t) desc, c, a, b, stream); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - delete (SwigluCudaDescriptor *) (descriptor); - break; + return cudaSwiGLU((SwiGLUCudaDescriptor_t) desc, c, a, b, stream); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { - auto bangDescriptor = (SwigluBangDescriptor *) (descriptor); - bangDescriptor->destroyCnnlDescriptors(); - delete bangDescriptor; - break; + return bangSwiGLU((SwiGLUBangDescriptor_t) desc, c, a, b, stream); } #endif - default: - PANIC(UnsupportedDevice); +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: + return macaSwiGLU((SwiGLUMacaDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaSwiGLU((SwiGLUMusaDescriptor_t) desc, c, a, b, stream); +#endif } + return STATUS_BAD_DEVICE; } -__C void swiglu(SwigluDescriptor *descriptor, Tensor gate, Tensor up, void *stream) { - switch (descriptor->device) { +__C infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { + switch (desc->device) { #ifdef ENABLE_CPU case DevCpu: - swiglu_cpu_f16(gate, up); - break; + return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t) desc); #endif #ifdef ENABLE_NV_GPU case DevNvGpu: - swiglu_nv_gpu_f16(gate, up, stream); - break; + return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t) desc); #endif #ifdef ENABLE_CAMBRICON_MLU - case DevCambriconMlu: - // swiglu_cnnl_f16((SwigluBangDescriptor *) (descriptor), gate, up, stream); - swiglu_bang_f16(gate, up, stream); - break; + case DevCambriconMlu: { + return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t) desc); + } +#endif +#ifdef ENABLE_ASCEND_NPU + case DevAscendNpu: + return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc); +#endif +#ifdef ENABLE_METAX_GPU + case DevMetaxGpu: + return macaDestroySwiGLUDescriptor((SwiGLUMacaDescriptor_t) desc); +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: + return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t) desc); #endif - default: - PANIC(UnsupportedDevice); } -}; + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/utils.h b/src/ops/utils.h index 01b5e81f..b48cf419 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -1,8 +1,14 @@ #ifndef __UTILS_H__ #define __UTILS_H__ +#include "data_type.h" +#include "tensor.h" +#include +#include +#include #include #include +#include /* This file contains some useful macros and helper functions */ @@ -23,4 +29,225 @@ inline void assert_true(int expr, const char *msg, const char *file, int line) { exit(EXIT_FAILURE) #define ROUND_UP_DIV(x, y) ((x + y - 1) / y) + +#define CHECK_ERROR(call, target, errCode) \ + do { \ + if (auto value = (call); value == (target)) { \ + std::cerr << "Error: expected " << (target) \ + << " but got " << value \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return 
(errCode); \ + } \ + } while (0) + +#define CREATE_CHECK_ERROR(expr, value, target, errCode) \ + expr; \ + CHECK_ERROR(value, target, errCode) + +#define CHECK_STATUS(call, target) \ + do { \ + if (auto value = (call); value != (target)) { \ + std::cerr << "Error: expected " << (target) \ + << " but got " << value \ + << " in file " << __FILE__ \ + << ", function " << __func__ \ + << ", line " << __LINE__ << std::endl; \ + return value; \ + } \ + } while (0) + +// check if two data layouts (types) are equal +inline bool dtype_eq(DataLayout a, DataLayout b) { + union TypePun { + DataLayout layout; + int i; + } pun; + pun.layout = a; + auto a_ = pun.i; + pun.layout = b; + auto b_ = pun.i; + return a_ == b_; +} + +inline std::vector get_byte_strides(infiniopTensorDescriptor_t desc) { + int64_t dsize = desc->dt.size; + std::vector strides(desc->ndim); + for (uint64_t i = 0; i < desc->ndim; i++) { + strides[i] = dsize * desc->strides[i]; + } + + return strides; +} + +// calculate the broadcasted shape for two tensors +inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, + const uint64_t *shape2, uint64_t ndim2, + uint64_t *broadcast_shape, uint64_t *padded_shape1, + uint64_t *padded_shape2, uint64_t max_rank) { + // prepending and initializing + std::fill(padded_shape1, padded_shape1 + max_rank, 1); + std::fill(padded_shape2, padded_shape2 + max_rank, 1); + std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1); + std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2); + + // compute broadcasted shape + for (size_t i = 0; i < max_rank; ++i) { + if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) { + broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]); + } else { + return false; + } + } + + return true; +} + +// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c, + uint64_t broadcast_ndim) { + std::vector + broadcast_shape_(broadcast_ndim), + padded_shape1_(broadcast_ndim), + padded_shape2_(broadcast_ndim); + auto broadcast_shape = broadcast_shape_.data(), + padded_shape1 = padded_shape1_.data(), + padded_shape2 = padded_shape2_.data(); + if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) { + return false; + } + return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape); +} + +// check if the shape of tensor src can be validly broadcasted to that of the tensor dst +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) { + if (dst->ndim < src->ndim) { + return false; + } + std::vector padded_shape_(dst->ndim); + auto padded_shape = padded_shape_.data(); + std::fill(padded_shape, padded_shape + dst->ndim, 1); + std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim); + for (size_t i = 0; i < dst->ndim; ++i) { + if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) { + return false; + } + } + return true; +} + +// check if the shape of tensor c is valid after broadcasting tensors a and b +inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) { + return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim)); +} + +inline uint64_t 
get_byte_size(infiniopTensorDescriptor_t desc) { + uint64_t dsize = desc->dt.size; + uint64_t size = 1; + for (uint64_t i = 0; i < desc->ndim; i++) { + size *= desc->shape[i]; + } + return size * dsize; +} + +// permute the dimensions of a tensor descriptor +inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector &order) { + uint64_t ndim = desc->ndim; + if (order.size() != ndim) { + return nullptr; + } + uint64_t *shape = new uint64_t[ndim]; + int64_t *strides = new int64_t[ndim]; + for (uint64_t i = 0; i < ndim; i++) { + if (std::find(order.begin(), order.end(), i) == order.end()) { + return nullptr; + } + shape[i] = desc->shape[order[i]]; + strides[i] = desc->strides[order[i]]; + } + return new TensorDescriptor{ + desc->dt, ndim, shape, strides}; +} + +// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous +inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, uint64_t dim_start, uint64_t dim_end) { + for (size_t i = dim_start + 1; i <= dim_end; i++) { + if (desc->strides[i - 1] != static_cast(desc->shape[i]) * desc->strides[i]) { + return false; + } + } + return true; +} + +inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) { + if (desc->ndim == 0) { + return true; + } + return is_contiguous(desc, 0, desc->ndim - 1); +} + +// merge the dimensions [dim_start, dim_end] of a tensor descriptor +inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uint64_t dim_start, uint64_t dim_end) { + uint64_t ndim = desc->ndim; + if (dim_start > dim_end || dim_end >= ndim) { + return nullptr; + } + + uint64_t new_ndim = ndim - (dim_end - dim_start); + uint64_t *new_shape = new uint64_t[new_ndim]; + int64_t *new_strides = new int64_t[new_ndim]; + uint64_t index = 0; + for (size_t i = 0; i < dim_start; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + if (!is_contiguous(desc, dim_start, dim_end)) { + return nullptr; + } + new_shape[index] = 1; + for (size_t i = dim_start; i <= dim_end; i++) { + new_shape[index] *= desc->shape[i]; + } + new_strides[index] = desc->strides[dim_end]; + index++; + for (size_t i = dim_end + 1; i < ndim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + return new TensorDescriptor{ + desc->dt, new_ndim, new_shape, new_strides}; +} + +// split the dimension dim of a tensor descriptor into multiple dimensions +inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector &dims) { + uint64_t ndim = desc->ndim; + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) { + return nullptr; + } + uint64_t new_ndim = ndim + dims.size() - 1; + uint64_t *new_shape = new uint64_t[new_ndim]; + int64_t *new_strides = new int64_t[new_ndim]; + uint64_t index = 0; + for (size_t i = 0; i < dim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + for (size_t i = 0; i < dims.size(); i++) { + new_shape[index] = dims[i]; + new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies()); + index++; + } + for (size_t i = dim + 1; i < ndim; i++) { + new_shape[index] = desc->shape[i]; + new_strides[index] = desc->strides[i]; + index++; + } + return new TensorDescriptor{ + desc->dt, new_ndim, new_shape, new_strides}; +} + #endif// __UTILS_H__ diff --git 
a/src/tensor/tensor_descriptor.cc b/src/tensor/tensor_descriptor.cc index a6397206..57afe92d 100644 --- a/src/tensor/tensor_descriptor.cc +++ b/src/tensor/tensor_descriptor.cc @@ -1,16 +1,26 @@ #include "tensor/tensor_descriptor.h" #include -__C __export void createTensorDescriptor(TensorDescriptor* desc_ptr, uint64_t ndim, uint64_t *shape_, int64_t *strides_, DataLayout datatype) { +__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, uint64_t ndim, uint64_t const *shape_, int64_t const *strides_, DataLayout datatype) { uint64_t *shape = new uint64_t[ndim]; int64_t *strides = new int64_t[ndim]; std::memcpy(shape, shape_, ndim * sizeof(uint64_t)); - std::memcpy(strides, strides_, ndim * sizeof(int64_t)); - *desc_ptr = new TensorLayout{datatype, ndim, shape, strides}; + if (strides_) { + std::memcpy(strides, strides_, ndim * sizeof(int64_t)); + } else { + int64_t dsize = 1; + for (int i = ndim - 1; i >= 0; i--) { + strides[i] = dsize; + dsize *= shape[i]; + } + } + *desc_ptr = new TensorDescriptor{datatype, ndim, shape, strides}; + return STATUS_SUCCESS; } -__C __export void destroyTensorDescriptor(TensorDescriptor desc){ +__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc) { delete[] desc->shape; delete[] desc->strides; delete desc; + return STATUS_SUCCESS; } diff --git a/xmake.lua b/xmake.lua index e508eae4..f9e6f3dc 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,4 +1,8 @@ add_rules("mode.debug", "mode.release") +-- Define color codes +local GREEN = '\27[0;32m' +local YELLOW = '\27[1;33m' +local NC = '\27[0m' -- No Color add_includedirs("include") @@ -9,6 +13,12 @@ option("cpu") add_defines("ENABLE_CPU") option_end() +option("omp") + set_default(false) + set_showmenu(true) + set_description("Enable or disable OpenMP support for cpu kernel") +option_end() + option("nv-gpu") set_default(false) set_showmenu(true) @@ -23,6 +33,36 @@ option("cambricon-mlu") add_defines("ENABLE_CAMBRICON_MLU") option_end() +option("ascend-npu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Ascend NPU kernel") + add_defines("ENABLE_ASCEND_NPU") +option_end() + +option("metax-gpu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Metax GPU kernel") + add_defines("ENABLE_METAX_GPU") +option_end() + + +option("mthreads-gpu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable MThreads GPU kernel") + add_defines("ENABLE_MTHREADS_GPU") +option_end() + +option("sugon-dcu") + set_default(false) + set_showmenu(true) + set_description("Enable or disable Sugon DCU kernel") + add_defines("ENABLE_SUGON_DCU") + add_defines("ENABLE_NV_GPU") +option_end() + if is_mode("debug") then add_cxflags("-g -O0") add_defines("DEBUG_MODE") @@ -32,6 +72,7 @@ if has_config("cpu") then add_defines("ENABLE_CPU") target("cpu") + on_install(function (target) end) set_kind("static") if not is_plat("windows") then @@ -40,32 +81,52 @@ if has_config("cpu") then set_languages("cxx17") add_files("src/devices/cpu/*.cc", "src/ops/*/cpu/*.cc") - add_cxflags("-fopenmp") - add_ldflags("-fopenmp") + if has_config("omp") then + add_cxflags("-fopenmp") + add_ldflags("-fopenmp") + end target_end() end -if has_config("nv-gpu") then - +if has_config("nv-gpu", "sugon-dcu") then add_defines("ENABLE_NV_GPU") + if has_config("sugon-dcu") then + add_defines("ENABLE_SUGON_DCU") + end + local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH") + local 
CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") + if CUDA_ROOT ~= nil then + add_includedirs(CUDA_ROOT .. "/include") + end + if CUDNN_ROOT ~= nil then + add_includedirs(CUDNN_ROOT .. "/include") + end + target("nv-gpu") set_kind("static") + on_install(function (target) end) set_policy("build.cuda.devlink", true) set_toolchains("cuda") add_links("cublas") + add_links("cudnn") add_cugencodes("native") if is_plat("windows") then add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "\\lib\\x64") + end else add_cuflags("-Xcompiler=-fPIC") add_culdflags("-Xcompiler=-fPIC") + add_cxxflags("-fPIC") end set_languages("cxx17") add_files("src/devices/cuda/*.cc", "src/ops/*/cuda/*.cu") + add_files("src/ops/*/cuda/*.cc") target_end() end @@ -96,7 +157,7 @@ if has_config("cambricon-mlu") then local includedirs = table.concat(target:get("includedirs"), " ") local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/neuware/include", "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"} - + for _, includedir in ipairs(target:get("includedirs")) do table.insert(args, "-I" .. includedir) end @@ -105,11 +166,11 @@ if has_config("cambricon-mlu") then table.insert(target:objectfiles(), objectfile) end) -rule_end() - + rule_end() target("cambricon-mlu") set_kind("static") + on_install(function (target) end) set_languages("cxx17") add_files("src/devices/bang/*.cc", "src/ops/*/bang/*.cc") add_files("src/ops/*/bang/*.mlu", {rule = "mlu"}) @@ -118,7 +179,162 @@ rule_end() end -target("operators") +if has_config("mthreads-gpu") then + + add_defines("ENABLE_MTHREADS_GPU") + local musa_home = os.getenv("MUSA_INSTALL_PATH") + -- Add include dirs + add_includedirs(musa_home .. "/include") + -- Add shared lib + add_linkdirs(musa_home .. "/lib") + add_links("libmusa.so") + add_links("libmusart.so") + add_links("libmudnn.so") + add_links("libmublas.so") + + rule("mu") + set_extensions(".mu") + on_load(function (target) + target:add("includedirs", "include") + end) + + on_build_file(function (target, sourcefile) + local objectfile = target:objectfile(sourcefile) + os.mkdir(path.directory(objectfile)) + + local mcc = "/usr/local/musa/bin/mcc" + local includedirs = table.concat(target:get("includedirs"), " ") + local args = {"-c", sourcefile, "-o", objectfile, "-I/usr/local/musa/include", "-O3", "-fPIC", "-Wall", "-std=c++17", "-pthread"} + for _, includedir in ipairs(target:get("includedirs")) do + table.insert(args, "-I" .. includedir) + end + + os.execv(mcc, args) + table.insert(target:objectfiles(), objectfile) + end) + rule_end() + + target("mthreads-gpu") + set_kind("static") + set_languages("cxx17") + add_files("src/devices/musa/*.cc", "src/ops/*/musa/*.cc") + add_files("src/ops/*/musa/*.mu", {rule = "mu"}) + add_cxflags("-lstdc++ -Wall -fPIC") + target_end() + +end + +if has_config("ascend-npu") then + + add_defines("ENABLE_ASCEND_NPU") + local ASCEND_HOME = os.getenv("ASCEND_HOME") + local SOC_VERSION = os.getenv("SOC_VERSION") + + -- Add include dirs + add_includedirs(ASCEND_HOME .. "/include") + add_includedirs(ASCEND_HOME .. "/include/aclnn") + add_linkdirs(ASCEND_HOME .. "/lib64") + add_links("libascendcl.so") + add_links("libnnopbase.so") + add_links("libopapi.so") + add_links("libruntime.so") + add_linkdirs(ASCEND_HOME .. 
"/../../driver/lib64/driver") + add_links("libascend_hal.so") + local builddir = string.format( + "%s/build/%s/%s/%s", + os.projectdir(), + get_config("plat"), + get_config("arch"), + get_config("mode") + ) + rule("ascend-kernels") + before_link(function () + local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") + os.cd(ascend_build_dir) + os.exec("make") + os.exec("cp $(projectdir)/src/devices/ascend/build/lib/libascend_kernels.a "..builddir.."/") + os.cd(os.projectdir()) + + end) + after_clean(function () + local ascend_build_dir = path.join(os.projectdir(), "src/devices/ascend") + os.cd(ascend_build_dir) + os.exec("make clean") + os.cd(os.projectdir()) + os.rm(builddir.. "/libascend_kernels.a") + + end) + rule_end() + + target("ascend-npu") + -- Other configs + set_kind("static") + set_languages("cxx17") + on_install(function (target) end) + -- Add files + add_files("src/devices/ascend/*.cc", "src/ops/*/ascend/*.cc") + add_cxflags("-lstdc++ -Wall -Werror -fPIC") + + -- Add operator + add_rules("ascend-kernels") + add_links(builddir.."/libascend_kernels.a") + + target_end() +end + +if has_config("metax-gpu") then + + add_defines("ENABLE_METAX_GPU") + local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT") + + add_includedirs(MACA_ROOT .. "/include") + add_linkdirs(MACA_ROOT .. "/lib") + -- add_linkdirs(MACA_ROOT .. "htgpu_llvm/lib") + add_links("libhcdnn.so") + add_links("libhcblas.so") + add_links("libhcruntime.so") + + rule("maca") + set_extensions(".maca") + + on_load(function (target) + target:add("includedirs", "include") + end) + + on_build_file(function (target, sourcefile) + local objectfile = target:objectfile(sourcefile) + os.mkdir(path.directory(objectfile)) + local htcc = "/opt/hpcc/htgpu_llvm/bin/htcc" + + local includedirs = table.concat(target:get("includedirs"), " ") + local args = { "-x", "hpcc", "-c", sourcefile, "-o", objectfile, "-I/opt/hpcc/include", "-O3", "-fPIC", "-Werror", "-std=c++17"} + + for _, includedir in ipairs(target:get("includedirs")) do + table.insert(args, "-I" .. 
includedir) + end + + os.execv(htcc, args) + table.insert(target:objectfiles(), objectfile) + end) + rule_end() + + target("metax-gpu") + set_kind("static") + on_install(function (target) end) + set_languages("cxx17") + add_files("src/devices/maca/*.cc", "src/ops/*/maca/*.cc") + add_files("src/ops/*/maca/*.maca", {rule = "maca"}) + add_cxflags("-lstdc++ -Werror -fPIC") + target_end() + +end + + +toolchain("sugon-dcu-linker") + set_toolset("sh", "nvcc") +toolchain_end() + +target("infiniop") set_kind("shared") if has_config("cpu") then @@ -127,44 +343,41 @@ target("operators") if has_config("nv-gpu") then add_deps("nv-gpu") end + if has_config("sugon-dcu") then + local builddir = string.format( + "build/%s/%s/%s", + get_config("plat"), + get_config("arch"), + get_config("mode") + ) + add_shflags("-s", "-shared", "-fPIC") + add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl") + -- Using -lnv-gpu will fail, manually link the target using full path + add_deps("nv-gpu", {inherit = false}) + add_links(builddir.."/libnv-gpu.a") + set_toolchains("sugon-dcu-linker") + end + if has_config("cambricon-mlu") then add_deps("cambricon-mlu") end + if has_config("ascend-npu") then + add_deps("ascend-npu") + end + if has_config("metax-gpu") then + add_deps("metax-gpu") + end + if has_config("mthreads-gpu") then + add_deps("mthreads-gpu") + end set_languages("cxx17") + add_files("src/devices/handle.cc") add_files("src/ops/*/operator.cc") add_files("src/tensor/*.cc") -target_end() + after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end) -target("main") - set_kind("binary") - add_deps("operators") + set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) + add_installfiles("include/(**/*.h)", {prefixdir = "include"}) + add_installfiles("include/*.h", {prefixdir = "include"}) - set_languages("c11") - add_files("src/main.c") target_end() - -task("install-operators") - set_menu { - usage = "xmake install-operators", - description = "Build and install the operators", - options = {} - } - on_run(function () - os.exec("xmake --root") - os.exec("mkdir -p $(projectdir)/lib/") - os.exec("cp $(projectdir)/build/linux/x86_64/release/liboperators.so $(projectdir)/lib/") - os.exec("cp -r $(projectdir)/include $(projectdir)/lib/") - -- Define color codes - local GREEN = '\27[0;32m' - local YELLOW = '\27[1;33m' - local NC = '\27[0m' -- No Color - - -- Get the current directory - local current_dir = os.curdir() - - -- Output messages with colors - os.exec("echo -e '" .. GREEN .. "Compilation completed successfully." .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "To set the environment variable, please run the following command:" .. NC .. "'") - os.exec("echo -e '" .. YELLOW .. "echo \"export INFINI_ROOT=" .. current_dir .. "/lib\" >> ~/.bashrc" .. NC .. "'") - - end)
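The tensor-descriptor change above (default strides when `strides_` is `NULL`) is easiest to see from the caller's side. The following C sketch is illustrative only and not part of the diff: it follows the argument order implemented in `src/tensor/tensor_descriptor.cc` above, and it assumes that `F32` is one of the predefined `DataLayout` constants in `data_type.h`, that `STATUS_SUCCESS` is visible through the public headers, and that the header is included as `tensor/tensor_descriptor.h` from the tree installed under `INFINI_ROOT`.

```C
#include "tensor/tensor_descriptor.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* A 2x3x4 tensor; the shape values are illustrative. */
    uint64_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;

    /* Argument order as implemented above: (desc_ptr, ndim, shape, strides, datatype).
       Passing NULL for the strides makes the library fill in contiguous row-major
       strides, i.e. {12, 4, 1} for this shape. */
    if (infiniopCreateTensorDescriptor(&desc, 3, shape, NULL, F32) != STATUS_SUCCESS) {
        fprintf(stderr, "failed to create tensor descriptor\n");
        return 1;
    }

    /* ... hand desc to the operator's descriptor-creation call ... */

    infiniopDestroyTensorDescriptor(desc);
    return 0;
}
```

The NULL-strides path covers the common contiguous case, so callers only need to pass explicit strides for non-contiguous layouts.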