diff --git a/include/infinicore/ops/hypot.hpp b/include/infinicore/ops/hypot.hpp
new file mode 100644
index 000000000..0608f9c08
--- /dev/null
+++ b/include/infinicore/ops/hypot.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Hypot {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input_a, Tensor input_b);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor hypot(Tensor input_a, Tensor input_b);
+
+void hypot_(Tensor output, Tensor input_a, Tensor input_b);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/index_add.hpp b/include/infinicore/ops/index_add.hpp
new file mode 100644
index 000000000..14c0f72c5
--- /dev/null
+++ b/include/infinicore/ops/index_add.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class IndexAdd {
+public:
+    using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor, float);
+
+    static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor index_add(Tensor input, int64_t dim, Tensor index, Tensor source, float alpha = 1.0f);
+void index_add_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/index_copy.hpp b/include/infinicore/ops/index_copy.hpp
new file mode 100644
index 000000000..e6b862a5f
--- /dev/null
+++ b/include/infinicore/ops/index_copy.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class IndexCopy {
+public:
+    using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor index_copy(Tensor input, int64_t dim, Tensor index, Tensor source);
+void index_copy_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/smooth_l1_loss.hpp b/include/infinicore/ops/smooth_l1_loss.hpp
new file mode 100644
index 000000000..329d1ce75
--- /dev/null
+++ b/include/infinicore/ops/smooth_l1_loss.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class SmoothL1Loss {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, float, int64_t);
+
+    static void execute(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// reduction: 0 = none, 1 = mean, 2 = sum
+Tensor smooth_l1_loss(Tensor input, Tensor target, float beta = 1.0f, int64_t reduction = 1);
+void smooth_l1_loss_(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/take.hpp b/include/infinicore/ops/take.hpp
new file mode 100644
index 000000000..7c8fd6683
--- /dev/null
+++ b/include/infinicore/ops/take.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Take {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input, Tensor indices);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor take(Tensor input, Tensor indices);
+
+void take_(Tensor output, Tensor input, Tensor indices);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..35b201db3 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -10,6 +10,9 @@
 #include "infiniop/ops/dequantize_awq.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/hypot.h"
+#include "infiniop/ops/index_add.h"
+#include "infiniop/ops/index_copy.h"
 #include "infiniop/ops/layer_norm.h"
 #include "infiniop/ops/logsoftmax.h"
 #include "infiniop/ops/lp_norm.h"
@@ -22,11 +25,13 @@
 #include "infiniop/ops/rope.h"
 #include "infiniop/ops/sigmoid.h"
 #include "infiniop/ops/silu.h"
+#include "infiniop/ops/smooth_l1_loss.h"
 #include "infiniop/ops/softmax.h"
 #include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
+#include "infiniop/ops/take.h"
 #include "infiniop/ops/tanh.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
diff --git a/include/infiniop/ops/hypot.h b/include/infiniop/ops/hypot.h
new file mode 100644
index 000000000..5f210194b
--- /dev/null
+++ b/include/infiniop/ops/hypot.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_HYPOT_API_H__
+#define __INFINIOP_HYPOT_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopHypotDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateHypotDescriptor(infiniopHandle_t handle,
+                                                          infiniopHypotDescriptor_t *desc_ptr,
+                                                          infiniopTensorDescriptor_t output,
+                                                          infiniopTensorDescriptor_t input_a,
+                                                          infiniopTensorDescriptor_t input_b);
+
+__C __export infiniStatus_t infiniopGetHypotWorkspaceSize(infiniopHypotDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopHypot(infiniopHypotDescriptor_t desc,
+                                          void *workspace,
+                                          size_t workspace_size,
+                                          void *output,
+                                          const void *input_a,
+                                          const void *input_b,
+                                          void *stream);
+
+__C __export infiniStatus_t infiniopDestroyHypotDescriptor(infiniopHypotDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/index_add.h b/include/infiniop/ops/index_add.h
new file mode 100644
index 000000000..6f574d21d
--- /dev/null
+++ b/include/infiniop/ops/index_add.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_INDEX_ADD_API_H__
+#define __INFINIOP_INDEX_ADD_API_H__
+
+#include <stdint.h>
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopIndexAddDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateIndexAddDescriptor(infiniopHandle_t handle,
+                                                             infiniopIndexAddDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t output,
+                                                             infiniopTensorDescriptor_t input,
+                                                             int64_t dim,
+                                                             infiniopTensorDescriptor_t index,
+                                                             infiniopTensorDescriptor_t source,
+                                                             float alpha);
+
+__C __export infiniStatus_t infiniopGetIndexAddWorkspaceSize(infiniopIndexAddDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopIndexAdd(infiniopIndexAddDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *output,
+                                             const void *input,
+                                             const void *index,
+                                             const void *source,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyIndexAddDescriptor(infiniopIndexAddDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/index_copy.h b/include/infiniop/ops/index_copy.h
new file mode 100644
index 000000000..93e6766ac
--- /dev/null
+++ b/include/infiniop/ops/index_copy.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_INDEX_COPY_API_H__
+#define __INFINIOP_INDEX_COPY_API_H__
+
+#include <stdint.h>
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopIndexCopyDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateIndexCopyDescriptor(infiniopHandle_t handle,
+                                                              infiniopIndexCopyDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output,
+                                                              infiniopTensorDescriptor_t input,
+                                                              int64_t dim,
+                                                              infiniopTensorDescriptor_t index,
+                                                              infiniopTensorDescriptor_t source);
+
+__C __export infiniStatus_t infiniopGetIndexCopyWorkspaceSize(infiniopIndexCopyDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopIndexCopy(infiniopIndexCopyDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              const void *index,
+                                              const void *source,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyIndexCopyDescriptor(infiniopIndexCopyDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/smooth_l1_loss.h b/include/infiniop/ops/smooth_l1_loss.h
new file mode 100644
index 000000000..a17e090fc
--- /dev/null
+++ b/include/infiniop/ops/smooth_l1_loss.h
@@ -0,0 +1,27 @@
+#ifndef __INFINIOP_SMOOTH_L1_LOSS_API_H__
+#define __INFINIOP_SMOOTH_L1_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSmoothL1LossDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSmoothL1LossDescriptor(infiniopHandle_t handle,
+                                                                 infiniopSmoothL1LossDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t output,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t target,
+                                                                 float beta,
+                                                                 int reduction);
+
+__C __export infiniStatus_t infiniopGetSmoothL1LossWorkspaceSize(infiniopSmoothL1LossDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopSmoothL1Loss(infiniopSmoothL1LossDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *output,
+                                                 const void *input,
+                                                 const void *target,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroySmoothL1LossDescriptor(infiniopSmoothL1LossDescriptor_t desc);
+
+#endif // __INFINIOP_SMOOTH_L1_LOSS_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/take.h b/include/infiniop/ops/take.h
new file mode 100644
index 000000000..8aeefdb6e
--- /dev/null
+++ b/include/infiniop/ops/take.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_TAKE_API_H__
+#define __INFINIOP_TAKE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopTakeDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateTakeDescriptor(infiniopHandle_t handle,
+                                                         infiniopTakeDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input,
+                                                         infiniopTensorDescriptor_t indices);
+
+__C __export infiniStatus_t infiniopGetTakeWorkspaceSize(infiniopTakeDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopTake(infiniopTakeDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         const void *indices,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyTakeDescriptor(infiniopTakeDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..8c2ca51d0 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -45,6 +45,10 @@
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.hypot import hypot
+from infinicore.ops.index_add import index_add
+from infinicore.ops.index_copy import index_copy
+from infinicore.ops.take import take
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -111,6 +115,10 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "hypot",
+    "index_add",
+    "index_copy",
+    "take",
     "ones",
     "strided_empty",
     "strided_from_blob",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..772a7c4fa 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,12 +6,13 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .smooth_l1_loss import smooth_l1_loss
 from .swiglu import swiglu
 
 __all__ = [
     "causal_softmax",
     "random_sample",
     "rms_norm",
     "silu",
+    "smooth_l1_loss",
     "swiglu",
     "linear",
     "embedding",
diff --git a/python/infinicore/nn/functional/smooth_l1_loss.py b/python/infinicore/nn/functional/smooth_l1_loss.py
new file mode 100644
index 000000000..d26f49aaf
--- /dev/null
+++ b/python/infinicore/nn/functional/smooth_l1_loss.py
@@ -0,0 +1,60 @@
+from typing import Optional
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+_REDUCTION_MODES = {
+    "none": 0,
+    "mean": 1,
+    "sum": 2,
+}
+
+
+def smooth_l1_loss(
+    input: Tensor,
+    target: Tensor,
+    beta: float = 1.0,
+    reduction: str = "mean",
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Compute the Smooth L1 loss, which uses a squared term if the absolute
+    element-wise error falls below ``beta`` and an L1 term otherwise.
+
+    Args:
+        input (Tensor): the input tensor.
+        target (Tensor): the target tensor.
+        beta (float, optional): the threshold at which to change between L1 and L2
+            loss. Must be non-negative. Default: 1.0.
+        reduction (str, optional): the reduction to apply to the output:
+            'none': no reduction is applied,
+            'mean': the sum of the output is divided by the number of elements,
+            'sum': the output is summed. Default: 'mean'.
+        out (Tensor, optional): the output tensor.
+
+    Returns:
+        Tensor: The loss value.
+    """
+    if beta < 0:
+        raise ValueError("smooth_l1_loss: beta must be non-negative")
+    if reduction not in _REDUCTION_MODES:
+        raise ValueError(f"{reduction!r} is not a valid value for reduction")
+    reduction_val = _REDUCTION_MODES[reduction]
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+    if not target.is_contiguous():
+        target = target.contiguous()
+
+    if out is not None:
+        _infinicore.smooth_l1_loss_(
+            out._underlying,
+            input._underlying,
+            target._underlying,
+            beta,
+            reduction_val,
+        )
+        return out
+    return Tensor(
+        _infinicore.smooth_l1_loss(
+            input._underlying, target._underlying, beta, reduction_val
+        )
+    )
\ No newline at end of file
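A quick usage sketch for the functional wrapper above. This assumes `infinicore.from_torch` (exported from the package `__init__`) wraps a torch tensor on the current device; tensor values here are illustrative only:

```python
import torch
import infinicore
import infinicore.nn.functional as F

pred = infinicore.from_torch(torch.tensor([0.2, 2.5, -1.0]))
tgt = infinicore.from_torch(torch.tensor([0.0, 2.0, -1.0]))

loss = F.smooth_l1_loss(pred, tgt)                        # 'mean' -> scalar tensor
per_elem = F.smooth_l1_loss(pred, tgt, reduction="none")  # same shape as input
total = F.smooth_l1_loss(pred, tgt, beta=0.5, reduction="sum")
```

The string-to-integer mapping (`none`/`mean`/`sum` -> 0/1/2) is resolved in Python, so the C++ layer only ever sees the integer code.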
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + if not target.is_contiguous(): + target = target.contiguous() + if reduction not in _REDUCTION_MODES: + raise ValueError(f"{reduction} is not a valid value for reduction") + reduction_val = _REDUCTION_MODES[reduction] + if out is not None: + _infinicore.smooth_l1_loss_( + out._underlying, + input._underlying, + target._underlying, + beta, + reduction_val + ) + return out + return Tensor( + _infinicore.smooth_l1_loss( + input._underlying, + target._underlying, + beta, + reduction_val + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/hypot.py b/python/infinicore/ops/hypot.py new file mode 100644 index 000000000..d2a9b690e --- /dev/null +++ b/python/infinicore/ops/hypot.py @@ -0,0 +1,10 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def hypot(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.hypot(input._underlying, other._underlying)) + _infinicore.hypot_(out._underlying, input._underlying, other._underlying) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/index_add.py b/python/infinicore/ops/index_add.py new file mode 100644 index 000000000..f1bc8a5ed --- /dev/null +++ b/python/infinicore/ops/index_add.py @@ -0,0 +1,23 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def index_add(input, dim, index, source, *, alpha=1.0, out=None): + if out is None: + return Tensor(_infinicore.index_add( + input._underlying, + dim, + index._underlying, + source._underlying, + alpha + )) + _infinicore.index_add_( + out._underlying, + input._underlying, + dim, + index._underlying, + source._underlying, + alpha + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/index_copy.py b/python/infinicore/ops/index_copy.py new file mode 100644 index 000000000..7fbdf8877 --- /dev/null +++ b/python/infinicore/ops/index_copy.py @@ -0,0 +1,22 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def index_copy(input, dim, index, source, *, out=None): + if out is None: + return Tensor(_infinicore.index_copy( + input._underlying, + dim, + index._underlying, + source._underlying + )) + + _infinicore.index_copy_( + out._underlying, + input._underlying, + dim, + index._underlying, + source._underlying + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/take.py b/python/infinicore/ops/take.py new file mode 100644 index 000000000..d78c78392 --- /dev/null +++ b/python/infinicore/ops/take.py @@ -0,0 +1,25 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def take(input: Tensor, indices: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r"""Returns a new tensor with the elements of input at the given indices. + The input tensor is treated as if it were viewed as a 1-D tensor. + The result tensor has the same shape as the indices tensor. + + Args: + input (Tensor): the input tensor. + indices (Tensor): the indices into tensor, must be an Int or Long tensor. + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A new tensor with the elements of input at the given indices. 
+ """ + if not input.is_contiguous(): + input = input.contiguous() + + # 如果用户提供了 output tensor,调用底层的 in-place/explicit 接口 + if out is not None: + _infinicore.take_(out._underlying, input._underlying, indices._underlying) + return out + return Tensor(_infinicore.take(input._underlying, indices._underlying)) \ No newline at end of file diff --git a/src/infinicore/ops/hypot/hypot.cc b/src/infinicore/ops/hypot/hypot.cc new file mode 100644 index 000000000..2f61a0e74 --- /dev/null +++ b/src/infinicore/ops/hypot/hypot.cc @@ -0,0 +1,26 @@ +#include "infinicore/ops/hypot.hpp" + +namespace infinicore::op { + +// 1. 定义 Dispatcher 单例 +common::OpDispatcher &Hypot::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + + +void Hypot::execute(Tensor output, Tensor input_a, Tensor input_b) { + // lookup 需要传入设备类型,然后调用返回的函数指针 + dispatcher().lookup(context::getDevice().getType())(output, input_a, input_b); +} +Tensor hypot(Tensor input_a, Tensor input_b) { + auto output = Tensor::empty(input_a->shape(), input_a->dtype(), input_a->device()); + + hypot_(output, input_a, input_b); + return output; +} +void hypot_(Tensor output, Tensor input_a, Tensor input_b) { + Hypot::execute(output, input_a, input_b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/hypot/hypot_infiniop.cc b/src/infinicore/ops/hypot/hypot_infiniop.cc new file mode 100644 index 000000000..8d86a0287 --- /dev/null +++ b/src/infinicore/ops/hypot/hypot_infiniop.cc @@ -0,0 +1,55 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hypot.hpp" // 引入 Hypot 头文件 +#include + +namespace infinicore::op::hypot_impl::infiniop { +thread_local common::OpCache caches( + 100, // capacity + [](infiniopHypotDescriptor_t &desc) { + if (desc != nullptr) { + // 销毁 Hypot 描述符 + INFINICORE_CHECK_ERROR(infiniopDestroyHypotDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input_a, Tensor input_b) { + size_t seed = hash_combine(output, input_a, input_b); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopHypotDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHypotDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input_a->desc(), input_b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHypotWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopHypot( + desc, + workspace->data(), workspace_size, + output->data(), input_a->data(), input_b->data(), + context::getStream())); +} + +static bool registered = []() { + // 注册到 Hypot 的 dispatcher + Hypot::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hypot_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/index_add/index_add.cc b/src/infinicore/ops/index_add/index_add.cc new file mode 100644 index 000000000..4e8dd7889 --- /dev/null +++ b/src/infinicore/ops/index_add/index_add.cc @@ -0,0 +1,109 @@ +#include "infinicore/ops/index_add.hpp" +#include +#include +#include +#include "infinicore/tensor.hpp" + +namespace infinicore::op { + 
diff --git a/src/infinicore/ops/index_add/index_add.cc b/src/infinicore/ops/index_add/index_add.cc
new file mode 100644
index 000000000..4e8dd7889
--- /dev/null
+++ b/src/infinicore/ops/index_add/index_add.cc
@@ -0,0 +1,109 @@
+#include "infinicore/ops/index_add.hpp"
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "infinicore/tensor.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<IndexAdd::schema> &IndexAdd::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void IndexAdd::execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No IndexAdd implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, dim, index, source, alpha);
+}
+
+static void check_index_add_args(const Tensor &input, int64_t &dim, const Tensor &index, const Tensor &source) {
+    int64_t ndim = static_cast<int64_t>(input->ndim());
+
+    if (dim < 0) {
+        dim += ndim;
+    }
+    if (dim < 0 || dim >= ndim) {
+        throw std::runtime_error("IndexAdd: Dimension out of range.");
+    }
+
+    if (index->ndim() != 1) {
+        throw std::runtime_error("IndexAdd: Index tensor must be 1D.");
+    }
+
+    if (index->dtype() != DataType::I64 && index->dtype() != DataType::I32) {
+        throw std::runtime_error("IndexAdd: Index tensor must be I32 or I64.");
+    }
+
+    if (source->ndim() != input->ndim()) {
+        throw std::runtime_error("IndexAdd: Source tensor must have same number of dimensions as input tensor.");
+    }
+
+    auto in_shape = input->shape();
+    auto src_shape = source->shape();
+    auto idx_len = index->shape()[0];
+
+    for (int64_t i = 0; i < ndim; ++i) {
+        if (i == dim) {
+            if (src_shape[i] != idx_len) {
+                throw std::runtime_error("IndexAdd: Source dimension mismatch.");
+            }
+        } else {
+            if (src_shape[i] != in_shape[i]) {
+                throw std::runtime_error("IndexAdd: Source non-index dimension mismatch.");
+            }
+        }
+    }
+}
+
+// Out-of-place interface.
+Tensor index_add(Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    check_index_add_args(input, dim, index, source);
+
+    Tensor output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    output->copy_from(input);
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+    IndexAdd::execute(output, output, dim, index, source, alpha);
+
+    return output;
+}
+
+// In-place / explicit-output interface.
+void index_add_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    check_index_add_args(input, dim, index, source);
+
+    if (output->shape() != input->shape()) {
+        throw std::runtime_error("IndexAdd (In-place): Output shape must match Input shape.");
+    }
+
+    // If output and input are distinct tensors, seed output with input's data.
+    if (output.operator->() != input.operator->()) {
+        output->copy_from(input);
+    }
+
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+
+    if (!output->is_contiguous()) {
+        // Strategy: copy -> compute -> copy back.
+        Tensor contiguous_out = output->contiguous();
+
+        IndexAdd::execute(contiguous_out, contiguous_out, dim, index, source, alpha);
+
+        output->copy_from(contiguous_out);
+    } else {
+        // Fast path: output is already contiguous, compute in place.
+        IndexAdd::execute(output, input, dim, index, source, alpha);
+    }
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/index_add/index_add_infiniop.cc b/src/infinicore/ops/index_add/index_add_infiniop.cc
new file mode 100644
index 000000000..94ccfc0be
--- /dev/null
+++ b/src/infinicore/ops/index_add/index_add_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/index_add.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::index_add_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopIndexAddDescriptor_t> caches(
+    100, // capacity
+    [](infiniopIndexAddDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyIndexAddDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    size_t seed = hash_combine(output, input, dim, index, source, alpha);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopIndexAddDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateIndexAddDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            dim,
+            index->desc(),
+            source->desc(),
+            alpha));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size and allocate it.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetIndexAddWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopIndexAdd(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        index->data(),
+        source->data(),
+        context::getStream()));
+}
+
+// Register this backend with the IndexAdd dispatcher.
+static bool registered = []() {
+    IndexAdd::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::index_add_impl::infiniop
\ No newline at end of file
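The formula stated in the pybind docstring further below, `output[index[i]] = input[index[i]] + alpha * source[i]` applied along `dim`, matches `torch.index_add`. A reference sketch useful for validating the operator:

```python
import torch

def index_add_reference(inp, dim, index, source, alpha=1.0):
    # Reference semantics for IndexAdd: for each position i in `index`,
    # accumulate alpha * source slice i into the indexed slice of the output.
    out = inp.clone()
    for i, j in enumerate(index.tolist()):
        out.select(dim, j).add_(source.select(dim, i), alpha=alpha)
    return out

base = torch.zeros(4, 2)
idx = torch.tensor([0, 2])
src = torch.ones(2, 2)
assert torch.equal(index_add_reference(base, 0, idx, src, 2.0),
                   torch.index_add(base, 0, idx, src, alpha=2.0))
```

Note that with repeated indices the contributions accumulate, which is why the C++ out-of-place path first copies `input` into `output` and then runs the kernel on `output` in place.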
diff --git a/src/infinicore/ops/index_copy/index_copy.cc b/src/infinicore/ops/index_copy/index_copy.cc
new file mode 100644
index 000000000..1f4689f16
--- /dev/null
+++ b/src/infinicore/ops/index_copy/index_copy.cc
@@ -0,0 +1,108 @@
+#include "infinicore/ops/index_copy.hpp"
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "infinicore/tensor.hpp"
+
+namespace infinicore::op {
+
+// =========================================================
+// Dispatcher & Execute
+// =========================================================
+
+common::OpDispatcher<IndexCopy::schema> &IndexCopy::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void IndexCopy::execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No IndexCopy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, dim, index, source);
+}
+
+static void check_index_copy_args(const Tensor &input, int64_t &dim, const Tensor &index, const Tensor &source) {
+    int64_t ndim = static_cast<int64_t>(input->ndim());
+
+    if (dim < 0) {
+        dim += ndim;
+    }
+    if (dim < 0 || dim >= ndim) {
+        throw std::runtime_error("IndexCopy: Dimension out of range.");
+    }
+
+    if (index->ndim() != 1) {
+        throw std::runtime_error("IndexCopy: Index tensor must be 1D.");
+    }
+
+    if (index->dtype() != DataType::I64 && index->dtype() != DataType::I32) {
+        throw std::runtime_error("IndexCopy: Index tensor must be I32 or I64.");
+    }
+
+    if (source->ndim() != input->ndim()) {
+        throw std::runtime_error("IndexCopy: Source tensor must have same number of dimensions as input tensor.");
+    }
+
+    auto in_shape = input->shape();
+    auto src_shape = source->shape();
+    auto idx_len = index->shape()[0];
+
+    for (int64_t i = 0; i < ndim; ++i) {
+        if (i == dim) {
+            if (src_shape[i] != idx_len) {
+                throw std::runtime_error("IndexCopy: Source dimension mismatch.");
+            }
+        } else {
+            if (src_shape[i] != in_shape[i]) {
+                throw std::runtime_error("IndexCopy: Source non-index dimension mismatch.");
+            }
+        }
+    }
+}
+
+// Out-of-place interface.
+Tensor index_copy(Tensor input, int64_t dim, Tensor index, Tensor source) {
+    check_index_copy_args(input, dim, index, source);
+
+    Tensor output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    output->copy_from(input);
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+    IndexCopy::execute(output, output, dim, index, source);
+
+    return output;
+}
+
+// In-place / explicit-output interface.
+void index_copy_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    check_index_copy_args(input, dim, index, source);
+
+    if (output->shape() != input->shape()) {
+        throw std::runtime_error("IndexCopy (In-place): Output shape must match Input shape.");
+    }
+
+    // If output and input are distinct tensors, seed output with input's data.
+    if (output.operator->() != input.operator->()) {
+        output->copy_from(input);
+    }
+
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+
+    if (!output->is_contiguous()) {
+        // Strategy: copy -> compute -> copy back.
+        Tensor contiguous_out = output->contiguous();
+
+        IndexCopy::execute(contiguous_out, contiguous_out, dim, index, source);
+
+        // Write the result back into the original layout.
+        output->copy_from(contiguous_out);
+    } else {
+        IndexCopy::execute(output, input, dim, index, source);
+    }
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/index_copy/index_copy_infiniop.cc b/src/infinicore/ops/index_copy/index_copy_infiniop.cc
new file mode 100644
index 000000000..d5f1ea822
--- /dev/null
+++ b/src/infinicore/ops/index_copy/index_copy_infiniop.cc
@@ -0,0 +1,63 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/index_copy.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::index_copy_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopIndexCopyDescriptor_t> caches(
+    100,
+    [](infiniopIndexCopyDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyIndexCopyDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    size_t seed = hash_combine(output, input, dim, index, source);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopIndexCopyDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateIndexCopyDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            dim,
+            index->desc(),
+            source->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetIndexCopyWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopIndexCopy(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        index->data(),
+        source->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    IndexCopy::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::index_copy_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc
new file mode 100644
index 000000000..263e2d07c
--- /dev/null
+++ b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc
@@ -0,0 +1,34 @@
+#include "infinicore/ops/smooth_l1_loss.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<SmoothL1Loss::schema> &SmoothL1Loss::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void SmoothL1Loss::execute(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, target, beta, reduction);
+}
+
+Tensor smooth_l1_loss(Tensor input, Tensor target, float beta, int64_t reduction) {
+    Shape output_shape;
+    if (reduction == 0) {
+        // Reduction::None -> output has the same shape as the input.
+        output_shape = input->shape();
+    } else {
+        // 'mean' / 'sum' -> scalar output.
+        output_shape = {};
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    smooth_l1_loss_(output, input, target, beta, reduction);
+    return output;
+}
+
+void smooth_l1_loss_(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    SmoothL1Loss::execute(output, input, target, beta, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc
new file mode 100644
index 000000000..5c994424a
--- /dev/null
+++ b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc
@@ -0,0 +1,65 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/smooth_l1_loss.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::smooth_l1_loss_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopSmoothL1LossDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSmoothL1LossDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySmoothL1LossDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    size_t seed = hash_combine(output, input, target, beta, reduction);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSmoothL1LossDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSmoothL1LossDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            target->desc(),
+            beta,
+            static_cast<int>(reduction)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSmoothL1LossWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopSmoothL1Loss(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        target->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    SmoothL1Loss::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::smooth_l1_loss_impl::infiniop
\ No newline at end of file
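For reference, the per-element definition the operator implements (the standard Smooth L1 loss, assuming beta > 0) and its three reduction modes can be written compactly in torch; this is a validation sketch, not the backend implementation:

```python
import torch

def smooth_l1_reference(inp, tgt, beta=1.0, reduction=1):
    # Per element: 0.5 * d^2 / beta if |d| < beta, else |d| - 0.5 * beta.
    d = (inp - tgt).abs()
    loss = torch.where(d < beta, 0.5 * d * d / beta, d - 0.5 * beta)
    if reduction == 0:      # none -> same shape as input
        return loss
    if reduction == 1:      # mean -> scalar
        return loss.mean()
    return loss.sum()       # sum -> scalar
```

This also explains the shape logic in `smooth_l1_loss` above: only `reduction == 0` preserves the input shape; the other modes produce a scalar tensor.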
diff --git a/src/infinicore/ops/take/take.cc b/src/infinicore/ops/take/take.cc
new file mode 100644
index 000000000..478227d69
--- /dev/null
+++ b/src/infinicore/ops/take/take.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/take.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<Take::schema> &Take::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Execute: look up the kernel for the current device and run it.
+void Take::execute(Tensor output, Tensor input, Tensor indices) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, indices);
+}
+
+Tensor take(Tensor input, Tensor indices) {
+    // Note: the output shape follows indices, while dtype and device follow input.
+    auto output = Tensor::empty(indices->shape(), input->dtype(), input->device());
+
+    take_(output, input, indices);
+    return output;
+}
+
+void take_(Tensor output, Tensor input, Tensor indices) {
+    Take::execute(output, input, indices);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/take/take_infiniop.cc b/src/infinicore/ops/take/take_infiniop.cc
new file mode 100644
index 000000000..5ab506973
--- /dev/null
+++ b/src/infinicore/ops/take/take_infiniop.cc
@@ -0,0 +1,58 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/take.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::take_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopTakeDescriptor_t> caches(
+    100, // capacity
+    [](infiniopTakeDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyTakeDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor indices) {
+    size_t seed = hash_combine(output, input, indices);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopTakeDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor from the output, input, and indices descriptors.
+        INFINICORE_CHECK_ERROR(infiniopCreateTakeDescriptor(
+            context::getInfiniopHandle(output->device()), &desc,
+            output->desc(), input->desc(), indices->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetTakeWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // Run the kernel with the input and indices data pointers.
+    INFINICORE_CHECK_ERROR(infiniopTake(
+        desc,
+        workspace->data(), workspace_size,
+        output->data(), input->data(), indices->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    // Register this backend with the Take dispatcher.
+    Take::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::take_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..c4625c4c0 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -6,6 +6,11 @@
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
+#include "ops/hypot.hpp"
+#include "ops/index_add.hpp"
+#include "ops/index_copy.hpp"
+#include "ops/smooth_l1_loss.hpp"
+#include "ops/take.hpp"
 #include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
@@ -28,6 +33,11 @@ inline void bind(py::module &m) {
     bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
+    bind_hypot(m);
+    bind_index_add(m);
+    bind_index_copy(m);
+    bind_smooth_l1_loss(m);
+    bind_take(m);
     bind_rearrange(m);
     bind_rms_norm(m);
     bind_silu(m);
diff --git a/src/infinicore/pybind11/ops/hypot.hpp b/src/infinicore/pybind11/ops/hypot.hpp
new file mode 100644
index 000000000..b8de2d0b9
--- /dev/null
+++ b/src/infinicore/pybind11/ops/hypot.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/hypot.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_hypot(py::module &m) {
+    // Out-of-place binding: output = hypot(input, other).
+    m.def("hypot",
+          &op::hypot,
+          py::arg("input"),
+          py::arg("other"),
+          R"doc(Computes the hypotenuse of input and other, i.e. sqrt(input^2 + other^2).)doc");
+
+    // Explicit-output binding: hypot_(output, input, other).
+    m.def("hypot_",
+          &op::hypot_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("other"),
+          R"doc(In-place hypot operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/index_add.hpp b/src/infinicore/pybind11/ops/index_add.hpp
new file mode 100644
index 000000000..3731559ec
--- /dev/null
+++ b/src/infinicore/pybind11/ops/index_add.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/index_add.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_index_add(py::module &m) {
+    m.def("index_add",
+          &op::index_add,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          py::arg("alpha") = 1.0f,
+          R"doc(Accumulates elements of source into input at the positions given in index.
+          Formula: output[index[i]] = input[index[i]] + alpha * source[i])doc");
+    m.def("index_add_",
+          &op::index_add_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          py::arg("alpha") = 1.0f,
+          R"doc(In-place version of index_add. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/index_copy.hpp b/src/infinicore/pybind11/ops/index_copy.hpp
new file mode 100644
index 000000000..39712e700
--- /dev/null
+++ b/src/infinicore/pybind11/ops/index_copy.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/index_copy.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_index_copy(py::module &m) {
+    // Out-of-place version (returns a new tensor).
+    m.def("index_copy",
+          &op::index_copy,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          R"doc(Copies elements of source into input at the positions given in index.
+          Formula: output[index[i]] = source[i])doc");
+    m.def("index_copy_",
+          &op::index_copy_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          R"doc(In-place version of index_copy. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/smooth_l1_loss.hpp b/src/infinicore/pybind11/ops/smooth_l1_loss.hpp
new file mode 100644
index 000000000..eac332683
--- /dev/null
+++ b/src/infinicore/pybind11/ops/smooth_l1_loss.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/smooth_l1_loss.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_smooth_l1_loss(py::module &m) {
+    // Out-of-place binding: output = smooth_l1_loss(input, target, beta, reduction).
+    m.def("smooth_l1_loss",
+          &op::smooth_l1_loss,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("beta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Computes the Smooth L1 loss between input and target.
+
+          Args:
+              input (Tensor): Predicted values.
+              target (Tensor): Ground truth values.
+              beta (float, optional): The threshold at which to change between L1 and L2 loss. Default: 1.0.
+              reduction (int, optional): The reduction to apply to the output: 0=none, 1=mean, 2=sum. Default: 1.
+          )doc");
+    m.def("smooth_l1_loss_",
+          &op::smooth_l1_loss_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("beta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Explicit-output Smooth L1 loss operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/take.hpp b/src/infinicore/pybind11/ops/take.hpp
new file mode 100644
index 000000000..c62596451
--- /dev/null
+++ b/src/infinicore/pybind11/ops/take.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/take.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_take(py::module &m) {
+    m.def("take",
+          &op::take,
+          py::arg("input"),
+          py::arg("indices"),
+          R"doc(Extracts elements from the input tensor at the given indices.
+The input tensor is treated as a flattened 1-D array.)doc");
+    m.def("take_",
+          &op::take_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("indices"),
+          R"doc(Explicit-output take operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.cc b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc
new file mode 100644
index 000000000..766eda851
--- /dev/null
+++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc
@@ -0,0 +1,63 @@
+// Hypot-specific CPU implementation.
+#include "hypot_cpu.h"
+
+namespace op::hypot::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+    const auto &output_shape = out_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    // Both input shapes must match the output shape.
+    CHECK_SAME_SHAPE(output_shape, input_a_desc->shape());
+    CHECK_SAME_SHAPE(output_shape, input_b_desc->shape());
+
+    // Create the CPU elementwise descriptor.
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<HypotOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<HypotOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<HypotOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<HypotOp, double>(_info, output, inputs, stream);
+    default:
+        // Integer and other unsupported dtypes are rejected.
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hypot::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.h b/src/infiniop/ops/hypot/cpu/hypot_cpu.h
new file mode 100644
index 000000000..63575d331
--- /dev/null
+++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __HYPOT_CPU_H__
+#define __HYPOT_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(hypot, cpu)
+
+#include <cmath>
+#include <type_traits>
+
+namespace op::hypot::cpu {
+
+struct HypotOp {
+public:
+    // Hypot is a binary operator: sqrt(x^2 + y^2).
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &x, const T &y) const {
+        if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+            return std::hypot(x, y);
+        } else {
+            // fp16 / bf16: compute in float and convert back.
+            return static_cast<T>(std::hypot(static_cast<float>(x), static_cast<float>(y)));
+        }
+    }
+};
+
+} // namespace op::hypot::cpu
+
+#endif // __HYPOT_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh
new file mode 100644
index 000000000..377c557c0
--- /dev/null
+++ b/src/infiniop/ops/hypot/cuda/kernel.cuh
@@ -0,0 +1,64 @@
+#ifndef __HYPOT_CUDA_H__
+#define __HYPOT_CUDA_H__
+
+#include <cstddef>
+#include <type_traits>
+#if ENABLE_METAX_API
+    // Assumed MACA half/bfloat16 intrinsics headers.
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat16 = __maca_bfloat16;
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+namespace op::hypot::cuda {
+
+struct HypotOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const T &y) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return sqrtf(fmaf(x, x, y * y));
+        } else if constexpr (std::is_same_v<T, half2>) {
+            half2 sq_sum = __hfma2(x, x, __hmul2(y, y));
+            return h2sqrt(sq_sum);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hsqrt(__hfma(x, x, __hmul(y, y)));
+        } else if constexpr (std::is_same_v<T, nv_bfloat162>) {
+            // Unpack, compute in float, repack.
+            float f0_x = __bfloat162float(__low2bfloat16(x));
+            float f1_x = __bfloat162float(__high2bfloat16(x));
+            float f0_y = __bfloat162float(__low2bfloat16(y));
+            float f1_y = __bfloat162float(__high2bfloat16(y));
+            float res0 = sqrtf(fmaf(f0_x, f0_x, f0_y * f0_y));
+            float res1 = sqrtf(fmaf(f1_x, f1_x, f1_y * f1_y));
+
+            return __floats2bfloat162_rn(res0, res1);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            float fx = __bfloat162float(x);
+            float fy = __bfloat162float(y);
+            return __float2bfloat16(sqrtf(fmaf(fx, fx, fy * fy)));
+        } else if constexpr (std::is_same_v<T, double>) {
+            return sqrt(fma(x, x, y * y));
+        } else {
+            return static_cast<T>(sqrt(fma(static_cast<double>(x), static_cast<double>(x), static_cast<double>(y) * static_cast<double>(y))));
+        }
+    }
+};
+
+} // namespace op::hypot::cuda
+
+#endif // __HYPOT_CUDA_H__
\ No newline at end of file
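A note on the two formulations that appear across these kernels: `sqrtf(fmaf(x, x, y * y))` is fast but squares the operands first, so it can overflow (or underflow) for extreme magnitudes, whereas a true `hypot` implementation rescales internally. A quick illustration in Python, where floats are IEEE doubles, so the same effect shows up at larger magnitudes:

```python
import math

x = y = 1e200
print(math.sqrt(x * x + y * y))  # inf: x*x overflows before the sqrt
print(math.hypot(x, y))          # 1.4142...e+200: rescaled, no overflow
```

For fp16/bf16 inputs the kernels compute in float, so the naive form is safe there; the float/double fast paths trade this edge-case robustness for speed, while the Moore kernel below uses `::hypotf` and keeps it.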
diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.h b/src/infiniop/ops/hypot/metax/hypot_metax.h
new file mode 100644
index 000000000..a7a4e7d11
--- /dev/null
+++ b/src/infiniop/ops/hypot/metax/hypot_metax.h
@@ -0,0 +1,9 @@
+#ifndef __HYPOT_METAX_API_H__
+#define __HYPOT_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+// Declare the elementwise descriptor for hypot on the metax backend.
+ELEMENTWISE_DESCRIPTOR(hypot, metax)
+
+#endif // __HYPOT_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.maca b/src/infiniop/ops/hypot/metax/hypot_metax.maca
new file mode 100644
index 000000000..ed9d6548a
--- /dev/null
+++ b/src/infiniop/ops/hypot/metax/hypot_metax.maca
@@ -0,0 +1,78 @@
+#include "hypot_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::hypot::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Hypot is a binary operator: exactly two inputs are required.
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+
+    const auto &out_shape = out_desc->shape();
+    const auto &in_a_shape = input_a_desc->shape();
+    const auto &in_b_shape = input_b_desc->shape();
+
+    // Hypot supports floating-point dtypes only.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // Shape consistency checks. If the framework ever supports broadcasting,
+    // these strict equality checks would need to be relaxed; for now this
+    // mirrors the other elementwise ops.
+    CHECK_SAME_SHAPE(out_shape, in_a_shape);
+    CHECK_SAME_SHAPE(out_shape, in_b_shape);
+
+    // Create the metax elementwise descriptor; the macro consumes all input
+    // descriptors in input_desc_vec.
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Dispatch on dtype using the shared cuda::HypotOp functor,
+    // which computes sqrt(x*x + y*y).
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    // Hypot returns floating-point results; integer inputs would need to be
+    // converted to a floating dtype first, so the supported list stays float-only.
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hypot::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore.h b/src/infiniop/ops/hypot/moore/hypot_moore.h
new file mode 100644
index 000000000..026c6d691
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore.h
@@ -0,0 +1,7 @@
+#ifndef __HYPOT_MOORE_API_H__
+#define __HYPOT_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+ELEMENTWISE_DESCRIPTOR(hypot, moore)
+
+#endif // __HYPOT_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore.mu b/src/infiniop/ops/hypot/moore/hypot_moore.mu
new file mode 100644
index 000000000..cc128b0ad
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore.mu
@@ -0,0 +1,77 @@
+#include "hypot_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+#include "hypot_moore_kernel.h"
+
+namespace op::hypot::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Hypot is a binary operator (z = hypot(x, y) = sqrt(x^2 + y^2)),
+    // so exactly two inputs are required.
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const auto &in_desc_0 = input_desc_vec.at(0);
+    const auto &in_desc_1 = input_desc_vec.at(1);
+
+    const auto &out_shape = out_desc->shape();
+    const auto &in_shape_0 = in_desc_0->shape();
+    const auto &in_shape_1 = in_desc_1->shape();
+
+    // Hypot supports floating-point dtypes.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // Check that the output shape matches both input shapes.
+    CHECK_SAME_SHAPE(out_shape, in_shape_0);
+    CHECK_SAME_SHAPE(out_shape, in_shape_1);
+
+    // Create the MOORE elementwise descriptor.
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Safety check for the input count in the calculate phase.
+    if (inputs.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    // Use the moore::HypotOp template defined in hypot_moore_kernel.h.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::HypotOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hypot::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h b/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h
new file mode 100644
index 000000000..2f2cb8e9b
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h
@@ -0,0 +1,60 @@
+#ifndef __HYPOT_MOORE_KERNEL_H__
+#define __HYPOT_MOORE_KERNEL_H__
+
+// Assumed MUSA half/bfloat16 intrinsics headers.
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <type_traits>
+
+namespace op::hypot::moore {
+
+struct HypotOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const T &y) const {
+        // -----------------------------------------------------------------
+        // 1. Half2
+        // -----------------------------------------------------------------
+        if constexpr (std::is_same_v<T, half2>) {
+            float x_low = __low2float(x);
+            float x_high = __high2float(x);
+            float y_low = __low2float(y);
+            float y_high = __high2float(y);
+            return __floats2half2_rn(::hypotf(x_low, y_low), ::hypotf(x_high, y_high));
+        }
+        // -----------------------------------------------------------------
+        // 2. Half
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, half>) {
+            return __float2half(::hypotf(__half2float(x), __half2float(y)));
+        }
+        // -----------------------------------------------------------------
+        // 3. Bfloat16 (__mt_bfloat16)
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            // Convert __mt_bfloat16 to float for the computation...
+            float x_f = __bfloat162float(x);
+            float y_f = __bfloat162float(y);
+
+            // ...and convert the result back to __mt_bfloat16.
+            return __float2bfloat16(::hypotf(x_f, y_f));
+        }
+        // -----------------------------------------------------------------
+        // 4. Float32
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, float>) {
+            return ::hypotf(x, y);
+        }
+        // -----------------------------------------------------------------
+        // 5. Double / Other
+        // -----------------------------------------------------------------
+        else {
+            return ::hypot(x, y);
+        }
+    }
+};
+
+} // namespace op::hypot::moore
+
+#endif // __HYPOT_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu
new file mode 100644
index 000000000..a8a9bc198
--- /dev/null
+++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu
@@ -0,0 +1,78 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "hypot_nvidia.cuh"
+
+namespace op::hypot::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+    const auto &output_shape = out_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_a_desc->shape());
+    CHECK_SAME_SHAPE(output_shape, input_b_desc->shape());
+
+    // Create the CUDA elementwise descriptor.
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // -----------------------------------------------------------
+    // Dispatch on dtype using cuda::HypotOp (the functor defined
+    // in ../cuda/kernel.cuh).
+    // -----------------------------------------------------------
+    switch (_dtype) {
+    // === Floating-point dtypes ===
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    // Integer dtypes are intentionally not supported.
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hypot::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh
new file mode 100644
index 000000000..7ff5a11b1
--- /dev/null
+++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __HYPOT_NVIDIA_CUH__
+#define __HYPOT_NVIDIA_CUH__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(hypot, nvidia)
+
+#endif // __HYPOT_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/operator.cc b/src/infiniop/ops/hypot/operator.cc
new file mode 100644
index 000000000..f056ea8d8
--- /dev/null
+++ b/src/infiniop/ops/hypot/operator.cc
@@ -0,0 +1,167 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/hypot.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/hypot_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hypot_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/hypot_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hypot_moore.h" +#endif + +extern "C" { +__C infiniStatus_t infiniopCreateHypotDescriptor( + infiniopHandle_t handle, + infiniopHypotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input_x, + infiniopTensorDescriptor_t input_y) { + + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hypot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + output, \ + {input_x, input_y}) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + + +__C infiniStatus_t infiniopGetHypotWorkspaceSize(infiniopHypotDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + + +__C infiniStatus_t infiniopHypot( + infiniopHypotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input_x, + const void *input_y, + void *stream) { + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input_x, input_y}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + + +__C infiniStatus_t infiniopDestroyHypotDescriptor(infiniopHypotDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + 
#endif
+    #ifdef ENABLE_METAX_API
+    DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cpu/index_add_cpu.cc b/src/infiniop/ops/index_add/cpu/index_add_cpu.cc
new file mode 100644
index 000000000..8b7551561
--- /dev/null
+++ b/src/infiniop/ops/index_add/cpu/index_add_cpu.cc
@@ -0,0 +1,151 @@
+#include "index_add_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include
+#include
+#include
+#include // required for std::conditional
+
+namespace op::index_add::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc,
+    float alpha) {
+
+    auto handle = reinterpret_cast(handle_);
+
+    auto result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // workspace size
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename TData, typename TIdx>
+void calculate_cpu_impl(
+    const IndexAddInfo &info,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source) {
+
+    // Accumulate in double for F64 data, otherwise in float
+    using CalcType = typename std::conditional<std::is_same<TData, double>::value, double, float>::type;
+
+    // Convert alpha (utils::cast handles the custom fp16/bf16 types)
+    CalcType alpha_val = utils::cast<CalcType>(info.alpha());
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    auto out_ptr = reinterpret_cast<TData *>(output);
+    auto src_ptr = reinterpret_cast<const TData *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(index);
+
+    for (size_t o = 0; o < outer_size; ++o) {
+        for (size_t i = 0; i < index_len; ++i) {
+
+            TIdx idx = idx_ptr[i];
+
+            // Handle negative indices
+            if (idx < 0) idx += static_cast<TIdx>(dim_size);
+
+            // Bounds check
+            if (idx < 0 || static_cast<size_t>(idx) >= dim_size) {
+                continue;
+            }
+
+            // Compute the offsets
+            size_t src_offset = o * index_len * inner_size + i * inner_size;
+            size_t out_offset = o * dim_size * inner_size + static_cast<size_t>(idx) * inner_size;
+
+            // Loop over the inner dimension
+            for (size_t in = 0; in < inner_size; ++in) {
+                CalcType src_val = utils::cast<CalcType>(src_ptr[src_offset + in]);
+                CalcType out_old_val = utils::cast<CalcType>(out_ptr[out_offset + in]);
+
+                // 2. Compute: out = out + src * alpha
+                CalcType result_val = out_old_val + src_val * alpha_val;
+
+                // 3. Cast back to TData via utils::cast and store
+                out_ptr[out_offset + in] = utils::cast<TData>(result_val);
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    #define DISPATCH(TDATA, TIDX) \
+        calculate_cpu_impl<TDATA, TIDX>(_info, output, input, index, source); \
+        return INFINI_STATUS_SUCCESS
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int32_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int32_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int32_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int32_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int32_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int32_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int64_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int64_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int64_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int64_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int64_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int64_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+
+    #undef DISPATCH
+}
+
+} // namespace op::index_add::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cpu/index_add_cpu.h b/src/infiniop/ops/index_add/cpu/index_add_cpu.h
new file mode 100644
index 000000000..e916399b1
--- /dev/null
+++ b/src/infiniop/ops/index_add/cpu/index_add_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __INDEX_ADD_CPU_H__
+#define __INDEX_ADD_CPU_H__
+
+#include "../index_add.h"
+DESCRIPTOR(cpu)
+
+#endif // __INDEX_ADD_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cuda/kernel.cuh b/src/infiniop/ops/index_add/cuda/kernel.cuh
new file mode 100644
index 000000000..be5b7ce05
--- /dev/null
+++ b/src/infiniop/ops/index_add/cuda/kernel.cuh
@@ -0,0 +1,200 @@
+#ifndef __INDEX_ADD_CUDA_H__
+#define __INDEX_ADD_CUDA_H__
+
+#if ENABLE_METAX_API
+#include
+#include
+#define __nv_bfloat16 __maca_bfloat16
+#define __nv_bfloat162 __maca_bfloat162
+#else
+    #include
+    #include
+    #include
+#endif
+#include
+
+namespace op::index_add::cuda {
+
+__device__ __forceinline__ void atomic_add_custom(__half* address, __half val) {
+#if __CUDA_ARCH__ >= 700
+    atomicAdd(address, val);
+#else
+    // Fallback for older architectures (< Volta): emulate the 16-bit add
+    // with a CAS loop on the aligned 32-bit word that contains the value
+    unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        __half old_val = *reinterpret_cast<__half*>(&old_val_raw);
+
+        __half new_val = old_val + val;
+        unsigned short new_val_raw = *reinterpret_cast<unsigned short*>(&new_val);
+
+        unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
+                                                   : (old & 0xffff0000) | new_val_raw;
+
+        old = atomicCAS(address_as_ui, assumed, new_int);
+    } while (assumed != old);
+#endif
+}
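The word-packing in the fallback above is the standard trick for 16-bit atomics on hardware that only supports 32-bit CAS: the half lives in either the low or high lane of its aligned 32-bit container, selected by bit 1 of the address. A standalone host-side sketch (not part of the patch) of the lane-replacement step:

```cpp
#include <cstdint>
#include <cstdio>

// Replace one 16-bit lane inside an aligned 32-bit word, as the CAS loop does.
uint32_t replace_lane(uint32_t word, uint16_t lane_value, bool high_lane) {
    return high_lane ? (word & 0x0000ffffu) | (uint32_t(lane_value) << 16)
                     : (word & 0xffff0000u) | lane_value;
}

int main() {
    uint32_t word = 0xAAAABBBBu;
    std::printf("%08x\n", replace_lane(word, 0x1234, /*high_lane=*/true));  // 1234bbbb
    std::printf("%08x\n", replace_lane(word, 0x1234, /*high_lane=*/false)); // aaaa1234
    return 0;
}
```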
+__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16* address, __nv_bfloat16 val) {
+#if __CUDA_ARCH__ >= 800
+    atomicAdd(address, val);
+#else
+    // Fallback for older architectures (< Ampere)
+    unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16*>(&old_val_raw);
+
+        __nv_bfloat16 new_val = old_val + val;
+        unsigned short new_val_raw = *reinterpret_cast<unsigned short*>(&new_val);
+
+        unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
+                                                   : (old & 0xffff0000) | new_val_raw;
+
+        old = atomicCAS(address_as_ui, assumed, new_int);
+    } while (assumed != old);
+#endif
+}
+
+__device__ __forceinline__ void atomic_add_custom(int64_t* address, int64_t val) {
+    atomicAdd(reinterpret_cast<unsigned long long*>(address), static_cast<unsigned long long>(val));
+}
+
+// --- Generic template (float, double, int32, ...) ---
+template <typename T>
+__device__ __forceinline__ void atomic_add_custom(T* address, T val) {
+    atomicAdd(address, val);
+}
+
+// ==================================================================
+// 2. Aligned vectorized pack
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 3. Scalar kernel (generic fallback)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void index_add_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size, // product of the dims left of dim
+    size_t inner_size, // product of the dims right of dim
+    size_t dim_size,   // extent of output along dim
+    size_t index_len,  // index length (= source extent along dim)
+    size_t num_source, // total number of source elements
+    float alpha        // scale factor
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    T alpha_val = static_cast<T>(alpha);
+
+    // Grid-stride loop over the source tensor
+    for (size_t i = tid; i < num_source; i += stride) {
+        // 1. Convert the linear index i to logical coordinates (outer, idx_idx, inner)
+        //    Source shape: [Outer, IndexLen, Inner]
+        size_t inner_idx = i % inner_size;
+        size_t tmp = i / inner_size;
+        size_t idx_idx = tmp % index_len;
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the index value
+        TIdx target_dim_idx = indices[idx_idx];
+
+        // 3. Handle negative indices (defensive)
+        if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+        // 4. Bounds check and atomic accumulate
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Compute the linear output offset
+            // Output shape: [Outer, DimSize, Inner]
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // Use the custom atomic op
+            atomic_add_custom(&output[out_offset], source[i] * alpha_val);
+        }
+    }
+}
+
+// ==================================================================
+// 4. Vectorized kernel (optimizes read bandwidth)
+// ==================================================================
+template <typename T, typename TIdx, int PackSize>
+__global__ void index_add_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size,
+    size_t inner_size,
+    size_t dim_size,
+    size_t index_len,
+    size_t num_packs, // number of packs in source
+    float alpha
+) {
+    // Reinterpret source as a Pack pointer for vectorized loads
+    using PackType = Pack<T, PackSize>;
+    const PackType *src_vec = reinterpret_cast<const PackType *>(source);
+
+    T alpha_val = static_cast<T>(alpha);
+
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        // Vectorized load (LDG.128)
+        PackType reg_pack = src_vec[i];
+
+        // Starting linear index of this pack within source
+        size_t base_idx = i * PackSize;
+
+        // Unrolled loop over every element in the pack
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            size_t curr_src_idx = base_idx + k;
+
+            // 1. Coordinate transform
+            size_t inner_idx = curr_src_idx % inner_size;
+            size_t tmp = curr_src_idx / inner_size;
+            size_t idx_idx = tmp % index_len;
+            size_t outer_idx = tmp / index_len;
+
+            // 2. Read the index
+            TIdx target_dim_idx = indices[idx_idx];
+
+            if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+            // 3. Atomic accumulate
+            if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+                size_t out_offset = outer_idx * (dim_size * inner_size)
+                                  + static_cast<size_t>(target_dim_idx) * inner_size
+                                  + inner_idx;
+
+                // Use the custom atomic op
+                atomic_add_custom(&output[out_offset], reg_pack.val[k] * alpha_val);
+            }
+        }
+    }
+}
+
+} // namespace op::index_add::cuda
+
+#endif // __INDEX_ADD_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/index_add.h b/src/infiniop/ops/index_add/index_add.h
new file mode 100644
index 000000000..1622b1231
--- /dev/null
+++ b/src/infiniop/ops/index_add/index_add.h
@@ -0,0 +1,52 @@
+#ifndef __INDEX_ADD_H__
+#define __INDEX_ADD_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the IndexAddInfo definition (dim, alpha, geometry)
+#define DESCRIPTOR(NAMESPACE)                                 \
+                                                              \
+    namespace op::index_add::NAMESPACE {                      \
+    class Descriptor final : public InfiniopDescriptor {      \
+        struct Opaque;                                        \
+        Opaque *_opaque;                                      \
+        IndexAddInfo _info;                                   \
+        size_t _workspace_size;                               \
+                                                              \
+        Descriptor(                                           \
+            Opaque *opaque,                                   \
+            IndexAddInfo info,                                \
+            size_t workspace_size,                            \
+            infiniDevice_t device_type,                       \
+            int device_id)                                    \
+            : InfiniopDescriptor{device_type, device_id},     \
+              _opaque(opaque),                                \
+              _info(info),                                    \
+              _workspace_size(workspace_size) {}              \
+                                                              \
+    public:                                                   \
+        ~Descriptor();                                        \
+                                                              \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                              \
+        static infiniStatus_t create(                         \
+            infiniopHandle_t handle,                          \
+            Descriptor **desc_ptr,                            \
+            infiniopTensorDescriptor_t out_desc,              \
+            infiniopTensorDescriptor_t in_desc,               \
+            int64_t dim,                                      \
+            infiniopTensorDescriptor_t index_desc,            \
+            infiniopTensorDescriptor_t source_desc,           \
+            float alpha);                                     \
+                                                              \
+        infiniStatus_t calculate(                             \
+            void *workspace,                                  \
+            size_t workspace_size,                            \
+            void *output,                                     \
+            const void *input,                                \
+            const void *index,                                \
+            const void *source,                               \
+            void *stream) const;                              \
+    };                                                        \
+    }
+
+#endif // __INDEX_ADD_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/info.h b/src/infiniop/ops/index_add/info.h
new file mode 100644
index 000000000..66602818f
--- /dev/null
+++ b/src/infiniop/ops/index_add/info.h
@@ -0,0 +1,141 @@
+#ifndef __INDEX_ADD_INFO_H__
+#define __INDEX_ADD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::index_add {
+
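+// A concrete example of the geometry captured below: for input shape
+// [4, 5, 6] with dim = 1 and an index of length 3, outer_size = 4,
+// dim_size = 5, inner_size = 6, index_len = 3, and source must have
+// shape [4, 3, 6]; source element (o, i, in) accumulates into output
+// offset o * (5 * 6) + index[i] * 6 + in.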
+class IndexAddInfo {
+    IndexAddInfo() = default;
+
+public:
+    int _dtype;         // data type (input/output/source)
+    int _idx_dtype;     // index type (int32, int64)
+    int64_t _dim;       // dimension the op works along
+    float _alpha;       // scale factor
+
+    // [Added] geometry used to compute memory offsets
+    size_t _outer_size; // product of the dims left of dim
+    size_t _inner_size; // product of the dims right of dim
+    size_t _dim_size;   // extent of input/output along dim
+    size_t _index_len;  // length of the index tensor
+
+    // [Changed] constructor initializing the new members
+    IndexAddInfo(int dtype, int idx_dtype, int64_t dim, float alpha,
+                 size_t outer_size, size_t inner_size, size_t dim_size, size_t index_len)
+        : _dtype(dtype), _idx_dtype(idx_dtype), _dim(dim), _alpha(alpha),
+          _outer_size(outer_size), _inner_size(inner_size), _dim_size(dim_size), _index_len(index_len) {}
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    int64_t dim() const { return _dim; }
+    float alpha() const { return _alpha; }
+
+    // [Added] getters
+    size_t outer_size() const { return _outer_size; }
+    size_t inner_size() const { return _inner_size; }
+    size_t dim_size() const { return _dim_size; }
+    size_t index_len() const { return _index_len; }
+
+    static utils::Result<IndexAddInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        int64_t dim,
+        infiniopTensorDescriptor_t index_desc,
+        infiniopTensorDescriptor_t source_desc,
+        float alpha) {
+
+        // 1. Check dtype consistency (output vs input vs source)
+        int dtype = in_desc->dtype();
+        if (out_desc->dtype() != dtype || source_desc->dtype() != dtype) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Check the index dtype
+        int idx_dtype = index_desc->dtype();
+        if (idx_dtype != INFINI_DTYPE_I32 && idx_dtype != INFINI_DTYPE_I64) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Check the dimension is valid
+        int64_t ndim = static_cast<int64_t>(in_desc->ndim());
+        if (dim < 0 || dim >= ndim) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        // 4. Check the index shape
+        if (index_desc->ndim() != 1) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        // [Added] compute the geometry
+        const auto &in_shape = in_desc->shape();
+
+        // outer_size: product of all dims before dim
+        size_t outer_size = 1;
+        for (int64_t i = 0; i < dim; ++i) {
+            outer_size *= in_shape[i];
+        }
+
+        // inner_size: product of all dims after dim (i.e. the stride)
+        size_t inner_size = 1;
+        for (int64_t i = dim + 1; i < ndim; ++i) {
+            inner_size *= in_shape[i];
+        }
+
+        // dim_size
+        size_t dim_size = in_shape[dim];
+
+        // index_len
+        size_t index_len = index_desc->shape()[0];
+
+        // 5. Check source shape consistency
+        //    Rule: [Outer, IndexLen, Inner]
+        if (source_desc->ndim() != in_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &src_shape = source_desc->shape();
+
+        for (int64_t i = 0; i < ndim; ++i) {
+            if (i == dim) {
+                if (src_shape[i] != index_len) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            } else {
+                if (src_shape[i] != in_shape[i]) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            }
+        }
+
+        // 6. Check that the output and input shapes match
+        if (out_desc->ndim() != in_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        const auto &out_shape = out_desc->shape();
+        for (int64_t i = 0; i < ndim; ++i) {
+            if (out_shape[i] != in_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 7. Return the Info object (with the precomputed geometry)
+        return utils::Result<IndexAddInfo>(IndexAddInfo{
+            dtype,
+            idx_dtype,
+            dim,
+            alpha,
+            outer_size,
+            inner_size,
+            dim_size,
+            index_len});
+    }
+};
+
+} // namespace op::index_add
+
+#endif // __INDEX_ADD_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.h b/src/infiniop/ops/index_add/metax/index_add_metax.h
new file mode 100644
index 000000000..99e2dee3e
--- /dev/null
+++ b/src/infiniop/ops/index_add/metax/index_add_metax.h
@@ -0,0 +1,8 @@
+#ifndef __INDEX_ADD_METAX_H__
+#define __INDEX_ADD_METAX_H__
+
+#include "../index_add.h"
+
+DESCRIPTOR(metax)
+
+#endif // __INDEX_ADD_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.maca b/src/infiniop/ops/index_add/metax/index_add_metax.maca
new file mode 100644
index 000000000..ac7fbaf48
--- /dev/null
+++ b/src/infiniop/ops/index_add/metax/index_add_metax.maca
@@ -0,0 +1,345 @@
+#include "index_add_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::index_add::metax {
+
+// ==================================================================
+// Atomic Helpers
+// ==================================================================
+
+template <typename T>
+__device__ __forceinline__ void gpuAtomicAdd(T* address, T val) {
+    atomicAdd(address, val);
+}
+
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(
+    int64_t* address,
+    int64_t val)
+{
+    atomicAdd(
+        reinterpret_cast<unsigned long long*>(address),
+        static_cast<unsigned long long>(val));
+}
+
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(
+    __maca_bfloat16* address,
+    __maca_bfloat16 val)
+{
+    unsigned int* addr =
+        (unsigned int*)((char*)address - ((size_t)address & 2));
+
+    unsigned int old = *addr;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+
+        unsigned short old_val =
+            ((size_t)address & 2)
+                ? (assumed >> 16)
+                : (assumed & 0xFFFF);
+
+        __maca_bfloat16 sum =
+            (__maca_bfloat16)(
+                (float)*reinterpret_cast<__maca_bfloat16*>(&old_val)
+                + (float)val);
+
+        unsigned short res =
+            *reinterpret_cast<unsigned short*>(&sum);
+
+        old = atomicCAS(
+            addr,
+            assumed,
+            ((size_t)address & 2) ?
((assumed & 0xFFFF) | (res << 16)) + : ((assumed & 0xFFFF0000) | res)); + } while (assumed != old); +} + +// ================================================================== +// Kernel +// ================================================================== + +template +__global__ void index_add_kernel( + T* output, + const T* source, + const TIdx* indices, + int outer_size, + int inner_size, + int index_size, + int dim_size, + T alpha) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_source = outer_size * index_size * inner_size; + + if (idx >= total_source) return; + + int inner = idx % inner_size; + int temp = idx / inner_size; + int i = temp % index_size; + int outer = temp / index_size; + + TIdx idx_pos = indices[i]; + if (idx_pos < 0) idx_pos += dim_size; + + if (idx_pos >= 0 && idx_pos < dim_size) { + int out_offset = + outer * (dim_size * inner_size) + + idx_pos * inner_size + + inner; + + gpuAtomicAdd( + output + out_offset, + static_cast(source[idx] * alpha)); + } +} + +// ================================================================== +// Kernel Launcher +// ================================================================== + +template +void launch_kernel_impl( + void* output, + const void* source, + const void* indices, + int outer, + int inner, + int idx_size, + int dim_s, + float alpha, + void* stream) +{ + auto hc_stream = reinterpret_cast(stream); + + size_t total = (size_t)outer * idx_size * inner; + size_t block = 256; + size_t grid = (total + block - 1) / block; + + index_add_kernel + <<>>( + reinterpret_cast(output), + reinterpret_cast(source), + reinterpret_cast(indices), + outer, + inner, + idx_size, + dim_s, + static_cast(alpha)); +} + +// ================================================================== +// Utilities +// ================================================================== + +static size_t get_element_size(int dtype) { + if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8; + if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4; + return 2; +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; + float alpha; + int64_t dim; + int outer_size; + int inner_size; + int index_size; + int dim_size; + size_t total_bytes; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + int64_t dim, + infiniopTensorDescriptor_t index_desc, + infiniopTensorDescriptor_t source_desc, + float alpha) +{ + auto handle = + reinterpret_cast(handle_); + + auto info_result = + IndexAddInfo::create( + out_desc, in_desc, dim, index_desc, source_desc, alpha); + + if (!info_result) { + return info_result.status(); + } + + auto out_d = + reinterpret_cast(out_desc); + auto idx_d = + reinterpret_cast(index_desc); + + int ndim = out_d->ndim(); + int64_t real_dim = dim < 0 ? 
dim + ndim : dim; + + int outer = 1; + for (int i = 0; i < real_dim; ++i) { + outer *= out_d->shape()[i]; + } + + int inner = 1; + for (int i = real_dim + 1; i < ndim; ++i) { + inner *= out_d->shape()[i]; + } + + int dim_s = out_d->shape()[real_dim]; + + int idx_s = 1; + for (int i = 0; i < idx_d->ndim(); ++i) { + idx_s *= idx_d->shape()[i]; + } + + size_t bytes = + (size_t)outer * dim_s * inner * + get_element_size(out_d->dtype()); + + auto opaque = + new Opaque{ + handle->internal(), + alpha, + dim, + outer, + inner, + idx_s, + dim_s, + bytes}; + + *desc_ptr = new Descriptor( + opaque, + info_result.take(), + 0, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + const void *source, + void *stream) const +{ + auto hc_stream = reinterpret_cast(stream); + + hcMemcpyAsync( + output, + input, + _opaque->total_bytes, + hcMemcpyDeviceToDevice, + hc_stream); + + auto dtype = _info.dtype(); + auto idx_dtype = _info.idx_dtype(); + + int outer = _opaque->outer_size; + int inner = _opaque->inner_size; + int dim_s = _opaque->dim_size; + int idx_sz = _opaque->index_size; + float alpha = _opaque->alpha; + +#define LAUNCH(T, TIdx) \ + launch_kernel_impl( \ + output, source, index, \ + outer, inner, idx_sz, dim_s, alpha, stream) + + if (idx_dtype == INFINI_DTYPE_I32) { + switch (dtype) { + + case INFINI_DTYPE_F16: + LAUNCH(__half, int32_t); + break; + + case INFINI_DTYPE_BF16: + LAUNCH(__maca_bfloat16, int32_t); + break; + + case INFINI_DTYPE_F32: + LAUNCH(float, int32_t); + break; + + case INFINI_DTYPE_F64: + LAUNCH(double, int32_t); + break; + + case INFINI_DTYPE_I32: + LAUNCH(int32_t, int32_t); + break; + + case INFINI_DTYPE_I64: + LAUNCH(int64_t, int32_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + } else if (idx_dtype == INFINI_DTYPE_I64) { + switch (dtype) { + + case INFINI_DTYPE_F16: + LAUNCH(__half, int64_t); + break; + + case INFINI_DTYPE_BF16: + LAUNCH(__maca_bfloat16, int64_t); + break; + + case INFINI_DTYPE_F32: + LAUNCH(float, int64_t); + break; + + case INFINI_DTYPE_F64: + LAUNCH(double, int64_t); + break; + + case INFINI_DTYPE_I32: + LAUNCH(int32_t, int64_t); + break; + + case INFINI_DTYPE_I64: + LAUNCH(int64_t, int64_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + +#undef LAUNCH + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::index_add::metax diff --git a/src/infiniop/ops/index_add/moore/index_add_moore.h b/src/infiniop/ops/index_add/moore/index_add_moore.h new file mode 100644 index 000000000..fd26e62a1 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore.h @@ -0,0 +1,11 @@ +#ifndef __INDEX_ADD_MOORE_API_H__ +#define __INDEX_ADD_MOORE_API_H__ + +// 引入上层定义的 Descriptor 宏和基础类 +#include "../index_add.h" + +// 使用 index_add.h 中定义的 DESCRIPTOR 宏 +// 这将自动生成 op::index_add::moore::Descriptor 类定义 +DESCRIPTOR(moore) + +#endif // __INDEX_ADD_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_add/moore/index_add_moore.mu b/src/infiniop/ops/index_add/moore/index_add_moore.mu new file mode 100644 index 000000000..3b69d35d1 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore.mu @@ -0,0 +1,203 @@ +#include "index_add_moore.h" +#include "index_add_moore_kernel.h" // 包含 IndexAddOp Functor 定义 + +#include +#include +#include +#include + +#include 
"../../../devices/moore/moore_handle.h" + +namespace op::index_add::moore { + +// ================================================================== +// 1. Kernel Wrapper +// ================================================================== + +// 这是一个 Global Kernel 包装器,它调用 index_add_moore_kernel.h 中的 IndexAddOp Functor +template +__global__ void index_add_kernel( + const size_t num_elements, // Source 的总元素数量 (线程任务总量) + const size_t index_len, // Index 向量长度 + const size_t inner_size, // stride + const size_t dim_size, // Output 在 dim 维度的长度 + const float alpha, // 缩放因子 + const T *source, + const TIdx *indices, + T *output) { + + // idx 对应 Source 张量的线性索引 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_elements) { + // 使用 Functor + IndexAddOp op; + op(idx, index_len, inner_size, dim_size, alpha, source, indices, output); + } +} + +// ================================================================== +// 2. Launcher Implementation +// ================================================================== + +template +void index_add_moore_launch( + const IndexAddInfo &info, + T *output, + const T *input, + const T *source, + const void *indices, // void* 传入,内部强转 + void *stream) { + + auto musa_stream = (musaStream_t)stream; + const TIdx *indices_ptr = static_cast(indices); + + // -------------------------------------------------------------- + // 步骤 1: Copy Input -> Output + // -------------------------------------------------------------- + // Output 初始化为 Input 的值。 + // 计算总元素数量: Output 形状与 Input 一致 + size_t total_out_elements = info.outer_size() * info.dim_size() * info.inner_size(); + + // 如果 input 和 output 指针不同,则执行拷贝 + if (output != input) { + musaMemcpyAsync(output, input, total_out_elements * sizeof(T), musaMemcpyDeviceToDevice, musa_stream); + } + + // -------------------------------------------------------------- + // 步骤 2: Scatter Add (Source -> Output) + // -------------------------------------------------------------- + // 线程并行度取决于 Source 的大小 + // Source 逻辑形状: [Outer, IndexLen, Inner] + size_t num_src_elements = info.outer_size() * info.index_len() * info.inner_size(); + + if (num_src_elements == 0) { + return; //以此避免空 Kernel Launch + } + + int threads = 256; + int blocks = (num_src_elements + threads - 1) / threads; + + index_add_kernel<<>>( + num_src_elements, + info.index_len(), + info.inner_size(), + info.dim_size(), + info.alpha(), + source, + indices_ptr, + output + ); +} + +// ================================================================== +// 3. 
Descriptor Implementation +// ================================================================== + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + int64_t dim, + infiniopTensorDescriptor_t index_desc, + infiniopTensorDescriptor_t source_desc, + float alpha) { + + auto handle = reinterpret_cast(handle_); + + // 使用 Info 类校验形状和类型,并预计算几何参数 + auto info_result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha); + + if (!info_result) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor( + nullptr, + *info_result, + 0, // No workspace needed + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + const void *source, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // -------------------------------------------------------------- + // 定义分发宏:解决 Data Type x Index Type 的组合爆炸 + // -------------------------------------------------------------- + #define LAUNCH_KERNEL(T) \ + do { \ + if (_info.idx_dtype() == INFINI_DTYPE_I32) { \ + index_add_moore_launch( \ + _info, \ + static_cast(output), \ + static_cast(input), \ + static_cast(source), \ + index, \ + stream); \ + } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \ + index_add_moore_launch( \ + _info, \ + static_cast(output), \ + static_cast(input), \ + static_cast(source), \ + index, \ + stream); \ + } else { \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } while (0) + + // -------------------------------------------------------------- + // 根据数据类型分发 + // -------------------------------------------------------------- + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + LAUNCH_KERNEL(half); + break; + + case INFINI_DTYPE_BF16: + LAUNCH_KERNEL(__mt_bfloat16); + break; + + case INFINI_DTYPE_F32: + LAUNCH_KERNEL(float); + break; + + case INFINI_DTYPE_F64: + LAUNCH_KERNEL(double); + break; + + // 如果需要支持整数类型的 AtomicAdd,需要确保 Kernel 中有对应特化 + // 这里仅示例浮点类型 + case INFINI_DTYPE_I32: + LAUNCH_KERNEL(int32_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + #undef LAUNCH_KERNEL + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::index_add::moore \ No newline at end of file diff --git a/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h b/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h new file mode 100644 index 000000000..e7460d711 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h @@ -0,0 +1,130 @@ +#ifndef __INDEX_ADD_MOORE_KERNEL_H__ +#define __INDEX_ADD_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::index_add::moore { + + +template +__device__ __forceinline__ void atomic_add_func(T* address, T val) { + atomicAdd(address, val); +} + +template <> +__device__ __forceinline__ void atomic_add_func(half* address, half val) { + // 将地址重解释为 unsigned short* 以便进行位操作 + unsigned short* address_as_us = reinterpret_cast(address); + unsigned short old = *address_as_us; + unsigned short assumed; + + do { + assumed = old; + half sum = __float2half(__half2float(*reinterpret_cast(&assumed)) + + __half2float(val)); + + unsigned short sum_as_us = *reinterpret_cast(&sum); + + + old = atomicCAS(address_as_us, assumed, 
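+// The specializations below emulate atomicAdd for 16-bit types with an
+// atomicCAS retry loop: read the current 16-bit value, add in float,
+// and write back only if the slot has not changed in the meantime.
+// (This assumes the MUSA toolchain provides a 16-bit atomicCAS overload.)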
+template <>
+__device__ __forceinline__ void atomic_add_func(half* address, half val) {
+    // Reinterpret the address as unsigned short* for the bit-level CAS
+    unsigned short* address_as_us = reinterpret_cast<unsigned short*>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+
+    do {
+        assumed = old;
+        half sum = __float2half(__half2float(*reinterpret_cast<half*>(&assumed))
+                                + __half2float(val));
+
+        unsigned short sum_as_us = *reinterpret_cast<unsigned short*>(&sum);
+
+        old = atomicCAS(address_as_us, assumed, sum_as_us);
+
+    } while (assumed != old);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func<__mt_bfloat16>(__mt_bfloat16* address, __mt_bfloat16 val) {
+    unsigned short* address_as_us = reinterpret_cast<unsigned short*>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+
+    do {
+        assumed = old;
+        // BF16 -> Float -> Add -> BF16
+        float sum_f = __bfloat162float(*reinterpret_cast<__mt_bfloat16*>(&assumed))
+                    + __bfloat162float(val);
+
+        __mt_bfloat16 sum_bf = __float2bfloat16(sum_f);
+        unsigned short sum_as_us = *reinterpret_cast<unsigned short*>(&sum_bf);
+
+        old = atomicCAS(address_as_us, assumed, sum_as_us);
+
+    } while (assumed != old);
+}
+
+// ==================================================================
+// 2. Kernel Functor
+// ==================================================================
+
+typedef struct IndexAddOp {
+public:
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,   // Flattened index for Source
+        const size_t index_len,  // Length of Index tensor
+        const size_t inner_size, // Stride of inner dims
+        const size_t dim_size,   // Size of target dim in Output
+        const float alpha,       // Scale factor
+        const T* source,         // Source Tensor
+        const TIdx* indices,     // Index Tensor
+        T* output                // Output Tensor
+    ) const {
+
+        // --- 1. Decompose the linear index ---
+        size_t inner_idx = curr_idx % inner_size;
+        size_t tmp = curr_idx / inner_size;
+        size_t idx_in_indices = tmp % index_len; // which entry of the index tensor this element maps to
+        size_t outer_idx = tmp / index_len;
+
+        // --- 2. Read source and apply alpha ---
+        T src_val = source[curr_idx];
+        float val_f;
+
+        // Do the multiply in float for all types
+        if constexpr (std::is_same_v<T, half>) {
+            val_f = __half2float(src_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            val_f = __bfloat162float(src_val);
+        } else {
+            val_f = static_cast<float>(src_val);
+        }
+
+        val_f *= alpha;
+
+        // Convert back to T
+        T add_val;
+        if constexpr (std::is_same_v<T, half>) {
+            add_val = __float2half(val_f);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            add_val = __float2bfloat16(val_f);
+        } else {
+            add_val = static_cast<T>(val_f);
+        }
+
+        // --- 3. Read the index and compute the output offset ---
+        TIdx target_dim_idx = indices[idx_in_indices];
+
+        // Handle Python-style negative indices
+        if (target_dim_idx < 0) {
+            target_dim_idx += static_cast<TIdx>(dim_size);
+        }
+
+        // --- 4. Bounds check & atomic accumulate ---
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // output_offset = outer * (dim_size * inner) + target_idx * inner + inner_idx
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+            atomic_add_func(output + out_offset, add_val);
+        }
+    }
+
+} IndexAddOp;
+
+} // namespace op::index_add::moore
+
+#endif // __INDEX_ADD_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
new file mode 100644
index 000000000..619523f47
--- /dev/null
+++ b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
@@ -0,0 +1,164 @@
+#include "index_add_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include
+
+#include
+#include
+
+namespace op::index_add::nvidia {
+
+inline bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *source,
+    const void *indices,
+    const IndexAddInfo &info,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto src_ptr = reinterpret_cast<const T *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // Fetch the geometry
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+    float alpha = info.alpha();
+
+    // Total number of source elements
+    size_t num_source = outer_size * index_len * inner_size;
+
+    // --- Vectorization parameters ---
+    // Goal: each thread reads 128 bits (16 bytes) of source data
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_source % PackSize == 0) &&
+                         is_aligned(source, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized-load kernel ===
+        size_t num_packs = num_source / PackSize;
+
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::index_add::cuda::index_add_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_packs, alpha);
+    } else {
+        // === Path B: scalar kernel ===
+        size_t block_size = 256;
+        size_t grid_size = (num_source + block_size - 1) / block_size;
+
+        op::index_add::cuda::index_add_kernel<T, TIdx>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_source, alpha);
+    }
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc,
+    float alpha) {
+
+    // Build the Info
+    auto info_result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    #define LAUNCH_BY_SIZE(T_STORAGE)                                                \
+        switch (idx_dtype) {                                                         \
+        case INFINI_DTYPE_I32:                                                       \
+            launch_kernel<T_STORAGE, int32_t>(output, source, index, _info, stream); \
+            break;                                                                   \
+        case INFINI_DTYPE_I64:                                                       \
+            launch_kernel<T_STORAGE, int64_t>(output, source, index, _info, stream); \
+            break;                                                                   \
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;                              \
+        }
+
+    switch (dtype) {
+    // 32-bit float
+    case INFINI_DTYPE_F32:
+        LAUNCH_BY_SIZE(float);
+        break;
+    // 64-bit float
+    case INFINI_DTYPE_F64:
+        LAUNCH_BY_SIZE(double);
+        break;
+    // 16-bit half (fp16) -> uses __half
+    case INFINI_DTYPE_F16:
+        LAUNCH_BY_SIZE(__half);
+        break;
+    // 16-bit bfloat16 (bf16) -> uses __nv_bfloat16
+    case INFINI_DTYPE_BF16:
+        LAUNCH_BY_SIZE(__nv_bfloat16);
+        break;
+    // Integers
+    case INFINI_DTYPE_I32:
+        LAUNCH_BY_SIZE(int32_t);
+        break;
+    case INFINI_DTYPE_I64:
+        LAUNCH_BY_SIZE(int64_t);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_add::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh
new file mode 100644
index 000000000..6fe6b7ef9
--- /dev/null
+++ b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __INDEX_ADD_NVIDIA_CUH__
+#define __INDEX_ADD_NVIDIA_CUH__
+
+#include "../index_add.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INDEX_ADD_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/operator.cc b/src/infiniop/ops/index_add/operator.cc
new file mode 100644
index 000000000..44b228c97
--- /dev/null
+++ b/src/infiniop/ops/index_add/operator.cc
@@ -0,0 +1,190 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/index_add.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/index_add_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/index_add_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/index_add_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/index_add_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateIndexAddDescriptor(
+    infiniopHandle_t handle,
+    infiniopIndexAddDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int64_t dim,
+    infiniopTensorDescriptor_t index,
+    infiniopTensorDescriptor_t source,
+    float alpha) {
+
+    #define CREATE(CASE, NAMESPACE)                            \
+    case CASE:                                                 \
+        return op::index_add::NAMESPACE::Descriptor::create(   \
+            handle,                                            \
+            reinterpret_cast(desc_ptr),                        \
+            output,                                            \
+            input,                                             \
+            dim,                                               \
+            index,                                             \
+            source,                                            \
+            alpha)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetIndexAddWorkspaceSize(infiniopIndexAddDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                   \
+    case CASE:                                                     \
+        *size = reinterpret_cast(desc)->workspaceSize();           \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [Key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopIndexAdd(
+    infiniopIndexAddDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                                    \
+    case CASE:                                                                            \
+        return reinterpret_cast(desc)                                                     \
+            ->calculate(workspace, workspace_size, output, input, index, source, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [Key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyIndexAddDescriptor(infiniopIndexAddDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)  \
+    case CASE:                       \
+        delete reinterpret_cast(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc
new file mode 100644
index 000000000..e70863527
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc
@@ -0,0 +1,154 @@
+#include "index_copy_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include
+#include
+#include
+
+namespace op::index_copy::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) { // note: no float alpha parameter
+
+    auto handle = reinterpret_cast(handle_);
+
+    // Build the Info object
+    auto result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // workspace size
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
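The serial loop that follows assigns rather than accumulates, so duplicate indices resolve to the last write, unlike index_add where they sum. A standalone sketch (not part of the patch) contrasting the two semantics:

```cpp
#include <cstdio>

int main() {
    float src[3] = {10, 20, 30};
    int   idx[3] = {1, 1, 3};

    // index_copy: plain assignment, the last write to a duplicate wins
    float out[4] = {0, 0, 0, 0};
    for (int i = 0; i < 3; ++i) out[idx[i]] = src[i];
    std::printf("copy: %g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 20 0 30

    // index_add: duplicates accumulate instead
    float acc[4] = {0, 0, 0, 0};
    for (int i = 0; i < 3; ++i) acc[idx[i]] += src[i];
    std::printf("add:  %g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]); // 0 30 0 30
    return 0;
}
```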
+// ==================================================================
+// Core computation (serial implementation)
+// ==================================================================
+template <typename TData, typename TIdx>
+void calculate_cpu_impl(
+    const IndexCopyInfo &info,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source) {
+
+    // IndexCopy needs neither alpha nor a widened compute type; it is a straight copy
+
+    // 1. Fetch the geometry
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    auto out_ptr = reinterpret_cast<TData *>(output);
+    auto src_ptr = reinterpret_cast<const TData *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(index);
+
+    // -----------------------------------------------------------
+    // Serial loop
+    // -----------------------------------------------------------
+    for (size_t o = 0; o < outer_size; ++o) {
+        for (size_t i = 0; i < index_len; ++i) {
+
+            TIdx idx = idx_ptr[i];
+
+            // Handle negative indices
+            if (idx < 0) idx += static_cast<TIdx>(dim_size);
+
+            // Bounds check
+            if (idx < 0 || static_cast<size_t>(idx) >= dim_size) {
+                continue;
+            }
+
+            // Compute the offsets
+            size_t src_offset = o * index_len * inner_size + i * inner_size;
+            size_t out_offset = o * dim_size * inner_size + static_cast<size_t>(idx) * inner_size;
+
+            // Loop over the inner dimension
+            for (size_t in = 0; in < inner_size; ++in) {
+                // [Core logic]
+                // IndexCopy: output[idx] = source[i]
+                // Plain assignment; no utils::cast widening needed
+                out_ptr[out_offset + in] = src_ptr[src_offset + in];
+            }
+        }
+    }
+}
+
+// ==================================================================
+// Execution (dispatch logic)
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    #define DISPATCH(TDATA, TIDX) \
+        calculate_cpu_impl<TDATA, TIDX>(_info, output, input, index, source); \
+        return INFINI_STATUS_SUCCESS
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int32_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int32_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int32_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int32_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int32_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int32_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int64_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int64_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int64_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int64_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int64_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int64_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+
+    #undef DISPATCH
+}
+
+} // namespace op::index_copy::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h
new file mode 100644
index 000000000..a13369ed9
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __INDEX_COPY_CPU_H__
+#define __INDEX_COPY_CPU_H__
+
+#include "../index_copy.h"
+DESCRIPTOR(cpu)
+
+#endif // __INDEX_COPY_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cuda/kernel.cuh b/src/infiniop/ops/index_copy/cuda/kernel.cuh
new file mode 100644
index 000000000..f022ffdca
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cuda/kernel.cuh
@@ -0,0 +1,136 @@
+#ifndef __INDEX_COPY_CUDA_H__
+#define __INDEX_COPY_CUDA_H__
+
+//#include
+#if defined(__MACA__) || defined(__MACACC__)
+    #include
+    #include
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include
+    #include
+#endif
+
+#include
+
+namespace op::index_copy::cuda {
+
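For reference, each DISPATCH(...) case in the CPU calculate above is a thin wrapper; with the template arguments restored as assumed in this edit, DISPATCH(float, int32_t) expands to:

```cpp
calculate_cpu_impl<float, int32_t>(_info, output, input, index, source);
return INFINI_STATUS_SUCCESS;
```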
+// ==================================================================
+// 1. Aligned vectorized pack
+// ==================================================================
+// Same as in IndexAdd, used for vectorized loads
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 2. Scalar kernel (generic fallback)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size, // product of the dims left of dim
+    size_t inner_size, // product of the dims right of dim
+    size_t dim_size,   // extent of output along dim
+    size_t index_len,  // index length (= source extent along dim)
+    size_t num_source  // total number of source elements
+    // note: float alpha removed
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    // Grid-stride loop over the source tensor
+    for (size_t i = tid; i < num_source; i += stride) {
+        // 1. Convert the linear index i to logical coordinates (outer, idx_idx, inner)
+        //    Source shape: [Outer, IndexLen, Inner]
+        size_t inner_idx = i % inner_size;
+        size_t tmp = i / inner_size;
+        size_t idx_idx = tmp % index_len;
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the index value
+        TIdx target_dim_idx = indices[idx_idx];
+
+        // 3. Handle negative indices (defensive)
+        if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+        // 4. Bounds check and assignment
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Compute the linear output offset
+            // Output shape: [Outer, DimSize, Inner]
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // [Key change]
+            // IndexCopy needs no atomics: plain assignment. If several indices
+            // point at the same slot, the result depends on execution order
+            // (a race), which matches the operator's intended semantics.
+            output[out_offset] = source[i];
+        }
+    }
+}
+
+// ==================================================================
+// 3. Vectorized kernel (optimizes read bandwidth)
+// ==================================================================
+template <typename T, typename TIdx, int PackSize>
+__global__ void index_copy_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size,
+    size_t inner_size,
+    size_t dim_size,
+    size_t index_len,
+    size_t num_packs // number of packs in source
+    // note: float alpha removed
+) {
+    // Reinterpret source as a Pack pointer for vectorized loads
+    using PackType = Pack<T, PackSize>;
+    const PackType *src_vec = reinterpret_cast<const PackType *>(source);
+
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        // Vectorized load (LDG.128)
+        PackType reg_pack = src_vec[i];
+
+        // Starting linear index of this pack within source
+        size_t base_idx = i * PackSize;
+
+        // Unrolled loop over every element in the pack
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            size_t curr_src_idx = base_idx + k;
+
+            // 1. Coordinate transform
+            size_t inner_idx = curr_src_idx % inner_size;
+            size_t tmp = curr_src_idx / inner_size;
+            size_t idx_idx = tmp % index_len;
+            size_t outer_idx = tmp / index_len;
+
+            // 2. Read the index
+            TIdx target_dim_idx = indices[idx_idx];
+
+            if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+            // 3. Assignment
+            if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+                size_t out_offset = outer_idx * (dim_size * inner_size)
+                                  + static_cast<size_t>(target_dim_idx) * inner_size
+                                  + inner_idx;
+
+                // [Key change] direct assignment
+                output[out_offset] = reg_pack.val[k];
+            }
+        }
+    }
+}
+
+} // namespace op::index_copy::cuda
+
+#endif // __INDEX_COPY_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/index_copy.h b/src/infiniop/ops/index_copy/index_copy.h
new file mode 100644
index 000000000..e2ef85f01
--- /dev/null
+++ b/src/infiniop/ops/index_copy/index_copy.h
@@ -0,0 +1,51 @@
+#ifndef __INDEX_COPY_H__
+#define __INDEX_COPY_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the IndexCopyInfo definition
+#define DESCRIPTOR(NAMESPACE)                                 \
+                                                              \
+    namespace op::index_copy::NAMESPACE {                     \
+    class Descriptor final : public InfiniopDescriptor {      \
+        struct Opaque;                                        \
+        Opaque *_opaque;                                      \
+        IndexCopyInfo _info;                                  \
+        size_t _workspace_size;                               \
+                                                              \
+        Descriptor(                                           \
+            Opaque *opaque,                                   \
+            IndexCopyInfo info,                               \
+            size_t workspace_size,                            \
+            infiniDevice_t device_type,                       \
+            int device_id)                                    \
+            : InfiniopDescriptor{device_type, device_id},     \
+              _opaque(opaque),                                \
+              _info(info),                                    \
+              _workspace_size(workspace_size) {}              \
+                                                              \
+    public:                                                   \
+        ~Descriptor();                                        \
+                                                              \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                              \
+        static infiniStatus_t create(                         \
+            infiniopHandle_t handle,                          \
+            Descriptor **desc_ptr,                            \
+            infiniopTensorDescriptor_t out_desc,              \
+            infiniopTensorDescriptor_t in_desc,               \
+            int64_t dim,                                      \
+            infiniopTensorDescriptor_t index_desc,            \
+            infiniopTensorDescriptor_t source_desc); /* note: no alpha */ \
+                                                              \
+        infiniStatus_t calculate(                             \
+            void *workspace,                                  \
+            size_t workspace_size,                            \
+            void *output,                                     \
+            const void *input,                                \
+            const void *index,                                \
+            const void *source,                               \
+            void *stream) const;                              \
+    };                                                        \
+    }
+
+#endif // __INDEX_COPY_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/info.h b/src/infiniop/ops/index_copy/info.h
new file mode 100644
index 000000000..3276752af
--- /dev/null
+++ b/src/infiniop/ops/index_copy/info.h
@@ -0,0 +1,120 @@
+#ifndef __INDEX_COPY_INFO_H__
+#define __INDEX_COPY_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::index_copy {
+
+class IndexCopyInfo {
+    IndexCopyInfo() = default;
+
+public:
+    int _dtype;
+    int _idx_dtype;
+    int64_t _dim;
+
+    size_t _outer_size;
+    size_t _inner_size;
+    size_t _dim_size;
+    size_t _index_len;
+
+    IndexCopyInfo(int dtype, int idx_dtype, int64_t dim,
+                  size_t outer_size, size_t inner_size, size_t dim_size, size_t index_len)
+        : _dtype(dtype), _idx_dtype(idx_dtype), _dim(dim),
+          _outer_size(outer_size), _inner_size(inner_size), _dim_size(dim_size), _index_len(index_len) {}
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    int64_t dim() const { return _dim; }
+
+    size_t outer_size() const { return _outer_size; }
+    size_t inner_size() const { return _inner_size; }
+    size_t dim_size() const { return _dim_size; }
+    size_t index_len() const { return _index_len; }
+
+    static utils::Result<IndexCopyInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        int64_t dim,
+        infiniopTensorDescriptor_t index_desc,
+        infiniopTensorDescriptor_t source_desc) {
+
+        int dtype = in_desc->dtype();
+        if (out_desc->dtype() != dtype || source_desc->dtype() != dtype) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        int idx_dtype = index_desc->dtype();
+        if (idx_dtype != INFINI_DTYPE_I32 && idx_dtype != INFINI_DTYPE_I64) {
+            return
INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + int64_t ndim = static_cast(in_desc->ndim()); + if (dim < 0 || dim >= ndim) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (index_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto &in_shape = in_desc->shape(); + + size_t outer_size = 1; + for (int64_t i = 0; i < dim; ++i) { + outer_size *= in_shape[i]; + } + + size_t inner_size = 1; + for (int64_t i = dim + 1; i < ndim; ++i) { + inner_size *= in_shape[i]; + } + + size_t dim_size = in_shape[dim]; + size_t index_len = index_desc->shape()[0]; + + if (source_desc->ndim() != in_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto &src_shape = source_desc->shape(); + + for (int64_t i = 0; i < ndim; ++i) { + if (i == dim) { + if (src_shape[i] != index_len) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } else { + if (src_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (out_desc->ndim() != in_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + const auto &out_shape = out_desc->shape(); + for (int64_t i = 0; i < ndim; ++i) { + if (out_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + return utils::Result(IndexCopyInfo{ + dtype, + idx_dtype, + dim, + outer_size, + inner_size, + dim_size, + index_len + }); + } +}; + +} // namespace op::index_copy + +#endif // __INDEX_COPY_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.h b/src/infiniop/ops/index_copy/metax/index_copy_metax.h new file mode 100644 index 000000000..d42339a9e --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_METAX_H__ +#define __INDEX_COPY_METAX_H__ + +#include "../index_copy.h" + +DESCRIPTOR(metax) + +#endif // __INDEX_COPY_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca new file mode 100644 index 000000000..01704f436 --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca @@ -0,0 +1,159 @@ +#include "index_copy_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include +#if defined(__MACA__) || defined(__MACACC__) + #include + #include +#endif +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" + +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" + +namespace op::index_copy::metax { + +template +__global__ void index_copy_kernel( + T* output, const T* source, const TIdx* indices, + int outer_size, int inner_size, int index_size, int dim_size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_source = outer_size * index_size * inner_size; + if (idx >= total_source) return; + + int inner = idx % inner_size; + int temp = idx / inner_size; + int i = temp % index_size; + int outer = temp / index_size; + + TIdx idx_pos = indices[i]; + if (idx_pos < 0) idx_pos += dim_size; + + if (idx_pos >= 0 && idx_pos < dim_size) { + int out_offset = outer * (dim_size * inner_size) + idx_pos * inner_size + inner; + output[out_offset] = source[idx]; + } +} + +template +void launch_kernel_impl( + void* output, const void* source, const void* indices, + int outer_size, int inner_size, int index_size, int dim_size, void* stream) +{ + auto hc_stream = reinterpret_cast(stream); + size_t total_elements = (size_t)outer_size * index_size * inner_size; + 
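As a concrete example of the geometry IndexCopyInfo::create derives: for input shape [4, 5, 6], dim = 1 and an index of length 3, source must be [4, 3, 6], and the kernels see outer_size = 4, inner_size = 6, dim_size = 5, index_len = 3. A hedged sketch of the same fold (illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
#include <vector>
struct Geometry { size_t outer = 1, inner = 1, dim_size = 0; };
Geometry fold(const std::vector<size_t> &shape, int64_t dim) {
    Geometry g;
    for (int64_t i = 0; i < dim; ++i) g.outer *= shape[i];                          // dims left of `dim`
    g.dim_size = shape[dim];
    for (int64_t i = dim + 1; i < (int64_t)shape.size(); ++i) g.inner *= shape[i];  // dims right of `dim`
    return g;
}
// fold({4, 5, 6}, 1) -> outer = 4, dim_size = 5, inner = 6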
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.h b/src/infiniop/ops/index_copy/metax/index_copy_metax.h new file mode 100644 index 000000000..d42339a9e --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.h @@ -0,0 +1,8 @@
+#ifndef __INDEX_COPY_METAX_H__
+#define __INDEX_COPY_METAX_H__
+
+#include "../index_copy.h"
+
+DESCRIPTOR(metax)
+
+#endif // __INDEX_COPY_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca new file mode 100644 index 000000000..01704f436 --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca @@ -0,0 +1,159 @@
+#include "index_copy_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <memory>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+#include <algorithm>
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h> // assumed MACA fp16/bf16 header names
+    #include <maca_bf16.h>
+#endif
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::index_copy::metax {
+
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    T *output, const T *source, const TIdx *indices,
+    int outer_size, int inner_size, int index_size, int dim_size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total_source = outer_size * index_size * inner_size;
+    if (idx >= total_source) return;
+
+    int inner = idx % inner_size;
+    int temp = idx / inner_size;
+    int i = temp % index_size;
+    int outer = temp / index_size;
+
+    TIdx idx_pos = indices[i];
+    if (idx_pos < 0) idx_pos += dim_size;
+
+    if (idx_pos >= 0 && idx_pos < dim_size) {
+        int out_offset = outer * (dim_size * inner_size) + idx_pos * inner_size + inner;
+        output[out_offset] = source[idx];
+    }
+}
+
+template <typename T, typename TIdx>
+void launch_kernel_impl(
+    void *output, const void *source, const void *indices,
+    int outer_size, int inner_size, int index_size, int dim_size, void *stream) {
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    size_t total_elements = (size_t)outer_size * index_size * inner_size;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+    index_copy_kernel<T, TIdx><<<grid_size, block_size, 0, hc_stream>>>(
+        reinterpret_cast<T *>(output), reinterpret_cast<const T *>(source), reinterpret_cast<const TIdx *>(indices),
+        outer_size, inner_size, index_size, dim_size);
+}
+
+static size_t get_element_size(int dtype) {
+    if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8;
+    if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4;
+    return 2;
+}
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    int64_t dim;
+    int outer_size, inner_size, index_size, dim_size;
+    size_t total_bytes;
+};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+// [fix 1] create signature matches the header: dim precedes index_desc
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    // Info creation order is out, in, dim, index, source
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+    if (!info_result) return info_result.status();
+
+    auto out_d = reinterpret_cast<InfiniopTensorDescriptor *>(out_desc);
+    auto idx_d = reinterpret_cast<InfiniopTensorDescriptor *>(index_desc);
+
+    int ndim = out_d->ndim();
+    int64_t real_dim = dim < 0 ? dim + ndim : dim;
+
+    int outer = 1;
+    for (int i = 0; i < real_dim; ++i) outer *= out_d->shape()[i];
+    int inner = 1;
+    for (int i = real_dim + 1; i < ndim; ++i) inner *= out_d->shape()[i];
+    int dim_s = out_d->shape()[real_dim];
+    int idx_s = 1;
+    for (int i = 0; i < (int)idx_d->ndim(); ++i) idx_s *= idx_d->shape()[i];
+
+    size_t bytes = (size_t)outer * dim_s * inner * get_element_size(out_d->dtype());
+
+    auto opaque = new Opaque{handle->internal(), dim, outer, inner, idx_s, dim_s, bytes};
+    *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// [fix 2] calculate signature matches the header: explicit input, index, source parameters
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    hcMemcpyAsync(output, input, _opaque->total_bytes, hcMemcpyDeviceToDevice, hc_stream);
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    int outer = _opaque->outer_size;
+    int inner = _opaque->inner_size;
+    int dim_s = _opaque->dim_size;
+    int idx_sz = _opaque->index_size;
+
+    #define LAUNCH(T, TIdx) launch_kernel_impl<T, TIdx>(output, source, index, outer, inner, idx_sz, dim_s, stream)
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F16: LAUNCH(__half, int32_t); break;
+        case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+            LAUNCH(__maca_bfloat16, int32_t);
+#endif
+            break;
+        case INFINI_DTYPE_F32: LAUNCH(float, int32_t); break;
+        case INFINI_DTYPE_F64: LAUNCH(double, int32_t); break;
+        case INFINI_DTYPE_I32: LAUNCH(int32_t, int32_t); break;
+        case INFINI_DTYPE_I64: LAUNCH(int64_t, int32_t); break;
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F16: LAUNCH(__half, int64_t); break;
+        case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+            LAUNCH(__maca_bfloat16, int64_t);
+#endif
+            break;
+        case INFINI_DTYPE_F32: LAUNCH(float, int64_t); break;
+        case INFINI_DTYPE_F64: LAUNCH(double, int64_t); break;
+        case INFINI_DTYPE_I32: LAUNCH(int32_t, int64_t); break;
+        case INFINI_DTYPE_I64: LAUNCH(int64_t, int64_t); break;
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; }
+    #undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::index_copy::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore.h b/src/infiniop/ops/index_copy/moore/index_copy_moore.h new file mode 100644 index 000000000..101a9f3f7 --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore.h @@ -0,0 +1,11 @@
+#ifndef __INDEX_COPY_MOORE_API_H__
+#define __INDEX_COPY_MOORE_API_H__
+
+// Pull in the Descriptor macro and base classes defined one level up
+#include "../index_copy.h"
+
+// Uses the DESCRIPTOR macro from index_copy.h, which generates
+// the op::index_copy::moore::Descriptor class definition
+DESCRIPTOR(moore)
+
+#endif // __INDEX_COPY_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore.mu b/src/infiniop/ops/index_copy/moore/index_copy_moore.mu new file mode 100644 index 000000000..cae6c2b94 --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore.mu @@ -0,0 +1,207 @@
+#include "index_copy_moore.h"
+#include "index_copy_moore_kernel.h" // contains the IndexCopyOp functor definition
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::index_copy::moore {
+
+// ==================================================================
+// 1. Kernel Wrapper Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    const size_t num_elements, // total number of source elements (thread workload)
+    const size_t index_len,    // length of index
+    const size_t inner_size,   // stride
+    const size_t dim_size,     // length of output along dim
+    const T *source,
+    const TIdx *indices,
+    T *output) {
+
+    // idx is the linear index into the source tensor
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_elements) {
+        // use the functor defined in index_copy_moore_kernel.h
+        IndexCopyOp op;
+        op(idx, index_len, inner_size, dim_size, source, indices, output);
+    }
+}
+
+// ==================================================================
+// 2. Launcher Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+void index_copy_moore_launch(
+    const IndexCopyInfo &info,
+    T *output,
+    const T *input,
+    const T *source,
+    const void *indices, // passed as void*, cast internally
+    void *stream) {
+
+    auto musa_stream = (musaStream_t)stream;
+    const TIdx *indices_ptr = static_cast<const TIdx *>(indices);
+
+    // --------------------------------------------------------------
+    // Step 1: copy input -> output
+    // --------------------------------------------------------------
+    // output is initialized with the values of input
+    size_t total_out_elements = info.outer_size() * info.dim_size() * info.inner_size();
+
+    // only copy when the two pointers differ
+    if (output != input) {
+        musaMemcpyAsync(output, input, total_out_elements * sizeof(T), musaMemcpyDeviceToDevice, musa_stream);
+    }
+
+    // --------------------------------------------------------------
+    // Step 2: scatter (source -> output)
+    // --------------------------------------------------------------
+    // Parallelism is determined by the size of source,
+    // whose logical shape is [Outer, IndexLen, Inner]
+    size_t num_src_elements = info.outer_size() * info.index_len() * info.inner_size();
+
+    if (num_src_elements == 0) {
+        return;
+    }
+
+    int threads = 256;
+    int blocks = (num_src_elements + threads - 1) / threads;
+
+    index_copy_kernel<T, TIdx><<<blocks, threads, 0, musa_stream>>>(
+        num_src_elements,
+        info.index_len(),
+        info.inner_size(),
+        info.dim_size(),
+        source,
+        indices_ptr,
+        output);
+}
+
+// ==================================================================
+// 3. Descriptor Implementation
+// ==================================================================
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // validate shapes and dtypes via the Info class
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+
+    if (!info_result) {
+        return info_result.status();
+    }
+
+    *desc_ptr = new Descriptor(
+        nullptr,
+        *info_result,
+        0, // no workspace needed
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // --------------------------------------------------------------
+    // Dispatch macro: data type x index type
+    // --------------------------------------------------------------
+    #define LAUNCH_KERNEL(T) \
+    do { \
+        if (_info.idx_dtype() == INFINI_DTYPE_I32) { \
+            index_copy_moore_launch<T, int32_t>( \
+                _info, \
+                static_cast<T *>(output), \
+                static_cast<const T *>(input), \
+                static_cast<const T *>(source), \
+                index, \
+                stream); \
+        } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \
+            index_copy_moore_launch<T, int64_t>( \
+                _info, \
+                static_cast<T *>(output), \
+                static_cast<const T *>(input), \
+                static_cast<const T *>(source), \
+                index, \
+                stream); \
+        } else { \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        } \
+    } while (0)
+
+    // --------------------------------------------------------------
+    // Dispatch on data type
+    // --------------------------------------------------------------
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        LAUNCH_KERNEL(half);
+        break;
+
+    case INFINI_DTYPE_BF16:
+        LAUNCH_KERNEL(__mt_bfloat16);
+        break;
+
+    case INFINI_DTYPE_F32:
+        LAUNCH_KERNEL(float);
+        break;
+
+    case INFINI_DTYPE_F64:
+        LAUNCH_KERNEL(double);
+        break;
+
+    case INFINI_DTYPE_I32:
+        LAUNCH_KERNEL(int32_t);
+        break;
+
+    case INFINI_DTYPE_I64:
+        LAUNCH_KERNEL(int64_t);
+        break;
+
+    case INFINI_DTYPE_I8:
+        LAUNCH_KERNEL(int8_t);
+        break;
+
+    case INFINI_DTYPE_U8:
+        LAUNCH_KERNEL(uint8_t);
+        break;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_KERNEL
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_copy::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h b/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h new file mode 100644 index 000000000..223d9196c --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h @@ -0,0 +1,73 @@
+#ifndef __INDEX_COPY_MOORE_KERNEL_H__
+#define __INDEX_COPY_MOORE_KERNEL_H__
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h> // required for the __mt_bfloat16 definition
+
+#include <type_traits> // for std::is_same_v
+
+namespace op::index_copy::moore {
+
+typedef struct IndexCopyOp {
+public:
+    // IndexCopy involves two types: the data type (T) and the index type (TIdx).
+    // Logic: Output[..., indices[i], ...] = Source[..., i, ...]
+
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,    // linear index of the source element handled by this thread
+
+        // geometry (from IndexCopyInfo)
+        const size_t index_len,   // length of the index vector (= source length along dim)
+        const size_t inner_size,  // stride of the dims right of dim
+        const size_t dim_size,    // length of output along dim (for offsets and bounds checks)
+
+        // pointers
+        const T *source_data,     // source data (input source)
+        const TIdx *indices_data, // index data
+        T *output_data            // output data (in/out, normally pre-initialized from input)
+    ) const {
+
+        // 1. Coordinate mapping (flat index -> multi-dim index).
+        //    Source logical shape: [Outer, IndexLen, Inner];
+        //    decompose curr_idx into (outer, idx_in_indices, inner)
+        size_t inner_idx = curr_idx % inner_size;
+        size_t tmp = curr_idx / inner_size;
+        size_t idx_in_indices = tmp % index_len; // which entry of the indices tensor is being processed
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the source value
+        T src_val = source_data[curr_idx];
+
+        // 3. Fetch the target index
+        TIdx target_dim_idx = indices_data[idx_in_indices];
+
+        // 4. Handle negative indices (Python style: e.g. -1 means the last element)
+        if (target_dim_idx < 0) {
+            target_dim_idx += static_cast<TIdx>(dim_size);
+        }
+
+        // 5. Bounds check and assignment
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Linear offset into output.
+            // Output logical shape: [Outer, DimSize, Inner];
+            // offset = outer part + dim part + inner part
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // Plain store (scatter).
+            // On the MUSA platform half/bf16/float all support operator=.
+            // With duplicate indices the result is non-deterministic (last write wins),
+            // so no atomics are needed.
+            output_data[out_offset] = src_val;
+        }
+        // Out-of-range indices are silently ignored, as IndexCopy conventionally does.
+    }
+
+} IndexCopyOp;
+
+} // namespace op::index_copy::moore
+
+#endif // __INDEX_COPY_MOORE_KERNEL_H__
\ No newline at end of file
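Step 4's wrap-around mirrors Python-style indexing: with dim_size = 5, an index of -1 lands on row 4, while -6 remains out of bounds and is skipped. A tiny sketch of the same normalization (illustrative, not part of the patch):

#include <cstdint>
inline int64_t normalize(int64_t idx, int64_t dim_size) {
    if (idx < 0) idx += dim_size;                    // -1 -> dim_size - 1
    return (idx >= 0 && idx < dim_size) ? idx : -1;  // -1 here signals "skip"
}
// normalize(-1, 5) == 4; normalize(-6, 5) == -1 (out of bounds, ignored)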
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu new file mode 100644 index 000000000..27f574697 --- /dev/null +++ b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu @@ -0,0 +1,179 @@
+#include "index_copy_nvidia.cuh"
+#include "../cuda/kernel.cuh" // assumed path of the shared kernel header (aka index_copy_cuda.h)
+#include "../../../handle.h"
+#include <cstdint>
+
+// [key] CUDA floating-point type definitions
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+namespace op::index_copy::nvidia {
+
+// ==================================================================
+// Helper: check pointer alignment
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *source,
+    const void *indices,
+    const IndexCopyInfo &info,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto src_ptr = reinterpret_cast<const T *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // geometry (no alpha)
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    // total number of source elements
+    size_t num_source = outer_size * index_len * inner_size;
+
+    // --- Vectorization setup ---
+    // Goal: each thread loads 128 bits (16 bytes) of source data.
+    // IndexCopy reads source contiguously, which suits vectorized loads.
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+
+    // Eligibility for the vectorized path:
+    // 1. PackSize > 1
+    // 2. the source element count divides evenly by PackSize (no tail handling)
+    // 3. the source pointer is aligned (required for vectorized loads)
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_source % PackSize == 0) &&
+                         is_aligned<T>(source, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized-load kernel ===
+        size_t num_packs = num_source / PackSize;
+
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::index_copy::cuda::index_copy_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_packs
+                // note: alpha removed
+            );
+    } else {
+        // === Path B: scalar kernel ===
+        size_t block_size = 256;
+        size_t grid_size = (num_source + block_size - 1) / block_size;
+
+        op::index_copy::cuda::index_copy_kernel<T, TIdx>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_source
+                // note: alpha removed
+            );
+    }
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) { // note: the alpha parameter was removed
+
+    // build the Info
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc); // no alpha
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    // Macro: instantiate launch_kernel for T_STORAGE,
+    // which will be float, double, int32_t, __half, __nv_bfloat16
+    #define LAUNCH_BY_SIZE(T_STORAGE) \
+    switch (idx_dtype) { \
+    case INFINI_DTYPE_I32: \
+        launch_kernel<T_STORAGE, int32_t>(output, source, index, _info, stream); \
+        break; \
+    case INFINI_DTYPE_I64: \
+        launch_kernel<T_STORAGE, int64_t>(output, source, index, _info, stream); \
+        break; \
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+    }
+
+    switch (dtype) {
+    // 32-bit float
+    case INFINI_DTYPE_F32:
+        LAUNCH_BY_SIZE(float);
+        break;
+    // 64-bit float
+    case INFINI_DTYPE_F64:
+        LAUNCH_BY_SIZE(double);
+        break;
+    // 16-bit half (fp16) -> __half
+    case INFINI_DTYPE_F16:
+        LAUNCH_BY_SIZE(__half);
+        break;
+    // 16-bit bfloat16 (bf16) -> __nv_bfloat16
+    case INFINI_DTYPE_BF16:
+        LAUNCH_BY_SIZE(__nv_bfloat16);
+        break;
+    // integers
+    case INFINI_DTYPE_I32:
+        LAUNCH_BY_SIZE(int32_t);
+        break;
+    case INFINI_DTYPE_I64:
+        LAUNCH_BY_SIZE(int64_t);
+        break;
+
+    // add cases here if more integer types (I8, U8, ...) are needed
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_copy::nvidia
\ No newline at end of file
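To make the eligibility rule concrete: with T = float a pack holds 4 elements, so num_source = 1024 and a 16-byte-aligned source take path A with 256 packs; half-precision tensors pack 8 elements. The same check in isolation (illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
template <typename T>
bool can_use_128bit_packs(const void *src, size_t num_source) {
    constexpr size_t kPack = 16 / sizeof(T);                // elements per 128-bit pack
    return kPack > 1 && num_source % kPack == 0             // no tail to handle
        && reinterpret_cast<uintptr_t>(src) % 16 == 0;      // aligned loads required
}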
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh new file mode 100644 index 000000000..296737294 --- /dev/null +++ b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh @@ -0,0 +1,8 @@
+#ifndef __INDEX_COPY_NVIDIA_CUH__
+#define __INDEX_COPY_NVIDIA_CUH__
+
+#include "../index_copy.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INDEX_COPY_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/operator.cc b/src/infiniop/ops/index_copy/operator.cc new file mode 100644 index 000000000..213189cf7 --- /dev/null +++ b/src/infiniop/ops/index_copy/operator.cc @@ -0,0 +1,189 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/index_copy.h"
+
+// --- backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/index_copy_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/index_copy_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/index_copy_metax.h"
+#endif
+
+// [new] Moore backend header
+#ifdef ENABLE_MOORE_API
+#include "moore/index_copy_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateIndexCopyDescriptor(
+    infiniopHandle_t handle,
+    infiniopIndexCopyDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int64_t dim,
+    infiniopTensorDescriptor_t index,
+    infiniopTensorDescriptor_t source) {
+
+    #define CREATE(CASE, NAMESPACE) \
+    case CASE: \
+        return op::index_copy::NAMESPACE::Descriptor::create( \
+            handle, \
+            reinterpret_cast<op::index_copy::NAMESPACE::Descriptor **>(desc_ptr), \
+            output, \
+            input, \
+            dim, \
+            index, \
+            source)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetIndexCopyWorkspaceSize(infiniopIndexCopyDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE) \
+    case CASE: \
+        *size = reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Execute (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopIndexCopy(
+    infiniopIndexCopyDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, index, source, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyIndexCopyDescriptor(infiniopIndexCopyDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
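A hedged sketch of the full call sequence this C API enables (illustrative; the handle, tensor descriptors, device buffers and stream are assumed created elsewhere, and CHECK is a hypothetical status-checking macro defined here for the sketch):

#include "infiniop.h"
#define CHECK(call) do { if ((call) != INFINI_STATUS_SUCCESS) return; } while (0)
void run_index_copy(infiniopHandle_t handle,
                    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t in_desc,
                    infiniopTensorDescriptor_t index_desc, infiniopTensorDescriptor_t source_desc,
                    void *d_out, const void *d_in, const void *d_index, const void *d_source,
                    void *stream) {
    infiniopIndexCopyDescriptor_t desc;
    CHECK(infiniopCreateIndexCopyDescriptor(handle, &desc, out_desc, in_desc, /*dim=*/1, index_desc, source_desc));
    size_t ws_size = 0;
    CHECK(infiniopGetIndexCopyWorkspaceSize(desc, &ws_size)); // 0 for all the backends above
    CHECK(infiniopIndexCopy(desc, /*workspace=*/nullptr, ws_size, d_out, d_in, d_index, d_source, stream));
    CHECK(infiniopDestroyIndexCopyDescriptor(desc));
}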
diff --git a/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc new file mode 100644 index 000000000..c8ccced6f --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc @@ -0,0 +1,148 @@
+#include "smooth_l1_loss_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::smooth_l1_loss::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc,
+    float beta,
+    int reduction) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Core computation
+// ==================================================================
+template <typename T>
+void calculate_cpu_impl(
+    const SmoothL1LossInfo &info,
+    void *output,
+    const void *input,
+    const void *target) {
+
+    size_t numel = info.numel();
+    float beta = info.beta();
+    int reduction = info.reduction();
+
+    float inv_beta = (beta > 0) ? (1.0f / beta) : 0.0f;
+    float half_beta = 0.5f * beta;
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto tar_ptr = reinterpret_cast<const T *>(target);
+
+    // ----------------------------------------------------
+    // Mode A: elementwise (none)
+    // ----------------------------------------------------
+    if (reduction == 0) {
+        #pragma omp parallel for schedule(static)
+        for (size_t i = 0; i < numel; ++i) {
+            float in_val = utils::cast<float>(in_ptr[i]);
+            float tar_val = utils::cast<float>(tar_ptr[i]);
+
+            float diff = std::abs(in_val - tar_val);
+            float loss;
+
+            if (diff < beta) {
+                loss = 0.5f * diff * diff * inv_beta;
+            } else {
+                loss = diff - half_beta;
+            }
+
+            // [key] compute in float, then cast back to T for storage
+            out_ptr[i] = utils::cast<T>(loss);
+        }
+    }
+    // ----------------------------------------------------
+    // Mode B: reduction (mean / sum)
+    // ----------------------------------------------------
+    else {
+        double total_sum = 0.0;
+
+        #pragma omp parallel for reduction(+:total_sum) schedule(static)
+        for (size_t i = 0; i < numel; ++i) {
+            float in_val = utils::cast<float>(in_ptr[i]);
+            float tar_val = utils::cast<float>(tar_ptr[i]);
+
+            float diff = std::abs(in_val - tar_val);
+            float loss;
+
+            if (diff < beta) {
+                loss = 0.5f * diff * diff * inv_beta;
+            } else {
+                loss = diff - half_beta;
+            }
+
+            total_sum += static_cast<double>(loss);
+        }
+
+        if (reduction == 1) { // mean
+            total_sum /= static_cast<double>(numel);
+        }
+
+        // the reduced result is written to position 0
+        out_ptr[0] = utils::cast<T>(static_cast<float>(total_sum));
+    }
+}
+
+// ==================================================================
+// Dispatch
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    #define DISPATCH_TYPE(T) \
+        calculate_cpu_impl<T>(_info, output, input, target); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        DISPATCH_TYPE(float);
+    case INFINI_DTYPE_F64:
+        DISPATCH_TYPE(double);
+    case INFINI_DTYPE_F16:
+        DISPATCH_TYPE(fp16_t);
+    case INFINI_DTYPE_BF16:
+        DISPATCH_TYPE(bf16_t);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    #undef DISPATCH_TYPE
+}
+
+} // namespace op::smooth_l1_loss::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h new file mode 100644 index 000000000..ee113c858 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h @@ -0,0 +1,9 @@
+#ifndef __SMOOTH_L1_LOSS_CPU_H__
+#define __SMOOTH_L1_LOSS_CPU_H__
+
+#include "../smooth_l1_loss.h"
+
+// Instantiate the op::smooth_l1_loss::cpu::Descriptor declaration via the macro
+DESCRIPTOR(cpu)
+
+#endif // __SMOOTH_L1_LOSS_CPU_H__
\ No newline at end of file
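Numerically, with beta = 1: |diff| = 0.5 falls in the quadratic zone and yields 0.5 * 0.25 = 0.125, while |diff| = 2 is in the linear zone and yields 2 - 0.5 = 1.5. A self-contained check of the piecewise formula (illustrative, not part of the patch):

#include <cassert>
#include <cmath>
static float smooth_l1(float x, float y, float beta) {
    float d = std::fabs(x - y);
    return d < beta ? 0.5f * d * d / beta : d - 0.5f * beta;
}
int main() {
    assert(std::fabs(smooth_l1(0.5f, 0.0f, 1.0f) - 0.125f) < 1e-6f); // quadratic zone
    assert(std::fabs(smooth_l1(2.0f, 0.0f, 1.0f) - 1.5f)   < 1e-6f); // linear zone
}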
diff --git a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh new file mode 100644 index 000000000..29d8d0acf --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh @@ -0,0 +1,148 @@
+#ifndef __SMOOTH_L1_LOSS_CUDA_CUH__
+#define __SMOOTH_L1_LOSS_CUDA_CUH__
+
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h> // assumed MACA header names
+    #include <maca_bf16.h>
+#else
+    #include <cuda_runtime.h>
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstddef>
+
+namespace op::smooth_l1_loss::cuda {
+
+// ==================================================================
+// Basic definitions
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// Reduction helpers (warp & block)
+// ==================================================================
+__device__ __forceinline__ float warpReduceSum(float val) {
+    unsigned int mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+        val += __shfl_down_sync(mask, val, offset);
+    return val;
+}
+
+__device__ __forceinline__ float blockReduceSum(float val) {
+    static __shared__ float shared[32]; // max 1024 threads / 32 warps
+    int lane = threadIdx.x % warpSize;
+    int wid = threadIdx.x / warpSize;
+
+    val = warpReduceSum(val);
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f;
+    if (wid == 0) val = warpReduceSum(val);
+    return val;
+}
+
+// ==================================================================
+// Functor
+// ==================================================================
+struct SmoothL1LossFunctor {
+    float beta;
+    float inv_beta;
+    float half_beta;
+
+    __host__ __device__ SmoothL1LossFunctor(float beta_val)
+        : beta(beta_val), inv_beta(1.0f / beta_val), half_beta(0.5f * beta_val) {}
+
+    template <typename T>
+    __device__ __forceinline__ float compute(const T &input, const T &target) const {
+        float in_f = static_cast<float>(input);
+        float tg_f = static_cast<float>(target);
+        float diff = in_f - tg_f;
+        float abs_diff = fabsf(diff);
+
+        if (abs_diff < beta) {
+            return 0.5f * diff * diff * inv_beta;
+        } else {
+            return abs_diff - half_beta;
+        }
+    }
+};
+
+// ==================================================================
+// 1. Elementwise kernels (reduction='none')
+// ==================================================================
+template <typename T>
+__global__ void smooth_l1_loss_kernel(
+    T * __restrict__ output, const T * __restrict__ input, const T * __restrict__ target,
+    size_t numel, SmoothL1LossFunctor functor) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        output[idx] = static_cast<T>(functor.compute(input[idx], target[idx]));
+    }
+}
+
+template <typename T, int PackSize>
+__global__ void smooth_l1_loss_kernel_vectorized(
+    T * __restrict__ output, const T * __restrict__ input, const T * __restrict__ target,
+    size_t num_packs, SmoothL1LossFunctor functor) {
+    using PackType = Pack<T, PackSize>;
+    auto out_vec = reinterpret_cast<PackType *>(output);
+    auto in_vec = reinterpret_cast<const PackType *>(input);
+    auto tar_vec = reinterpret_cast<const PackType *>(target);
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_packs) {
+        PackType in_pack = in_vec[idx];
+        PackType tar_pack = tar_vec[idx];
+        PackType out_pack;
+        #pragma unroll
+        for (int i = 0; i < PackSize; ++i) {
+            out_pack.val[i] = static_cast<T>(functor.compute(in_pack.val[i], tar_pack.val[i]));
+        }
+        out_vec[idx] = out_pack;
+    }
+}
+
+// ==================================================================
+// 2. Reduction kernel (reduction='mean' / 'sum')
+// ==================================================================
+// Simple atomicAdd-based global reduction
+template <typename T>
+__global__ void smooth_l1_loss_reduce_kernel(
+    float *output, // accumulate in float to avoid overflow / precision loss
+    const T * __restrict__ input,
+    const T * __restrict__ target,
+    size_t numel,
+    SmoothL1LossFunctor functor,
+    float scale // pass 1/N in mean mode, 1.0 in sum mode
+) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+    float local_sum = 0.0f;
+
+    // grid-stride loop
+    for (size_t i = idx; i < numel; i += stride) {
+        local_sum += functor.compute(input[i], target[i]);
+    }
+
+    // block reduction
+    float block_sum = blockReduceSum(local_sum);
+
+    // global atomic add
+    if (threadIdx.x == 0) {
+        atomicAdd(output, block_sum * scale);
+    }
+}
+
+// Helper kernel: cast the float result back to T and write it to output
+template <typename T>
+__global__ void cast_float_to_t(T *output, const float *src) {
+    *output = static_cast<T>(*src);
+}
+
+} // namespace op::smooth_l1_loss::cuda
+
+#endif // __SMOOTH_L1_LOSS_CUDA_CUH__
\ No newline at end of file
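A minimal standalone demo of the same warp-shuffle + atomicAdd reduction pattern (illustrative sketch, not part of the patch; assumes blockDim.x is a multiple of warpSize, as with 256 above):

#include <cuda_runtime.h>
#include <cstdio>
__device__ float warp_sum(float v) {
    for (int o = warpSize / 2; o > 0; o /= 2) v += __shfl_down_sync(0xffffffff, v, o);
    return v;
}
__global__ void sum_kernel(const float *x, float *out, size_t n) {
    __shared__ float smem[32];
    float v = 0.f;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)blockDim.x * gridDim.x) v += x[i];
    v = warp_sum(v);                                               // intra-warp tree
    if (threadIdx.x % warpSize == 0) smem[threadIdx.x / warpSize] = v;
    __syncthreads();
    if (threadIdx.x < warpSize) {                                  // warp 0 folds the partials
        v = (threadIdx.x < blockDim.x / warpSize) ? smem[threadIdx.x] : 0.f;
        v = warp_sum(v);
        if (threadIdx.x == 0) atomicAdd(out, v);                   // one atomic per block
    }
}
int main() {
    const size_t n = 1 << 20;
    float *x, *out;
    cudaMallocManaged(&x, n * sizeof(float));
    cudaMallocManaged(&out, sizeof(float));
    for (size_t i = 0; i < n; ++i) x[i] = 1.0f;
    *out = 0.0f;
    sum_kernel<<<256, 256>>>(x, out, n);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expect %zu)\n", *out, n);
    cudaFree(x); cudaFree(out);
}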
diff --git a/src/infiniop/ops/smooth_l1_loss/info.h b/src/infiniop/ops/smooth_l1_loss/info.h new file mode 100644 index 000000000..475dd7a95 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/info.h @@ -0,0 +1,93 @@
+#ifndef __SMOOTH_L1_LOSS_INFO_H__
+#define __SMOOTH_L1_LOSS_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::smooth_l1_loss {
+
+class SmoothL1LossInfo {
+    SmoothL1LossInfo() = default;
+
+public:
+    int _dtype;     // data type (float, half, etc.)
+    size_t _numel;  // number of elements in the computation (input.numel())
+    float _beta;    // smoothing threshold
+    int _reduction; // reduction mode (0: none, 1: mean, 2: sum)
+
+    SmoothL1LossInfo(int dtype, size_t numel, float beta, int reduction)
+        : _dtype(dtype), _numel(numel), _beta(beta), _reduction(reduction) {}
+
+    int dtype() const { return _dtype; }
+    size_t numel() const { return _numel; }
+    float beta() const { return _beta; }
+    int reduction() const { return _reduction; }
+
+    static utils::Result<SmoothL1LossInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        infiniopTensorDescriptor_t target_desc,
+        float beta,
+        int reduction) {
+
+        // 1. Input and target must share a dtype
+        if (input_desc->dtype() != target_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Output dtype must match input dtype
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Input and target shapes must match exactly
+        //    (SmoothL1Loss is elementwise over input/target)
+        if (input_desc->ndim() != target_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &in_shape = input_desc->shape();
+        const auto &tar_shape = target_desc->shape();
+        size_t numel = input_desc->numel();
+
+        for (size_t i = 0; i < input_desc->ndim(); ++i) {
+            if (in_shape[i] != tar_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 4. Output shape must agree with the reduction mode
+        //    (reduction enum: 0 = none, 1 = mean, 2 = sum)
+        if (reduction == 0) {
+            // none -> output shape must equal input shape
+            if (out_desc->ndim() != input_desc->ndim()) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+            const auto &out_shape = out_desc->shape();
+            for (size_t i = 0; i < input_desc->ndim(); ++i) {
+                if (out_shape[i] != in_shape[i]) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            }
+        } else {
+            // mean/sum -> the output is a scalar,
+            // which may be ndim = 0 or simply numel = 1
+            if (out_desc->numel() != 1) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 5. Validate beta (must be non-negative)
+        if (beta < 0) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+        return utils::Result<SmoothL1LossInfo>(SmoothL1LossInfo{
+            input_desc->dtype(), // _dtype
+            numel,               // _numel
+            beta,                // _beta
+            reduction            // _reduction
+        });
+    }
+};
+
+} // namespace op::smooth_l1_loss
+
+#endif // __SMOOTH_L1_LOSS_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h new file mode 100644 index 000000000..cf731d377 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h @@ -0,0 +1,8 @@
+#ifndef __SMOOTH_L1_LOSS_METAX_H__
+#define __SMOOTH_L1_LOSS_METAX_H__
+
+#include "../smooth_l1_loss.h"
+
+DESCRIPTOR(metax)
+
+#endif // __SMOOTH_L1_LOSS_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca new file mode 100644 index 000000000..a1ba08bfe --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca @@ -0,0 +1,207 @@
+#include "smooth_l1_loss_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <memory>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+#include <algorithm>
+
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+namespace op::smooth_l1_loss::metax {
+
+// ==================================================================
+// Atomic Helpers
+// ==================================================================
+
+template <typename T>
+__device__ __forceinline__ void gpuAtomicAdd(T *address, T val) { atomicAdd(address, val); }
+// Float atomic add is usually native, but explicitly defined just in case
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(float *address, float val) { atomicAdd(address, val); }
+
+// ==================================================================
+// Stride Helper
+// ==================================================================
+struct TensorShape {
+    int ndim;
+    int dims[4];
+    int strides[4];
+};
+
+__device__ inline size_t get_offset(int idx, const TensorShape &shape) {
+    if (shape.ndim == 0) return 0;
+    size_t offset = 0;
+    int rem = idx;
+    #pragma unroll
+    for (int i = shape.ndim - 1; i >= 0; --i) {
+        int dim_sz = shape.dims[i];
+        int pos = rem % dim_sz;
+        rem /= dim_sz;
+        offset += pos * shape.strides[i];
+    }
+    return offset;
+}
+
+// ==================================================================
+// Conversion Kernel
+// ==================================================================
+// Cast the float accumulator back to the target type T and write it to output
+template <typename T>
+__global__ void cast_float_to_output(T *dest, const float *src) {
+    *dest = static_cast<T>(*src);
+}
+
+// ==================================================================
+// Main Kernel
+// ==================================================================
+
+// AccT: accumulator type; forced to float in reduction mode to preserve precision.
+template <typename T, typename AccT, int ReductionMode>
+__global__ void smooth_l1_loss_kernel(
+    AccT *output, // float* (workspace) when reducing, T* (output) in none mode
+    const T *input,
+    const T *target,
+    size_t n,
+    float beta,
+    float scale,
+    TensorShape in_s, TensorShape tg_s, TensorShape out_s) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    size_t in_off = get_offset(idx, in_s);
+    size_t tg_off = get_offset(idx, tg_s);
+
+    // compute entirely in float for accuracy
+    float val_in = static_cast<float>(input[in_off]);
+    float val_tg = static_cast<float>(target[tg_off]);
+    float diff = val_in - val_tg;
+    float abs_diff = fabsf(diff);
+    float loss = (abs_diff < beta) ? (0.5f * diff * diff / beta) : (abs_diff - 0.5f * beta);
+
+    if constexpr (ReductionMode == 0) { // none
+        size_t out_off = get_offset(idx, out_s);
+        output[out_off] = static_cast<AccT>(loss);
+    } else { // sum or mean
+        loss *= scale;
+        // atomic accumulation in AccT (float)
+        gpuAtomicAdd(output, static_cast<AccT>(loss));
+    }
+}
+
+// ==================================================================
+// Launcher
+// ==================================================================
+
+template <typename T>
+void launch_kernel_impl(
+    void *output, void *workspace,
+    const void *input, const void *target,
+    size_t n, float beta, int reduction,
+    const TensorShape &in_s, const TensorShape &tg_s, const TensorShape &out_s,
+    void *stream) {
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+    size_t grid = (n + 255) / 256;
+    float scale = (reduction == 1) ? (1.0f / static_cast<float>(n)) : 1.0f;
+
+    if (reduction == 0) { // none
+        // write straight to output (T*)
+        smooth_l1_loss_kernel<T, T, 0><<<grid, 256, 0, mc_stream>>>(
+            (T *)output, (const T *)input, (const T *)target, n, beta, scale, in_s, tg_s, out_s);
+    } else { // reduction
+        // 1. accumulate into the float workspace to avoid fp16 precision loss
+        smooth_l1_loss_kernel<T, float, 1><<<grid, 256, 0, mc_stream>>>(
+            (float *)workspace, (const T *)input, (const T *)target, n, beta, scale, in_s, tg_s, out_s);
+
+        // 2. cast the float result from the workspace into output (T*)
+        cast_float_to_output<<<1, 1, 0, mc_stream>>>((T *)output, (const float *)workspace);
+    }
+}
+
+// ==================================================================
+// Descriptor
+// ==================================================================
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    float beta;
+    int reduction;
+    size_t total_elements;
+    TensorShape in_shape;
+    TensorShape tg_shape;
+    TensorShape out_shape;
+};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+static TensorShape extract_shape(infiniopTensorDescriptor_t desc) {
+    auto d = reinterpret_cast<InfiniopTensorDescriptor *>(desc);
+    TensorShape s;
+    s.ndim = d->ndim();
+    if (s.ndim > 4) s.ndim = 4;
+    for (int i = 0; i < s.ndim; ++i) {
+        s.dims[i] = d->shape()[i];
+        s.strides[i] = d->strides()[i];
+    }
+    if (d->ndim() == 0) { s.ndim = 1; s.dims[0] = 1; s.strides[0] = 1; }
+    return s;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc, float beta, int reduction) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    auto in_d = reinterpret_cast<InfiniopTensorDescriptor *>(input_desc);
+    size_t total = 1;
+    for (int i = 0; i < (int)in_d->ndim(); ++i) total *= in_d->shape()[i];
+
+    auto opaque = new Opaque{
+        handle->internal(), beta, reduction, total,
+        extract_shape(input_desc), extract_shape(target_desc), extract_shape(out_desc)};
+
+    // [key] reduction mode needs a 4-byte workspace to hold the float accumulator
+    size_t ws_size = (reduction != 0) ? sizeof(float) : 0;
+
+    *desc_ptr = new Descriptor(opaque, info_result.take(), ws_size, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size, void *output,
+    const void *input, const void *target, void *stream) const {
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    if (_opaque->reduction != 0) {
+        // reduction mode: zero the float workspace (not output)
+        // so that accumulation starts from 0.0f
+        mcMemsetAsync(workspace, 0, sizeof(float), mc_stream);
+    }
+
+    size_t n = _opaque->total_elements;
+    float beta = _opaque->beta;
+    int reduction = _opaque->reduction;
+
+    #define LAUNCH(T) launch_kernel_impl<T>(output, workspace, input, target, n, beta, reduction, _opaque->in_shape, _opaque->tg_shape, _opaque->out_shape, stream)
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16: LAUNCH(__half); break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        LAUNCH(__maca_bfloat16);
+#endif
+        break;
+    case INFINI_DTYPE_F32: LAUNCH(float); break;
+    case INFINI_DTYPE_F64: LAUNCH(double); break;
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    #undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::smooth_l1_loss::metax
\ No newline at end of file
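The float workspace exists because accumulating a long sum in a narrow type drifts; the effect is far larger for fp16, hence the float accumulator plus a single final cast. A host-side analogy using float vs. double (illustrative, not part of the patch):

#include <cstdio>
int main() {
    float  f = 0.0f;
    double d = 0.0;
    for (int i = 0; i < 10000000; ++i) { f += 0.1f; d += 0.1; }
    std::printf("float sum = %.1f, double sum = %.1f\n", f, d); // the float sum drifts visibly
}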
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h new file mode 100644 index 000000000..c7358e518 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h @@ -0,0 +1,11 @@
+#ifndef __SMOOTH_L1_LOSS_MOORE_API_H__
+#define __SMOOTH_L1_LOSS_MOORE_API_H__
+
+// Pull in the Descriptor macro and base classes defined one level up
+#include "../smooth_l1_loss.h"
+
+// Uses the DESCRIPTOR macro from smooth_l1_loss.h, which generates
+// the op::smooth_l1_loss::moore::Descriptor class definition
+DESCRIPTOR(moore)
+
+#endif // __SMOOTH_L1_LOSS_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu new file mode 100644 index 000000000..5b7dbcb0a --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu @@ -0,0 +1,104 @@
+#include "smooth_l1_loss_moore.h"
+#include "smooth_l1_loss_moore_kernel.h"
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::smooth_l1_loss::moore {
+
+template <typename T>
+void smooth_l1_loss_moore_launch(
+    const SmoothL1LossInfo &info,
+    T *output,
+    const T *input,
+    const T *target,
+    void *stream) {
+
+    auto musa_stream = (musaStream_t)stream;
+    size_t numel = info.numel();
+    int reduction = info.reduction(); // 0: none, 1: mean, 2: sum
+
+    int threads = 256;
+    // Calculate blocks
+    int blocks = (numel + threads - 1) / threads;
+    // Cap blocks to avoid overhead for huge tensors (standard practice)
+    if (blocks > 1024) blocks = 1024;
+
+    if (reduction == 0) {
+        // --- None (elementwise) ---
+        // Just use simple grid mapping
+        int simple_blocks = (numel + threads - 1) / threads;
+        smooth_l1_loss_elementwise_kernel<<<simple_blocks, threads, 0, musa_stream>>>(
+            numel, info.beta(), input, target, output);
+    } else {
+        // --- Mean / Sum (reduction) ---
+
+        // 1. Zero out output (atomic accumulation destination)
+        musaMemsetAsync(output, 0, sizeof(T), musa_stream);
+
+        // 2. Launch reduction kernel.
+        //    Shared memory size: threads * sizeof(float), because we accumulate in float32
+        size_t smem_size = threads * sizeof(float);
+
+        smooth_l1_loss_reduce_kernel<<<blocks, threads, smem_size, musa_stream>>>(
+            numel, info.beta(), input, target, output);
+
+        // 3. Post-processing for mean
+        if (reduction == 1) {
+            avg_scaling_kernel<<<1, 1, 0, musa_stream>>>(output, numel);
+        }
+    }
+}
+
+// ... (Descriptor implementation remains unchanged) ...
+// Ensure Descriptor::~Descriptor(), create, calculate are still there as before.
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc,
+    float beta,
+    int reduction) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(nullptr, *info_result, 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        smooth_l1_loss_moore_launch<half>(_info, static_cast<half *>(output), static_cast<const half *>(input), static_cast<const half *>(target), stream); break;
+    case INFINI_DTYPE_BF16:
+        smooth_l1_loss_moore_launch<__mt_bfloat16>(_info, static_cast<__mt_bfloat16 *>(output), static_cast<const __mt_bfloat16 *>(input), static_cast<const __mt_bfloat16 *>(target), stream); break;
+    case INFINI_DTYPE_F32:
+        smooth_l1_loss_moore_launch<float>(_info, static_cast<float *>(output), static_cast<const float *>(input), static_cast<const float *>(target), stream); break;
+    case INFINI_DTYPE_F64:
+        smooth_l1_loss_moore_launch<double>(_info, static_cast<double *>(output), static_cast<const double *>(input), static_cast<const double *>(target), stream); break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::smooth_l1_loss::moore
\ No newline at end of file
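Note a design difference from the NVIDIA path below: here block partial sums are cast to T and atomically accumulated directly into output, so fp16/bf16 reductions round once per block rather than once at the end. The 1024-block cap bounds atomic traffic while the grid-stride loop still covers every element; the per-thread workload is easy to estimate (illustrative arithmetic, not part of the patch):

#include <cstdio>
int main() {
    size_t numel = 10000000, threads = 256, blocks = 1024;
    size_t per_thread = (numel + threads * blocks - 1) / (threads * blocks);
    std::printf("each thread folds ~%zu elements\n", per_thread); // ~39 for 1e7 elements
}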
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h new file mode 100644 index 000000000..3209e2c59 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h @@ -0,0 +1,182 @@
+#ifndef __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
+#define __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cstdint>
+#include <type_traits> // for std::is_same_v
+
+namespace op::smooth_l1_loss::moore {
+
+// ==================================================================
+// 1. Type Converter & Atomic Add (keep existing helpers)
+// ==================================================================
+template <typename T>
+union TypeConverter;
+
+template <>
+union TypeConverter<half> {
+    half val;
+    unsigned short bits;
+};
+
+template <>
+union TypeConverter<__mt_bfloat16> {
+    __mt_bfloat16 val;
+    unsigned short bits;
+};
+
+template <typename T>
+__device__ __forceinline__ void atomic_add_func(T *address, T val) {
+    atomicAdd(address, val);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func(half *address, half val) {
+    unsigned short *address_as_us = reinterpret_cast<unsigned short *>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+    do {
+        assumed = old;
+        TypeConverter<half> old_converter; old_converter.bits = assumed;
+        float sum_f = __half2float(old_converter.val) + __half2float(val);
+        TypeConverter<half> new_converter; new_converter.val = __float2half(sum_f);
+        old = atomicCAS(address_as_us, assumed, new_converter.bits);
+    } while (assumed != old);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func<__mt_bfloat16>(__mt_bfloat16 *address, __mt_bfloat16 val) {
+    unsigned short *address_as_us = reinterpret_cast<unsigned short *>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+    do {
+        assumed = old;
+        TypeConverter<__mt_bfloat16> old_converter; old_converter.bits = assumed;
+        float sum_f = __bfloat162float(old_converter.val) + __bfloat162float(val);
+        TypeConverter<__mt_bfloat16> new_converter; new_converter.val = __float2bfloat16(sum_f);
+        old = atomicCAS(address_as_us, assumed, new_converter.bits);
+    } while (assumed != old);
+}
+
+// ==================================================================
+// 2. Math Functor
+// ==================================================================
+struct SmoothL1LossMath {
+    template <typename T>
+    __device__ __forceinline__ float operator()(T x_val, T y_val, float beta) const {
+        float x_f, y_f;
+        if constexpr (std::is_same_v<T, half>) {
+            x_f = __half2float(x_val); y_f = __half2float(y_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            x_f = __bfloat162float(x_val); y_f = __bfloat162float(y_val);
+        } else {
+            x_f = static_cast<float>(x_val); y_f = static_cast<float>(y_val);
+        }
+
+        float diff = x_f - y_f;
+        float abs_diff = ::fabsf(diff);
+        if (abs_diff < beta) {
+            return 0.5f * diff * diff / beta;
+        } else {
+            return abs_diff - 0.5f * beta;
+        }
+    }
+};
+
+// ==================================================================
+// 3. Optimized Kernels
+// ==================================================================
+
+// ------------------------------------------------------------------
+// Kernel A: Elementwise (No Reduction)
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void smooth_l1_loss_elementwise_kernel(
+    const size_t numel,
+    const float beta,
+    const T *input,
+    const T *target,
+    T *output) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        float loss_f = SmoothL1LossMath()(input[idx], target[idx], beta);
+
+        if constexpr (std::is_same_v<T, half>) output[idx] = __float2half(loss_f);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) output[idx] = __float2bfloat16(loss_f);
+        else output[idx] = static_cast<T>(loss_f);
+    }
+}
+
+// ------------------------------------------------------------------
+// Kernel B: Block Reduction (For Mean/Sum) - High Precision
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void smooth_l1_loss_reduce_kernel(
+    const size_t numel,
+    const float beta,
+    const T *input,
+    const T *target,
+    T *output) {
+
+    // 1. Thread-local accumulation (in float32)
+    float thread_sum = 0.0f;
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = idx; i < numel; i += stride) {
+        thread_sum += SmoothL1LossMath()(input[i], target[i], beta);
+    }
+
+    // 2. Shared-memory reduction.
+    //    Declare dynamic shared memory (size determined at launch)
+    extern __shared__ float sdata[];
+    unsigned int tid = threadIdx.x;
+    sdata[tid] = thread_sum;
+    __syncthreads();
+
+    // Standard tree reduction
+    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sdata[tid] += sdata[tid + s];
+        }
+        __syncthreads();
+    }
+
+    // 3. Block leader writes to global memory
+    if (tid == 0) {
+        float block_sum = sdata[0];
+        // Convert to T only ONCE per block
+        T val_to_add;
+        if constexpr (std::is_same_v<T, half>) val_to_add = __float2half(block_sum);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) val_to_add = __float2bfloat16(block_sum);
+        else val_to_add = static_cast<T>(block_sum);
+
+        atomic_add_func(output, val_to_add);
+    }
+}
+
+// ------------------------------------------------------------------
+// Kernel C: Mean Scaling
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void avg_scaling_kernel(T *output, size_t numel) {
+    if (threadIdx.x == 0) {
+        float sum_f;
+        if constexpr (std::is_same_v<T, half>) sum_f = __half2float(output[0]);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) sum_f = __bfloat162float(output[0]);
+        else sum_f = static_cast<float>(output[0]);
+
+        float mean_f = sum_f / static_cast<float>(numel);
+
+        if constexpr (std::is_same_v<T, half>) output[0] = __float2half(mean_f);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) output[0] = __float2bfloat16(mean_f);
+        else output[0] = static_cast<T>(mean_f);
+    }
+}
+
+} // namespace op::smooth_l1_loss::moore
+
+#endif // __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
\ No newline at end of file
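The 16-bit atomic adds above emulate read-modify-write with a compare-and-swap loop, since the hardware only exposes atomicCAS at that width. The same idiom on the host, for reference (illustrative sketch with std::atomic, not part of the patch):

#include <atomic>
inline void atomic_add_cas(std::atomic<float> &a, float v) {
    float old = a.load();
    // compare_exchange_weak refreshes `old` on failure, so we retry
    // against the freshly observed value until the swap succeeds
    while (!a.compare_exchange_weak(old, old + v)) {
    }
}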
diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu
new file mode 100644
index 000000000..5a5fd9dfe
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu
@@ -0,0 +1,143 @@
+#include "smooth_l1_loss_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <algorithm>
+#include <cstdint>
+
+namespace op::smooth_l1_loss::nvidia {
+
+// ==================================================================
+// Helpers
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output, const void *input, const void *target, void *workspace,
+    size_t numel, float beta, int reduction,
+    void *stream) {
+
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto tar_ptr = reinterpret_cast<const T *>(target);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+    op::smooth_l1_loss::cuda::SmoothL1LossFunctor functor(beta);
+
+    // ------------------------------------------
+    // Mode 1: Elementwise (None)
+    // ------------------------------------------
+    if (reduction == 0) {
+        auto out_ptr = reinterpret_cast<T *>(output);
+        constexpr int TotalBytes = 16;
+        constexpr int PackSize = TotalBytes / sizeof(T);
+        bool can_vectorize = (PackSize > 1) && (numel % PackSize == 0) &&
+                             is_aligned<T>(output, TotalBytes) &&
+                             is_aligned<T>(input, TotalBytes) &&
+                             is_aligned<T>(target, TotalBytes);
+
+        if (can_vectorize) {
+            size_t num_packs = numel / PackSize;
+            size_t block_size = 256;
+            size_t grid_size = (num_packs + block_size - 1) / block_size;
+            op::smooth_l1_loss::cuda::smooth_l1_loss_kernel_vectorized<T, PackSize>
+                <<<grid_size, block_size, 0, cuda_stream>>>(out_ptr, in_ptr, tar_ptr, num_packs, functor);
+        } else {
+            size_t block_size = 256;
+            size_t grid_size = (numel + block_size - 1) / block_size;
+            op::smooth_l1_loss::cuda::smooth_l1_loss_kernel
+                <<<grid_size, block_size, 0, cuda_stream>>>(out_ptr, in_ptr, tar_ptr, numel, functor);
+        }
+    }
+    // ------------------------------------------
+    // Mode 2: Reduction (Mean / Sum)
+    // ------------------------------------------
+    else {
+        // Use the workspace as a temporary float accumulator
+        // (higher precision, and convenient for atomicAdd)
+        float *acc_ptr = reinterpret_cast<float *>(workspace);
+
+        // 1. Zero the accumulator
+        cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream);
+
+        // 2. Launch the reduction kernel
+        float scale = (reduction == 1) ? (1.0f / numel) : 1.0f; // 1 = mean, 2 = sum
+        size_t block_size = 256;
+        // Cap the grid size so not too many blocks contend on the atomicAdd
+        size_t grid_size = std::min((numel + block_size - 1) / block_size, static_cast<size_t>(1024));
+
+        op::smooth_l1_loss::cuda::smooth_l1_loss_reduce_kernel
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                acc_ptr, in_ptr, tar_ptr, numel, functor, scale
+            );
+
+        // 3. Convert the result in the float workspace back to the target dtype
+        //    and write it to output (the output has exactly one element)
+        op::smooth_l1_loss::cuda::cast_float_to_t
+            <<<1, 1, 0, cuda_stream>>>(reinterpret_cast<T *>(output), acc_ptr);
+    }
+}
+
+// ==================================================================
+// Descriptor
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t target_desc,
+    float beta, int reduction) {
+
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    // [Key] In reduction mode we need a 4-byte workspace to hold the float intermediate result
+    size_t workspace_size = 0;
+    if (reduction != 0) {
+        workspace_size = sizeof(float);
+    }
+
+    *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size, void *output,
+    const void *input, const void *target, void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto numel = _info.numel();
+    float beta = _info.beta();
+    int reduction = _info.reduction();
+
+    // Check that the workspace is large enough
+    if (reduction != 0 && workspace_size < sizeof(float)) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__nv_bfloat16>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::smooth_l1_loss::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh
new file mode 100644
index 000000000..cd6492105
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh
@@ -0,0 +1,9 @@
+#ifndef __SMOOTH_L1_LOSS_NVIDIA_CUH__
+#define __SMOOTH_L1_LOSS_NVIDIA_CUH__
+
+#include "../smooth_l1_loss.h"
+
+// The macro instantiates the op::smooth_l1_loss::nvidia::Descriptor class declaration
+DESCRIPTOR(nvidia)
+
+#endif // __SMOOTH_L1_LOSS_NVIDIA_CUH__
\ No newline at end of file
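Together with the public header, the expected call sequence for this operator's C API is roughly the following. This is a sketch: `handle`, the tensor descriptors, the device buffers and `stream` are assumed to exist, and `device_malloc` is a placeholder allocator, not a real API.

    // Hedged usage sketch of the smooth_l1_loss C API wired up in operator.cc below.
    infiniopSmoothL1LossDescriptor_t desc = nullptr;
    infiniopCreateSmoothL1LossDescriptor(handle, &desc, out_desc, in_desc, tgt_desc,
                                         /*beta=*/1.0f, /*reduction=*/1); // 0=none, 1=mean, 2=sum
    size_t ws_size = 0;
    infiniopGetSmoothL1LossWorkspaceSize(desc, &ws_size);   // 4 bytes whenever reduction != 0
    void *ws = ws_size ? device_malloc(ws_size) : nullptr;  // device_malloc: placeholder
    infiniopSmoothL1Loss(desc, ws, ws_size, output, input, target, stream);
    infiniopDestroySmoothL1LossDescriptor(desc);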
+#include "../../handle.h" +#include "infiniop/ops/smooth_l1_loss.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/smooth_l1_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/smooth_l1_loss_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/smooth_l1_loss_metax.h" +#endif + +// 【新增】Moore 后端头文件 +#ifdef ENABLE_MOORE_API +#include "moore/smooth_l1_loss_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateSmoothL1LossDescriptor( + infiniopHandle_t handle, + infiniopSmoothL1LossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + float beta, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::smooth_l1_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + beta, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + // 【新增】Moore 分支 + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetSmoothL1LossWorkspaceSize(infiniopSmoothL1LossDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + // 【新增】Moore 分支 + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopSmoothL1Loss(
+    infiniopSmoothL1LossDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::smooth_l1_loss::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, target, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [New] Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroySmoothL1LossDescriptor(infiniopSmoothL1LossDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::smooth_l1_loss::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [New] Moore branch
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h b/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h
new file mode 100644
index 000000000..85296a1d1
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h
@@ -0,0 +1,51 @@
+#ifndef __SMOOTH_L1_LOSS_H__
+#define __SMOOTH_L1_LOSS_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the SmoothL1LossInfo definition generated above
+
+// Macro: generates the Descriptor class for each backend namespace
+#define DESCRIPTOR(NAMESPACE) \
+    namespace op::smooth_l1_loss::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        SmoothL1LossInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            Opaque *opaque, \
+            SmoothL1LossInfo info, \
+            size_t workspace_size, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t input_desc, \
+            infiniopTensorDescriptor_t target_desc, \
+            float beta, \
+            int reduction); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *output, \
+            const void *input, \
+            const void *target, \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __SMOOTH_L1_LOSS_H__
\ No newline at end of file
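For reference, the scalar semantics all of these backends implement are the conventional smooth L1 definition; the functor bodies live in the per-backend kernel headers and are not shown in this patch, so the sketch below is an assumption based on the `torch.nn.functional.smooth_l1_loss` formula the tests compare against.

    // Reference semantics (assumed to match torch.nn.functional.smooth_l1_loss):
    #include <cmath>
    inline float smooth_l1_ref(float x, float y, float beta) {
        float d = std::fabs(x - y);
        return d < beta ? 0.5f * d * d / beta // quadratic near zero
                        : d - 0.5f * beta;    // linear tail
    }
    // reduction: 0 keeps the per-element losses, 1 averages over numel, 2 sums.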
diff --git a/src/infiniop/ops/take/cpu/take_cpu.cc b/src/infiniop/ops/take/cpu/take_cpu.cc
new file mode 100644
index 000000000..27bd2b44f
--- /dev/null
+++ b/src/infiniop/ops/take/cpu/take_cpu.cc
@@ -0,0 +1,138 @@
+#include "take_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cstdint>
+#include <type_traits>
+
+namespace op::take::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    auto result = TakeInfo::create(out_desc, in_desc, indices_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // Workspace Size
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Core computation (template implementation)
+// ==================================================================
+template <typename T, typename TIdx>
+void calculate_cpu_impl(
+    const TakeInfo &info,
+    void *output,
+    const void *input,
+    const void *indices) {
+
+    size_t num_out = info.num_out();
+    size_t num_in = info.num_in();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+
+    // Parallelize with OpenMP
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < num_out; ++i) {
+        TIdx idx = idx_ptr[i];
+
+        // Bounds check
+        if (idx >= 0 && static_cast<size_t>(idx) < num_in) {
+            out_ptr[i] = in_ptr[idx];
+        } else {
+            // Out of range: fill with 0
+            if constexpr (std::is_arithmetic_v<T>) {
+                // Standard types (float, int, ...) cast directly
+                out_ptr[i] = static_cast<T>(0);
+            } else {
+                out_ptr[i] = utils::cast<T>(0.0f);
+            }
+        }
+    }
+}
+
+// ==================================================================
+// Execution (dispatch logic)
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    // Helper macro: dispatch on idx_dtype
+    #define DISPATCH_IDX(TDATA) \
+        switch (idx_dtype) { \
+        case INFINI_DTYPE_I32: \
+            cpu::calculate_cpu_impl<TDATA, int32_t>(_info, output, input, indices); \
+            return INFINI_STATUS_SUCCESS; \
+        case INFINI_DTYPE_I64: \
+            cpu::calculate_cpu_impl<TDATA, int64_t>(_info, output, input, indices); \
+            return INFINI_STATUS_SUCCESS; \
+        default: \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        }
+
+    // Main switch: dispatch on dtype
+    switch (dtype) {
+    // Floating-point types
+    case INFINI_DTYPE_F16:
+        DISPATCH_IDX(fp16_t);
+    case INFINI_DTYPE_BF16:
+        DISPATCH_IDX(bf16_t);
+    case INFINI_DTYPE_F32:
+        DISPATCH_IDX(float);
+    case INFINI_DTYPE_F64:
+        DISPATCH_IDX(double);
+
+    // Integer types
+    case INFINI_DTYPE_I8:
+        DISPATCH_IDX(int8_t);
+    case INFINI_DTYPE_U8:
+        DISPATCH_IDX(uint8_t);
+    case INFINI_DTYPE_I16:
+        DISPATCH_IDX(int16_t);
+    case INFINI_DTYPE_U16:
+        DISPATCH_IDX(uint16_t);
+    case INFINI_DTYPE_I32:
+        DISPATCH_IDX(int32_t);
+    case INFINI_DTYPE_U32:
+        DISPATCH_IDX(uint32_t);
+    case INFINI_DTYPE_I64:
+        DISPATCH_IDX(int64_t);
+    case INFINI_DTYPE_U64:
+        DISPATCH_IDX(uint64_t);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef DISPATCH_IDX
+}
+
+} // namespace op::take::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/take/cpu/take_cpu.h b/src/infiniop/ops/take/cpu/take_cpu.h
new file mode 100644
index 000000000..e87b3975e
--- /dev/null
+++ b/src/infiniop/ops/take/cpu/take_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __TAKE_CPU_H__
+#define __TAKE_CPU_H__
+
+#include "../take.h"
+DESCRIPTOR(cpu)
+
+#endif // __TAKE_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/cuda/kernel.cuh b/src/infiniop/ops/take/cuda/kernel.cuh
new file mode 100644
index 000000000..eb0e50f15
--- /dev/null
+++ b/src/infiniop/ops/take/cuda/kernel.cuh
@@ -0,0 +1,93 @@
+#ifndef __TAKE_CUDA_H__
+#define __TAKE_CUDA_H__
+
+#include <cuda_runtime.h>
+
+namespace op::take::cuda {
+
+// ==================================================================
+// 1. Vectorized data pack (aligned)
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 2. Scalar kernel (handles unaligned data or leftover tail elements)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ input,
+    const TIdx * __restrict__ indices,
+    size_t num_out,
+    size_t num_in
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_out; i += stride) {
+        TIdx idx = __ldg(&indices[i]);
+
+        // Scalar load
+        if (idx >= 0 && idx < static_cast<TIdx>(num_in)) {
+            output[i] = input[idx];
+        } else {
+            output[i] = static_cast<T>(0);
+        }
+    }
+}
+
+// ==================================================================
+// 3. Vectorized kernel (optimized)
+// ==================================================================
+/**
+ * @tparam PackSize elements handled per thread (target: a full 128-bit pack, e.g. float x 4)
+ */
+template <typename T, typename TIdx, int PackSize>
+__global__ void take_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ input,
+    const TIdx * __restrict__ indices,
+    size_t num_packs, // number of packs to process (num_out / PackSize)
+    size_t num_in
+) {
+    // Reinterpret output as a Pack pointer for vectorized stores
+    using PackType = Pack<T, PackSize>;
+    PackType *out_vec = reinterpret_cast<PackType *>(output);
+
+    // Grid-stride loop over packs
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        PackType reg_pack;              // register staging buffer
+        size_t base_idx = i * PackSize; // starting output index of this pack
+
+        // Loop unrolling: the key optimization here.
+        // The compiler unrolls this loop into PackSize independent load
+        // instructions, using ILP (instruction-level parallelism) to hide
+        // the latency of the random input reads.
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            // Load the index (indices are contiguous, so L1 hit rate is high).
+            // Note: TIdx may differ in size from T, hence the separate load.
+            TIdx gather_idx = indices[base_idx + k];
+
+            // Random read (gather)
+            if (gather_idx >= 0 && gather_idx < static_cast<TIdx>(num_in)) {
+                reg_pack.val[k] = input[gather_idx];
+            } else {
+                reg_pack.val[k] = static_cast<T>(0);
+            }
+        }
+
+        // Vectorized store (a single STG.128 instruction)
+        out_vec[i] = reg_pack;
+    }
+}
+
+} // namespace op::take::cuda
+
+#endif // __TAKE_CUDA_H__
\ No newline at end of file
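The `alignas(sizeof(T) * N)` on `Pack` is what lets the vectorized kernel's single assignment compile to one 128-bit transaction. An illustrative compile-time sanity check (not part of the patch):

    // Pack<float, 4> has the size and alignment of float4, so
    // `out_vec[i] = reg_pack` above is a single 16-byte store.
    static_assert(sizeof(op::take::cuda::Pack<float, 4>) == 16, "unexpected pack size");
    static_assert(alignof(op::take::cuda::Pack<float, 4>) == 16, "unexpected pack alignment");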
diff --git a/src/infiniop/ops/take/info.h b/src/infiniop/ops/take/info.h
new file mode 100644
index 000000000..ff649be04
--- /dev/null
+++ b/src/infiniop/ops/take/info.h
@@ -0,0 +1,69 @@
+#ifndef __TAKE_INFO_H__
+#define __TAKE_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::take {
+
+class TakeInfo {
+    TakeInfo() = default;
+
+public:
+    int _dtype;      // data type (float, half, etc.)
+    int _idx_dtype;  // index type (int32, int64)
+    size_t _num_out; // total number of output elements (== indices.numel())
+    size_t _num_in;  // total number of input elements (for bounds checking)
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    size_t num_out() const { return _num_out; }
+    size_t num_in() const { return _num_in; }
+
+    static utils::Result<TakeInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        infiniopTensorDescriptor_t indices_desc) {
+
+        // 1. Check dtype consistency (output vs input)
+        if (out_desc->dtype() != in_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Check the index dtype (indices)
+        int idx_type = indices_desc->dtype();
+        if (idx_type != INFINI_DTYPE_I32 && idx_type != INFINI_DTYPE_I64) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Check shape consistency (output vs indices)
+        if (out_desc->ndim() != indices_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &out_shape = out_desc->shape();
+        const auto &idx_shape = indices_desc->shape();
+
+        for (size_t i = 0; i < out_desc->ndim(); ++i) {
+            if (out_shape[i] != idx_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 4. Extract the info
+        int dtype = in_desc->dtype();
+        size_t num_out = out_desc->numel();
+        size_t num_in = in_desc->numel();
+        return utils::Result<TakeInfo>(TakeInfo{
+            dtype,
+            idx_type,
+            num_out,
+            num_in
+        });
+    }
+};
+
+} // namespace op::take
+
+#endif // __TAKE_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/metax/take_metax.h b/src/infiniop/ops/take/metax/take_metax.h
new file mode 100644
index 000000000..e879d3dde
--- /dev/null
+++ b/src/infiniop/ops/take/metax/take_metax.h
@@ -0,0 +1,8 @@
+#ifndef __TAKE_METAX_H__
+#define __TAKE_METAX_H__
+
+#include "../take.h"
+
+DESCRIPTOR(metax)
+
+#endif // __TAKE_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/metax/take_metax.maca b/src/infiniop/ops/take/metax/take_metax.maca
new file mode 100644
index 000000000..955047ebd
--- /dev/null
+++ b/src/infiniop/ops/take/metax/take_metax.maca
@@ -0,0 +1,264 @@
+#include "take_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::take::metax {
+
+// ==================================================================
+// Kernel
+// ==================================================================
+
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    T* output,
+    const T* input,
+    const TIdx* indices,
+    int outer_size,
+    int inner_size,
+    int indices_size,
+    int input_dim_size)
+{
+    // Flattened take logic
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= static_cast<size_t>(indices_size)) return;
+
+    TIdx idx_val = indices[idx];
+
+    if (idx_val < 0) {
+        idx_val += input_dim_size;
+    }
+
+    if (idx_val >= 0 && idx_val < input_dim_size) {
+        output[idx] = input[idx_val];
+    } else {
+        output[idx] = T(0);
+    }
+}
+
+// ==================================================================
+// Kernel Launcher
+// ==================================================================
+
+template <typename T, typename TIdx>
+void launch_kernel_impl(
+    void* output,
+    const void* input,
+    const void* indices,
+    int outer_size,
+    int inner_size,
+    int indices_size,
+    int input_dim_size,
+    void* stream)
+{
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+
+    size_t total_elements =
+        static_cast<size_t>(outer_size) * indices_size * inner_size;
+
+    size_t block_size = 256;
+    size_t grid_size =
+        (total_elements + block_size - 1) / block_size;
+
+    take_kernel<T, TIdx>
+        <<<grid_size, block_size, 0, hc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            reinterpret_cast<const TIdx *>(indices),
+            outer_size,
+            inner_size,
+            indices_size,
+            input_dim_size);
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    int outer_size;
+    int inner_size;
+    int indices_size;
+    int input_dim_size;
+    size_t total_bytes;
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+static size_t get_element_size(int dtype) {
+    if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8;
+    if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4;
+    return 2;
+}
+
+// ==================================================================
+// Descriptor::create
+// ==================================================================
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc)
+{
+    auto handle =
+        reinterpret_cast<device::metax::Handle *>(handle_);
+
+    auto info_result =
+        TakeInfo::create(out_desc, in_desc, indices_desc);
+    if (!info_result) {
+        return info_result.status();
+    }
+
+    auto out_d = out_desc;
+    auto in_d = in_desc;
+    auto idx_d = indices_desc;
+
+    int input_dim = 1;
+    for (size_t i = 0; i < in_d->ndim(); ++i) {
+        input_dim *= in_d->shape()[i];
+    }
+
+    int indices_numel = 1;
+    for (size_t i = 0; i < idx_d->ndim(); ++i) {
+        indices_numel *= idx_d->shape()[i];
+    }
+
+    int outer = 1;
+    int inner = 1;
+
+    size_t bytes =
+        static_cast<size_t>(indices_numel) *
+        get_element_size(out_d->dtype());
+
+    auto opaque = new Opaque{
+        handle->internal(),
+        outer,
+        inner,
+        indices_numel,
+        input_dim,
+        bytes
+    };
+
+    *desc_ptr = new Descriptor(
+        opaque,
+        info_result.take(),
+        0,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Descriptor::calculate
+// ==================================================================
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const
+{
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    int outer = _opaque->outer_size;
+    int inner = _opaque->inner_size;
+    int input_dim = _opaque->input_dim_size;
+    int idx_sz = _opaque->indices_size;
+
+#define LAUNCH(T, TIdx) \
+    launch_kernel_impl<T, TIdx>( \
+        output, input, indices, \
+        outer, inner, idx_sz, input_dim, stream)
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+
+        case INFINI_DTYPE_F16:
+            LAUNCH(__half, int32_t);
+            break;
+
+        case INFINI_DTYPE_BF16:
+            LAUNCH(__maca_bfloat16, int32_t);
+            break;
+
+        case INFINI_DTYPE_F32:
+            LAUNCH(float, int32_t);
+            break;
+
+        case INFINI_DTYPE_F64:
+            LAUNCH(double, int32_t);
+            break;
+
+        case INFINI_DTYPE_I32:
+            LAUNCH(int32_t, int32_t);
+            break;
+
+        case INFINI_DTYPE_I64:
+            LAUNCH(int64_t, int32_t);
+            break;
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+
+        case INFINI_DTYPE_F16:
+            LAUNCH(__half, int64_t);
+            break;
+
+        case INFINI_DTYPE_BF16:
+            LAUNCH(__maca_bfloat16, int64_t);
+            break;
+
+        case INFINI_DTYPE_F32:
+            LAUNCH(float, int64_t);
+            break;
+
+        case INFINI_DTYPE_F64:
+            LAUNCH(double, int64_t);
+            break;
+
+        case INFINI_DTYPE_I32:
+            LAUNCH(int32_t, int64_t);
+            break;
+
+        case INFINI_DTYPE_I64:
+            LAUNCH(int64_t, int64_t);
+            break;
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+#undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::metax
diff --git a/src/infiniop/ops/take/moore/take_moore.h b/src/infiniop/ops/take/moore/take_moore.h
new file mode 100644
index 000000000..3a231d8af
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore.h
@@ -0,0 +1,6 @@
+#ifndef __TAKE_MOORE_API_H__
+#define __TAKE_MOORE_API_H__
+#include "../take.h"
+DESCRIPTOR(moore)
+
+#endif // __TAKE_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/moore/take_moore.mu b/src/infiniop/ops/take/moore/take_moore.mu
new file mode 100644
index 000000000..10517b431
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore.mu
@@ -0,0 +1,174 @@
+#include "take_moore.h"
+#include "take_moore_kernel.h" // contains the TakeOp struct definition
+
+#include <musa_runtime.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::take::moore {
+
+// ==================================================================
+// 1. Kernel Wrapper Implementation
+// ==================================================================
+
+// A global kernel wrapper that invokes the TakeOp functor from take_moore_kernel.h
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    const size_t num_out,  // total number of output elements
+    const size_t num_in,   // total number of input elements (for bounds checking)
+    const T *input,        // input data
+    const TIdx *indices,   // index data
+    T *output) {           // output data
+
+    // idx is the linear index into both the output and the indices tensor
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_out) {
+        // Use the functor defined in take_moore_kernel.h
+        TakeOp op;
+        op(idx, num_in, input, indices, output);
+    }
+}
+
+// ==================================================================
+// 2. Launcher Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+void take_moore_launch(
+    const TakeInfo &info,
+    T *output,
+    const T *input,
+    const void *indices, // indices arrives as void* and must be cast to TIdx*
+    void *stream) {
+
+    size_t num_out = info.num_out();
+    size_t num_in = info.num_in();
+
+    // Cast the index pointer
+    const TIdx *indices_ptr = static_cast<const TIdx *>(indices);
+
+    int threads = 256;
+    // size_t avoids overflow, but the grid dimension must be an int.
+    // For very large num_out a stride loop would be needed; here we assume
+    // the count stays within the grid limit.
+    int blocks = (num_out + threads - 1) / threads;
+
+    take_kernel<<<blocks, threads, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+        num_out,
+        num_in,
+        input,
+        indices_ptr,
+        output
+    );
+}
+
+// ==================================================================
+// 3. Descriptor Implementation
+// ==================================================================
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // Validate shapes and dtypes via TakeInfo::create
+    auto info_result = TakeInfo::create(out_desc, in_desc, indices_desc);
+
+    if (!info_result) {
+        return info_result.status();
+    }
+    *desc_ptr = new Descriptor(
+        nullptr,
+        *info_result, // unwrap the Result to get the Info object
+        0,            // take needs no workspace
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    // 1. Basic checks
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 2. Dispatch macro: resolve the data type first, then the index type.
+    //    T is the data type (float, half, etc.)
+    #define LAUNCH_TAKE_KERNEL(T) \
+        do { \
+            if (_info.idx_dtype() == INFINI_DTYPE_I32) { \
+                take_moore_launch<T, int32_t>(_info, \
+                    static_cast<T *>(output), \
+                    static_cast<const T *>(input), \
+                    indices, stream); \
+            } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \
+                take_moore_launch<T, int64_t>(_info, \
+                    static_cast<T *>(output), \
+                    static_cast<const T *>(input), \
+                    indices, stream); \
+            } else { \
+                return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+            } \
+        } while (0)
+
+    // 3. Dispatch on the data type
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        LAUNCH_TAKE_KERNEL(half);
+        break;
+
+    case INFINI_DTYPE_BF16:
+        LAUNCH_TAKE_KERNEL(__mt_bfloat16);
+        break;
+
+    case INFINI_DTYPE_F32:
+        LAUNCH_TAKE_KERNEL(float);
+        break;
+
+    case INFINI_DTYPE_F64:
+        LAUNCH_TAKE_KERNEL(double);
+        break;
+
+    case INFINI_DTYPE_I32:
+        LAUNCH_TAKE_KERNEL(int32_t);
+        break;
+
+    case INFINI_DTYPE_I64:
+        LAUNCH_TAKE_KERNEL(int64_t);
+        break;
+
+    case INFINI_DTYPE_I8:
+        LAUNCH_TAKE_KERNEL(int8_t);
+        break;
+
+    case INFINI_DTYPE_U8:
+        LAUNCH_TAKE_KERNEL(uint8_t);
+        break;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_TAKE_KERNEL
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/take/moore/take_moore_kernel.h b/src/infiniop/ops/take/moore/take_moore_kernel.h
new file mode 100644
index 000000000..70725cb3a
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore_kernel.h
@@ -0,0 +1,56 @@
+#ifndef __TAKE_MOORE_KERNEL_H__
+#define __TAKE_MOORE_KERNEL_H__
+
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>   // required for the __mt_bfloat16 definition
+
+#include <type_traits>   // for std::is_same_v
+
+namespace op::take::moore {
+
+typedef struct TakeOp {
+public:
+    // take involves two types: the data itself (T) and the index type (TIdx)
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,     // linear index handled by this thread (position in output and indices)
+        const size_t num_in,       // total elements in the input tensor (for bounds checking)
+        const T* input_data,       // input data pointer
+        const TIdx* indices_data,  // index data pointer
+        T* output_data             // output data pointer
+    ) const {
+
+        // 1. Read the index value at this position.
+        //    indices and output have the same shape; curr_idx is the flattened index into both.
+        TIdx idx = indices_data[curr_idx];
+
+        // 2. Handle negative indices (Python-style, e.g. -1 means the last element).
+        //    If negative indices are not needed, drop this and simply reject idx < 0.
+        if (idx < 0) {
+            idx += static_cast<TIdx>(num_in);
+        }
+
+        // 3. Bounds check and assignment
+        if (idx >= 0 && idx < static_cast<TIdx>(num_in)) {
+            // Valid index: plain copy.
+            // half/bf16/float generally support direct assignment (=) in MUSA, no conversion needed.
+            output_data[curr_idx] = input_data[idx];
+        } else {
+            // Out of range: fill with 0.
+            // constexpr-based type handling, consistent with the avg_pool code style.
+            if constexpr (std::is_same_v<T, half>) {
+                output_data[curr_idx] = __float2half(0.0f);
+            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+                output_data[curr_idx] = __float2bfloat16(0.0f);
+            } else {
+                output_data[curr_idx] = static_cast<T>(0);
+            }
+        }
+    }
+
+} TakeOp;
+
+} // namespace op::take::moore
+
+#endif // __TAKE_MOORE_KERNEL_H__
\ No newline at end of file
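Note a behavioral split this patch ships: the Moore functor above (like the Metax kernel) wraps negative indices Python-style before the bounds check, while the CPU and generic CUDA paths zero-fill any negative index without wrapping. A host-side sketch of the wrap-then-check policy, illustrative only:

    // Reference of TakeOp's index policy (not part of the patch).
    inline long long take_ref_index(long long idx, long long num_in) {
        if (idx < 0) idx += num_in;                   // Python-style negative indexing
        return (idx >= 0 && idx < num_in) ? idx : -1; // -1 signals "zero-fill this output slot"
    }
    // e.g. num_in = 3: take_ref_index(-1, 3) == 2, take_ref_index(3, 3) == -1.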
diff --git a/src/infiniop/ops/take/nvidia/take_nvidia.cu b/src/infiniop/ops/take/nvidia/take_nvidia.cu
new file mode 100644
index 000000000..6f167110c
--- /dev/null
+++ b/src/infiniop/ops/take/nvidia/take_nvidia.cu
@@ -0,0 +1,152 @@
+#include "take_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cstdint>
+
+namespace op::take::nvidia {
+
+// ==================================================================
+// Helper: check pointer alignment
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const void *indices,
+    size_t num_out,
+    size_t num_in,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // --- Vectorization parameters ---
+    // Goal: each thread moves 128 bits (16 bytes) to maximize bandwidth
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+
+    // Vectorization preconditions:
+    // 1. PackSize > 1: packing only pays off with at least 2 elements (2 for double, 4 for float)
+    // 2. Divisibility: num_out must be divisible by PackSize (simplifies kernel bounds handling)
+    // 3. Alignment: the output pointer must be 16-byte aligned
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_out % PackSize == 0) &&
+                         is_aligned<T>(output, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized kernel (high performance) ===
+        size_t num_packs = num_out / PackSize;
+
+        // Block/grid configuration
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::take::cuda::take_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, in_ptr, idx_ptr, num_packs, num_in
+            );
+    } else {
+        // === Path B: scalar kernel (fallback/compatibility) ===
+        size_t block_size = 256;
+        size_t grid_size = (num_out + block_size - 1) / block_size;
+
+        op::take::cuda::take_kernel
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, in_ptr, idx_ptr, num_out, num_in
+            );
+    }
+}
+
+// ==================================================================
+// Descriptor boilerplate (same pattern as the other backends)
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto info_result = TakeInfo::create(out_desc, in_desc, indices_desc);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id
+    );
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate: combines byte-size dispatch with vectorization
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    auto num_out = _info.num_out();
+    auto num_in = _info.num_in();
+
+    // Index-type dispatch macro
+    #define LAUNCH_BY_SIZE(T_STORAGE) \
+        switch (idx_dtype) { \
+        case INFINI_DTYPE_I32: \
+            launch_kernel<T_STORAGE, int32_t>(output, input, indices, num_out, num_in, stream); \
+            break; \
+        case INFINI_DTYPE_I64: \
+            launch_kernel<T_STORAGE, int64_t>(output, input, indices, num_out, num_in, stream); \
+            break; \
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        }
+
+    // Group dtypes by element byte width
+    size_t element_size = 0;
+    switch (dtype) {
+    case INFINI_DTYPE_I8: case INFINI_DTYPE_U8:
+        element_size = 1; break;
+    case INFINI_DTYPE_F16: case INFINI_DTYPE_BF16: case INFINI_DTYPE_I16: case INFINI_DTYPE_U16:
+        element_size = 2; break;
+    case INFINI_DTYPE_F32: case INFINI_DTYPE_I32: case INFINI_DTYPE_U32:
+        element_size = 4; break;
+    case INFINI_DTYPE_F64: case INFINI_DTYPE_I64: case INFINI_DTYPE_U64:
+        element_size = 8; break;
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // Dispatch to the matching storage type
+    switch (element_size) {
+    case 1: LAUNCH_BY_SIZE(uint8_t); break;  // PackSize = 16
+    case 2: LAUNCH_BY_SIZE(uint16_t); break; // PackSize = 8
+    case 4: LAUNCH_BY_SIZE(uint32_t); break; // PackSize = 4 (float4)
+    case 8: LAUNCH_BY_SIZE(uint64_t); break; // PackSize = 2 (double2)
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::nvidia
\ No newline at end of file
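Dispatching on element width rather than dtype works because `take` never performs arithmetic on the payload; it only moves bits, so a single `uint16_t` instantiation serves F16, BF16, I16 and U16 alike, cutting the number of compiled kernels by a factor of three. An illustrative check of the property this relies on:

    // Why reinterpretation is sound here: the kernels only copy T, never compute
    // with it, so any trivially-copyable stand-in of equal size reproduces the
    // payload exactly.
    #include <cstdint>
    #include <type_traits>
    static_assert(std::is_trivially_copyable_v<std::uint16_t>);
    static_assert(sizeof(std::uint16_t) == 2 && sizeof(std::uint32_t) == 4 && sizeof(std::uint64_t) == 8);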
"../take.h" + +DESCRIPTOR(nvidia) + +#endif // __TAKE_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/take/operator.cc b/src/infiniop/ops/take/operator.cc new file mode 100644 index 000000000..8e99d8195 --- /dev/null +++ b/src/infiniop/ops/take/operator.cc @@ -0,0 +1,176 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/take.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/take_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/take_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/take_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/take_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateTakeDescriptor( + infiniopHandle_t handle, + infiniopTakeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices) { + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::take::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + indices) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetTakeWorkspaceSize(infiniopTakeDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopTake(
+    infiniopTakeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::take::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, indices, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyTakeDescriptor(infiniopTakeDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::take::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/take/take.h b/src/infiniop/ops/take/take.h
new file mode 100644
index 000000000..d45e867ac
--- /dev/null
+++ b/src/infiniop/ops/take/take.h
@@ -0,0 +1,50 @@
+#ifndef __TAKE_H__
+#define __TAKE_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the TakeInfo definition generated above
+
+// Macro: generates the Descriptor class for each backend namespace
+#define DESCRIPTOR(NAMESPACE) \
+ \
+    namespace op::take::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        TakeInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            Opaque *opaque, \
+            TakeInfo info, \
+            size_t workspace_size, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t in_desc, \
+            infiniopTensorDescriptor_t indices_desc); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *output, \
+            const void *input, \
+            const void *indices, \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __TAKE_H__
\ No newline at end of file
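The Python tests below reach these entry points through `infinicore.take`; at the C level the equivalent sequence is roughly the following sketch (`handle`, the descriptors, the buffers and `stream` are assumed to exist):

    // Hedged sketch of the take C-API flow defined above.
    infiniopTakeDescriptor_t desc = nullptr;
    infiniopCreateTakeDescriptor(handle, &desc, out_desc, in_desc, indices_desc); // output shape must equal indices shape
    size_t ws = 0;
    infiniopGetTakeWorkspaceSize(desc, &ws); // 0 for every backend in this patch
    infiniopTake(desc, /*workspace=*/nullptr, ws, output, input, indices, stream);
    infiniopDestroyTakeDescriptor(desc);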
diff --git a/test/infinicore/ops/hypot.py b/test/infinicore/ops/hypot.py
index 283214c0e..593f09b47 100644
--- a/test/infinicore/ops/hypot.py
+++ b/test/infinicore/ops/hypot.py
@@ -103,9 +103,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.hypot(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.hypot(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.hypot(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/index_add.py b/test/infinicore/ops/index_add.py
index ee136cf31..531e04097 100644
--- a/test/infinicore/ops/index_add.py
+++ b/test/infinicore/ops/index_add.py
@@ -91,9 +91,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.index_add(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.index_add(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.index_add(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/index_copy.py b/test/infinicore/ops/index_copy.py
index f38403320..d2e34b7c3 100644
--- a/test/infinicore/ops/index_copy.py
+++ b/test/infinicore/ops/index_copy.py
@@ -89,9 +89,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.index_copy(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.index_copy(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.index_copy(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/smooth_l1_loss.py b/test/infinicore/ops/smooth_l1_loss.py
index 7f0ca670c..76dae9496 100644
--- a/test/infinicore/ops/smooth_l1_loss.py
+++ b/test/infinicore/ops/smooth_l1_loss.py
@@ -67,9 +67,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.smooth_l1_loss(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.smooth_l1_loss(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.smooth_l1_loss(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/take.py b/test/infinicore/ops/take.py
index 5ca28b86e..22b9e39ef 100644
--- a/test/infinicore/ops/take.py
+++ b/test/infinicore/ops/take.py
@@ -67,9 +67,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.take(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.take(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.take(*args, **kwargs)
 
 
 def main():