diff --git a/include/infinicore/ops/hypot.hpp b/include/infinicore/ops/hypot.hpp
new file mode 100644
index 000000000..0608f9c08
--- /dev/null
+++ b/include/infinicore/ops/hypot.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Hypot {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input_a, Tensor input_b);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor hypot(Tensor input_a, Tensor input_b);
+
+void hypot_(Tensor output, Tensor input_a, Tensor input_b);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/index_add.hpp b/include/infinicore/ops/index_add.hpp
new file mode 100644
index 000000000..14c0f72c5
--- /dev/null
+++ b/include/infinicore/ops/index_add.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class IndexAdd {
+public:
+    using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor, float);
+
+    static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor index_add(Tensor input, int64_t dim, Tensor index, Tensor source, float alpha = 1.0f);
+void index_add_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/index_copy.hpp b/include/infinicore/ops/index_copy.hpp
new file mode 100644
index 000000000..e6b862a5f
--- /dev/null
+++ b/include/infinicore/ops/index_copy.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class IndexCopy {
+public:
+    using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor index_copy(Tensor input, int64_t dim, Tensor index, Tensor source);
+void index_copy_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/smooth_l1_loss.hpp b/include/infinicore/ops/smooth_l1_loss.hpp
new file mode 100644
index 000000000..329d1ce75
--- /dev/null
+++ b/include/infinicore/ops/smooth_l1_loss.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class SmoothL1Loss {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, float, int64_t);
+
+    static void execute(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// reduction: 0 = none, 1 = mean, 2 = sum
+Tensor smooth_l1_loss(Tensor input, Tensor target, float beta = 1.0f, int64_t reduction = 1);
+void smooth_l1_loss_(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/take.hpp b/include/infinicore/ops/take.hpp
new file mode 100644
index 000000000..7c8fd6683
--- /dev/null
+++ b/include/infinicore/ops/take.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Take {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input, Tensor indices);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor take(Tensor input, Tensor indices);
+
+void take_(Tensor output, Tensor input, Tensor indices);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..35b201db3 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -10,6 +10,9 @@
 #include "infiniop/ops/dequantize_awq.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/hypot.h"
+#include "infiniop/ops/index_add.h"
+#include "infiniop/ops/index_copy.h"
 #include "infiniop/ops/layer_norm.h"
 #include "infiniop/ops/logsoftmax.h"
 #include "infiniop/ops/lp_norm.h"
@@ -22,11 +25,13 @@
 #include "infiniop/ops/rope.h"
 #include "infiniop/ops/sigmoid.h"
 #include "infiniop/ops/silu.h"
+#include "infiniop/ops/smooth_l1_loss.h"
 #include "infiniop/ops/softmax.h"
 #include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
+#include "infiniop/ops/take.h"
 #include "infiniop/ops/tanh.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
diff --git a/include/infiniop/ops/hypot.h b/include/infiniop/ops/hypot.h
new file mode 100644
index 000000000..5f210194b
--- /dev/null
+++ b/include/infiniop/ops/hypot.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_HYPOT_API_H__
+#define __INFINIOP_HYPOT_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopHypotDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateHypotDescriptor(infiniopHandle_t handle,
+                                                          infiniopHypotDescriptor_t *desc_ptr,
+                                                          infiniopTensorDescriptor_t output,
+                                                          infiniopTensorDescriptor_t input_a,
+                                                          infiniopTensorDescriptor_t input_b);
+
+__C __export infiniStatus_t infiniopGetHypotWorkspaceSize(infiniopHypotDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopHypot(infiniopHypotDescriptor_t desc,
+                                          void *workspace,
+                                          size_t workspace_size,
+                                          void *output,
+                                          const void *input_a,
+                                          const void *input_b,
+                                          void *stream);
+
+__C __export infiniStatus_t infiniopDestroyHypotDescriptor(infiniopHypotDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/index_add.h b/include/infiniop/ops/index_add.h
new file mode 100644
index 000000000..6f574d21d
--- /dev/null
+++ b/include/infiniop/ops/index_add.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_INDEX_ADD_API_H__
+#define __INFINIOP_INDEX_ADD_API_H__
+
+#include <stdint.h>
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopIndexAddDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateIndexAddDescriptor(infiniopHandle_t handle,
+                                                             infiniopIndexAddDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t output,
+                                                             infiniopTensorDescriptor_t input,
+                                                             int64_t dim,
+                                                             infiniopTensorDescriptor_t index,
+                                                             infiniopTensorDescriptor_t source,
+                                                             float alpha);
+
+__C __export infiniStatus_t infiniopGetIndexAddWorkspaceSize(infiniopIndexAddDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopIndexAdd(infiniopIndexAddDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *output,
+                                             const void *input,
+                                             const void *index,
+                                             const void *source,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyIndexAddDescriptor(infiniopIndexAddDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/index_copy.h b/include/infiniop/ops/index_copy.h
new file mode 100644
index 000000000..93e6766ac
--- /dev/null
+++ b/include/infiniop/ops/index_copy.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_INDEX_COPY_API_H__
+#define __INFINIOP_INDEX_COPY_API_H__
+
+#include <stdint.h>
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopIndexCopyDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateIndexCopyDescriptor(infiniopHandle_t handle,
+                                                              infiniopIndexCopyDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output,
+                                                              infiniopTensorDescriptor_t input,
+                                                              int64_t dim,
+                                                              infiniopTensorDescriptor_t index,
+                                                              infiniopTensorDescriptor_t source);
+
+__C __export infiniStatus_t infiniopGetIndexCopyWorkspaceSize(infiniopIndexCopyDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopIndexCopy(infiniopIndexCopyDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              const void *index,
+                                              const void *source,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyIndexCopyDescriptor(infiniopIndexCopyDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/smooth_l1_loss.h b/include/infiniop/ops/smooth_l1_loss.h
new file mode 100644
index 000000000..a17e090fc
--- /dev/null
+++ b/include/infiniop/ops/smooth_l1_loss.h
@@ -0,0 +1,27 @@
+#ifndef __INFINIOP_SMOOTH_L1_LOSS_API_H__
+#define __INFINIOP_SMOOTH_L1_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSmoothL1LossDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSmoothL1LossDescriptor(infiniopHandle_t handle,
+                                                                 infiniopSmoothL1LossDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t output,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t target,
+                                                                 float beta,
+                                                                 int reduction);
+
+__C __export infiniStatus_t infiniopGetSmoothL1LossWorkspaceSize(infiniopSmoothL1LossDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopSmoothL1Loss(infiniopSmoothL1LossDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *output,
+                                                 const void *input,
+                                                 const void *target,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroySmoothL1LossDescriptor(infiniopSmoothL1LossDescriptor_t desc);
+
+#endif // __INFINIOP_SMOOTH_L1_LOSS_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/take.h b/include/infiniop/ops/take.h
new file mode 100644
index 000000000..8aeefdb6e
--- /dev/null
+++ b/include/infiniop/ops/take.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_TAKE_API_H__
+#define __INFINIOP_TAKE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopTakeDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateTakeDescriptor(infiniopHandle_t handle,
+                                                         infiniopTakeDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input,
+                                                         infiniopTensorDescriptor_t indices);
+
+__C __export infiniStatus_t infiniopGetTakeWorkspaceSize(infiniopTakeDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopTake(infiniopTakeDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         const void *indices,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyTakeDescriptor(infiniopTakeDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..8c2ca51d0 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -45,6 +45,10 @@
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.hypot import hypot
+from infinicore.ops.index_add import index_add
+from infinicore.ops.index_copy import index_copy
+from infinicore.ops.take import take
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -111,6 +115,10 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "hypot",
+    "index_add",
+    "index_copy",
+    "take",
     "ones",
     "strided_empty",
     "strided_from_blob",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..772a7c4fa 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,12 +6,13 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .smooth_l1_loss import smooth_l1_loss
 from .swiglu import swiglu
 
 __all__ = [
     "causal_softmax",
     "random_sample",
     "rms_norm",
     "silu",
+    "smooth_l1_loss",
     "swiglu",
     "linear",
     "embedding",
diff --git a/python/infinicore/nn/functional/smooth_l1_loss.py b/python/infinicore/nn/functional/smooth_l1_loss.py
new file mode 100644
index 000000000..d26f49aaf
--- /dev/null
+++ b/python/infinicore/nn/functional/smooth_l1_loss.py
@@ -0,0 +1,60 @@
+from typing import Optional
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+_REDUCTION_MODES = {
+    "none": 0,
+    "mean": 1,
+    "sum": 2,
+}
+
+
+def smooth_l1_loss(
+    input: Tensor,
+    target: Tensor,
+    beta: float = 1.0,
+    reduction: str = "mean",
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Compute the Smooth L1 loss, which uses a squared term if the absolute
+    element-wise error falls below ``beta`` and an L1 term otherwise.
+
+    Args:
+        input (Tensor): the input tensor.
+        target (Tensor): the target tensor.
+        beta (float, optional): the threshold at which to change between L1 and L2
+            loss. Must be non-negative. Default: 1.0.
+        reduction (str, optional): the reduction to apply to the output:
+            'none': no reduction is applied,
+            'mean': the sum of the output is divided by the number of elements,
+            'sum': the output is summed. Default: 'mean'.
+        out (Tensor, optional): the output tensor.
+
+    Returns:
+        Tensor: The loss value.
+    """
+    if beta < 0:
+        raise ValueError("smooth_l1_loss: beta must be non-negative")
+    if reduction not in _REDUCTION_MODES:
+        raise ValueError(f"{reduction!r} is not a valid value for reduction")
+    reduction_val = _REDUCTION_MODES[reduction]
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+    if not target.is_contiguous():
+        target = target.contiguous()
+
+    if out is not None:
+        _infinicore.smooth_l1_loss_(
+            out._underlying,
+            input._underlying,
+            target._underlying,
+            beta,
+            reduction_val,
+        )
+        return out
+    return Tensor(
+        _infinicore.smooth_l1_loss(
+            input._underlying, target._underlying, beta, reduction_val
+        )
+    )
\ No newline at end of file
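A quick usage sketch for the functional wrapper above. This assumes `infinicore.from_torch` (exported from the package `__init__`) wraps a torch tensor on the current device; tensor values here are illustrative only:

```python
import torch
import infinicore
import infinicore.nn.functional as F

pred = infinicore.from_torch(torch.tensor([0.2, 2.5, -1.0]))
tgt = infinicore.from_torch(torch.tensor([0.0, 2.0, -1.0]))

loss = F.smooth_l1_loss(pred, tgt)                        # 'mean' -> scalar tensor
per_elem = F.smooth_l1_loss(pred, tgt, reduction="none")  # same shape as input
total = F.smooth_l1_loss(pred, tgt, beta=0.5, reduction="sum")
```

The string-to-integer mapping (`none`/`mean`/`sum` -> 0/1/2) is resolved in Python, so the C++ layer only ever sees the integer code.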
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + if not target.is_contiguous(): + target = target.contiguous() + if reduction not in _REDUCTION_MODES: + raise ValueError(f"{reduction} is not a valid value for reduction") + reduction_val = _REDUCTION_MODES[reduction] + if out is not None: + _infinicore.smooth_l1_loss_( + out._underlying, + input._underlying, + target._underlying, + beta, + reduction_val + ) + return out + return Tensor( + _infinicore.smooth_l1_loss( + input._underlying, + target._underlying, + beta, + reduction_val + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/hypot.py b/python/infinicore/ops/hypot.py new file mode 100644 index 000000000..d2a9b690e --- /dev/null +++ b/python/infinicore/ops/hypot.py @@ -0,0 +1,10 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def hypot(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.hypot(input._underlying, other._underlying)) + _infinicore.hypot_(out._underlying, input._underlying, other._underlying) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/index_add.py b/python/infinicore/ops/index_add.py new file mode 100644 index 000000000..f1bc8a5ed --- /dev/null +++ b/python/infinicore/ops/index_add.py @@ -0,0 +1,23 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def index_add(input, dim, index, source, *, alpha=1.0, out=None): + if out is None: + return Tensor(_infinicore.index_add( + input._underlying, + dim, + index._underlying, + source._underlying, + alpha + )) + _infinicore.index_add_( + out._underlying, + input._underlying, + dim, + index._underlying, + source._underlying, + alpha + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/index_copy.py b/python/infinicore/ops/index_copy.py new file mode 100644 index 000000000..7fbdf8877 --- /dev/null +++ b/python/infinicore/ops/index_copy.py @@ -0,0 +1,22 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def index_copy(input, dim, index, source, *, out=None): + if out is None: + return Tensor(_infinicore.index_copy( + input._underlying, + dim, + index._underlying, + source._underlying + )) + + _infinicore.index_copy_( + out._underlying, + input._underlying, + dim, + index._underlying, + source._underlying + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/take.py b/python/infinicore/ops/take.py new file mode 100644 index 000000000..d78c78392 --- /dev/null +++ b/python/infinicore/ops/take.py @@ -0,0 +1,25 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def take(input: Tensor, indices: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r"""Returns a new tensor with the elements of input at the given indices. + The input tensor is treated as if it were viewed as a 1-D tensor. + The result tensor has the same shape as the indices tensor. + + Args: + input (Tensor): the input tensor. + indices (Tensor): the indices into tensor, must be an Int or Long tensor. + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A new tensor with the elements of input at the given indices. 
+ """ + if not input.is_contiguous(): + input = input.contiguous() + + # 如果用户提供了 output tensor,调用底层的 in-place/explicit 接口 + if out is not None: + _infinicore.take_(out._underlying, input._underlying, indices._underlying) + return out + return Tensor(_infinicore.take(input._underlying, indices._underlying)) \ No newline at end of file diff --git a/src/infinicore/ops/hypot/hypot.cc b/src/infinicore/ops/hypot/hypot.cc new file mode 100644 index 000000000..2f61a0e74 --- /dev/null +++ b/src/infinicore/ops/hypot/hypot.cc @@ -0,0 +1,26 @@ +#include "infinicore/ops/hypot.hpp" + +namespace infinicore::op { + +// 1. 定义 Dispatcher 单例 +common::OpDispatcher &Hypot::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + + +void Hypot::execute(Tensor output, Tensor input_a, Tensor input_b) { + // lookup 需要传入设备类型,然后调用返回的函数指针 + dispatcher().lookup(context::getDevice().getType())(output, input_a, input_b); +} +Tensor hypot(Tensor input_a, Tensor input_b) { + auto output = Tensor::empty(input_a->shape(), input_a->dtype(), input_a->device()); + + hypot_(output, input_a, input_b); + return output; +} +void hypot_(Tensor output, Tensor input_a, Tensor input_b) { + Hypot::execute(output, input_a, input_b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/hypot/hypot_infiniop.cc b/src/infinicore/ops/hypot/hypot_infiniop.cc new file mode 100644 index 000000000..8d86a0287 --- /dev/null +++ b/src/infinicore/ops/hypot/hypot_infiniop.cc @@ -0,0 +1,55 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hypot.hpp" // 引入 Hypot 头文件 +#include + +namespace infinicore::op::hypot_impl::infiniop { +thread_local common::OpCache caches( + 100, // capacity + [](infiniopHypotDescriptor_t &desc) { + if (desc != nullptr) { + // 销毁 Hypot 描述符 + INFINICORE_CHECK_ERROR(infiniopDestroyHypotDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input_a, Tensor input_b) { + size_t seed = hash_combine(output, input_a, input_b); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopHypotDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHypotDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input_a->desc(), input_b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHypotWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopHypot( + desc, + workspace->data(), workspace_size, + output->data(), input_a->data(), input_b->data(), + context::getStream())); +} + +static bool registered = []() { + // 注册到 Hypot 的 dispatcher + Hypot::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hypot_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/index_add/index_add.cc b/src/infinicore/ops/index_add/index_add.cc new file mode 100644 index 000000000..4e8dd7889 --- /dev/null +++ b/src/infinicore/ops/index_add/index_add.cc @@ -0,0 +1,109 @@ +#include "infinicore/ops/index_add.hpp" +#include +#include +#include +#include "infinicore/tensor.hpp" + +namespace infinicore::op { + 
diff --git a/src/infinicore/ops/index_add/index_add.cc b/src/infinicore/ops/index_add/index_add.cc
new file mode 100644
index 000000000..4e8dd7889
--- /dev/null
+++ b/src/infinicore/ops/index_add/index_add.cc
@@ -0,0 +1,109 @@
+#include "infinicore/ops/index_add.hpp"
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "infinicore/tensor.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<IndexAdd::schema> &IndexAdd::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void IndexAdd::execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No IndexAdd implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, dim, index, source, alpha);
+}
+
+static void check_index_add_args(const Tensor &input, int64_t &dim, const Tensor &index, const Tensor &source) {
+    int64_t ndim = static_cast<int64_t>(input->ndim());
+
+    if (dim < 0) {
+        dim += ndim;
+    }
+    if (dim < 0 || dim >= ndim) {
+        throw std::runtime_error("IndexAdd: Dimension out of range.");
+    }
+
+    if (index->ndim() != 1) {
+        throw std::runtime_error("IndexAdd: Index tensor must be 1D.");
+    }
+
+    if (index->dtype() != DataType::I64 && index->dtype() != DataType::I32) {
+        throw std::runtime_error("IndexAdd: Index tensor must be I32 or I64.");
+    }
+
+    if (source->ndim() != input->ndim()) {
+        throw std::runtime_error("IndexAdd: Source tensor must have same number of dimensions as input tensor.");
+    }
+
+    auto in_shape = input->shape();
+    auto src_shape = source->shape();
+    auto idx_len = index->shape()[0];
+
+    for (int64_t i = 0; i < ndim; ++i) {
+        if (i == dim) {
+            if (src_shape[i] != idx_len) {
+                throw std::runtime_error("IndexAdd: Source dimension mismatch.");
+            }
+        } else {
+            if (src_shape[i] != in_shape[i]) {
+                throw std::runtime_error("IndexAdd: Source non-index dimension mismatch.");
+            }
+        }
+    }
+}
+
+// Out-of-place interface.
+Tensor index_add(Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    check_index_add_args(input, dim, index, source);
+
+    Tensor output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    output->copy_from(input);
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+    IndexAdd::execute(output, output, dim, index, source, alpha);
+
+    return output;
+}
+
+// In-place / explicit-output interface.
+void index_add_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    check_index_add_args(input, dim, index, source);
+
+    if (output->shape() != input->shape()) {
+        throw std::runtime_error("IndexAdd (In-place): Output shape must match Input shape.");
+    }
+
+    // If output and input are distinct tensors, seed output with input's data.
+    if (output.operator->() != input.operator->()) {
+        output->copy_from(input);
+    }
+
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+
+    if (!output->is_contiguous()) {
+        // Strategy: copy -> compute -> copy back.
+        Tensor contiguous_out = output->contiguous();
+
+        IndexAdd::execute(contiguous_out, contiguous_out, dim, index, source, alpha);
+
+        output->copy_from(contiguous_out);
+    } else {
+        // Fast path: output is already contiguous, compute in place.
+        IndexAdd::execute(output, input, dim, index, source, alpha);
+    }
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/index_add/index_add_infiniop.cc b/src/infinicore/ops/index_add/index_add_infiniop.cc
new file mode 100644
index 000000000..94ccfc0be
--- /dev/null
+++ b/src/infinicore/ops/index_add/index_add_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/index_add.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::index_add_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopIndexAddDescriptor_t> caches(
+    100, // capacity
+    [](infiniopIndexAddDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyIndexAddDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source, float alpha) {
+    size_t seed = hash_combine(output, input, dim, index, source, alpha);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopIndexAddDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateIndexAddDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            dim,
+            index->desc(),
+            source->desc(),
+            alpha));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size and allocate it.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetIndexAddWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopIndexAdd(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        index->data(),
+        source->data(),
+        context::getStream()));
+}
+
+// Register this backend with the IndexAdd dispatcher.
+static bool registered = []() {
+    IndexAdd::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::index_add_impl::infiniop
\ No newline at end of file
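The formula stated in the pybind docstring further below, `output[index[i]] = input[index[i]] + alpha * source[i]` applied along `dim`, matches `torch.index_add`. A reference sketch useful for validating the operator:

```python
import torch

def index_add_reference(inp, dim, index, source, alpha=1.0):
    # Reference semantics for IndexAdd: for each position i in `index`,
    # accumulate alpha * source slice i into the indexed slice of the output.
    out = inp.clone()
    for i, j in enumerate(index.tolist()):
        out.select(dim, j).add_(source.select(dim, i), alpha=alpha)
    return out

base = torch.zeros(4, 2)
idx = torch.tensor([0, 2])
src = torch.ones(2, 2)
assert torch.equal(index_add_reference(base, 0, idx, src, 2.0),
                   torch.index_add(base, 0, idx, src, alpha=2.0))
```

Note that with repeated indices the contributions accumulate, which is why the C++ out-of-place path first copies `input` into `output` and then runs the kernel on `output` in place.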
diff --git a/src/infinicore/ops/index_copy/index_copy.cc b/src/infinicore/ops/index_copy/index_copy.cc
new file mode 100644
index 000000000..1f4689f16
--- /dev/null
+++ b/src/infinicore/ops/index_copy/index_copy.cc
@@ -0,0 +1,108 @@
+#include "infinicore/ops/index_copy.hpp"
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "infinicore/tensor.hpp"
+
+namespace infinicore::op {
+
+// =========================================================
+// Dispatcher & Execute
+// =========================================================
+
+common::OpDispatcher<IndexCopy::schema> &IndexCopy::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void IndexCopy::execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No IndexCopy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, dim, index, source);
+}
+
+static void check_index_copy_args(const Tensor &input, int64_t &dim, const Tensor &index, const Tensor &source) {
+    int64_t ndim = static_cast<int64_t>(input->ndim());
+
+    if (dim < 0) {
+        dim += ndim;
+    }
+    if (dim < 0 || dim >= ndim) {
+        throw std::runtime_error("IndexCopy: Dimension out of range.");
+    }
+
+    if (index->ndim() != 1) {
+        throw std::runtime_error("IndexCopy: Index tensor must be 1D.");
+    }
+
+    if (index->dtype() != DataType::I64 && index->dtype() != DataType::I32) {
+        throw std::runtime_error("IndexCopy: Index tensor must be I32 or I64.");
+    }
+
+    if (source->ndim() != input->ndim()) {
+        throw std::runtime_error("IndexCopy: Source tensor must have same number of dimensions as input tensor.");
+    }
+
+    auto in_shape = input->shape();
+    auto src_shape = source->shape();
+    auto idx_len = index->shape()[0];
+
+    for (int64_t i = 0; i < ndim; ++i) {
+        if (i == dim) {
+            if (src_shape[i] != idx_len) {
+                throw std::runtime_error("IndexCopy: Source dimension mismatch.");
+            }
+        } else {
+            if (src_shape[i] != in_shape[i]) {
+                throw std::runtime_error("IndexCopy: Source non-index dimension mismatch.");
+            }
+        }
+    }
+}
+
+// Out-of-place interface.
+Tensor index_copy(Tensor input, int64_t dim, Tensor index, Tensor source) {
+    check_index_copy_args(input, dim, index, source);
+
+    Tensor output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    output->copy_from(input);
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+    IndexCopy::execute(output, output, dim, index, source);
+
+    return output;
+}
+
+// In-place / explicit-output interface.
+void index_copy_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    check_index_copy_args(input, dim, index, source);
+
+    if (output->shape() != input->shape()) {
+        throw std::runtime_error("IndexCopy (In-place): Output shape must match Input shape.");
+    }
+
+    // If output and input are distinct tensors, seed output with input's data.
+    if (output.operator->() != input.operator->()) {
+        output->copy_from(input);
+    }
+
+    if (!index->is_contiguous()) {
+        index = index->contiguous();
+    }
+    if (!source->is_contiguous()) {
+        source = source->contiguous();
+    }
+
+    if (!output->is_contiguous()) {
+        // Strategy: copy -> compute -> copy back.
+        Tensor contiguous_out = output->contiguous();
+
+        IndexCopy::execute(contiguous_out, contiguous_out, dim, index, source);
+
+        // Write the result back into the original layout.
+        output->copy_from(contiguous_out);
+    } else {
+        IndexCopy::execute(output, input, dim, index, source);
+    }
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/index_copy/index_copy_infiniop.cc b/src/infinicore/ops/index_copy/index_copy_infiniop.cc
new file mode 100644
index 000000000..d5f1ea822
--- /dev/null
+++ b/src/infinicore/ops/index_copy/index_copy_infiniop.cc
@@ -0,0 +1,63 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/index_copy.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::index_copy_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopIndexCopyDescriptor_t> caches(
+    100,
+    [](infiniopIndexCopyDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyIndexCopyDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor source) {
+    size_t seed = hash_combine(output, input, dim, index, source);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopIndexCopyDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateIndexCopyDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            dim,
+            index->desc(),
+            source->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetIndexCopyWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopIndexCopy(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        index->data(),
+        source->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    IndexCopy::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::index_copy_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc
new file mode 100644
index 000000000..263e2d07c
--- /dev/null
+++ b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss.cc
@@ -0,0 +1,34 @@
+#include "infinicore/ops/smooth_l1_loss.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<SmoothL1Loss::schema> &SmoothL1Loss::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void SmoothL1Loss::execute(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, target, beta, reduction);
+}
+
+Tensor smooth_l1_loss(Tensor input, Tensor target, float beta, int64_t reduction) {
+    Shape output_shape;
+    if (reduction == 0) {
+        // Reduction::None -> output has the same shape as the input.
+        output_shape = input->shape();
+    } else {
+        // 'mean' / 'sum' -> scalar output.
+        output_shape = {};
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    smooth_l1_loss_(output, input, target, beta, reduction);
+    return output;
+}
+
+void smooth_l1_loss_(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    SmoothL1Loss::execute(output, input, target, beta, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc
new file mode 100644
index 000000000..5c994424a
--- /dev/null
+++ b/src/infinicore/ops/smooth_l1_loss/smooth_l1_loss_infiniop.cc
@@ -0,0 +1,65 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/smooth_l1_loss.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::smooth_l1_loss_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopSmoothL1LossDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSmoothL1LossDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySmoothL1LossDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target, float beta, int64_t reduction) {
+    size_t seed = hash_combine(output, input, target, beta, reduction);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSmoothL1LossDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSmoothL1LossDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            target->desc(),
+            beta,
+            static_cast<int>(reduction)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSmoothL1LossWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopSmoothL1Loss(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        target->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    SmoothL1Loss::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::smooth_l1_loss_impl::infiniop
\ No newline at end of file
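For reference, the per-element definition the operator implements (the standard Smooth L1 loss, assuming beta > 0) and its three reduction modes can be written compactly in torch; this is a validation sketch, not the backend implementation:

```python
import torch

def smooth_l1_reference(inp, tgt, beta=1.0, reduction=1):
    # Per element: 0.5 * d^2 / beta if |d| < beta, else |d| - 0.5 * beta.
    d = (inp - tgt).abs()
    loss = torch.where(d < beta, 0.5 * d * d / beta, d - 0.5 * beta)
    if reduction == 0:      # none -> same shape as input
        return loss
    if reduction == 1:      # mean -> scalar
        return loss.mean()
    return loss.sum()       # sum -> scalar
```

This also explains the shape logic in `smooth_l1_loss` above: only `reduction == 0` preserves the input shape; the other modes produce a scalar tensor.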
diff --git a/src/infinicore/ops/take/take.cc b/src/infinicore/ops/take/take.cc
new file mode 100644
index 000000000..478227d69
--- /dev/null
+++ b/src/infinicore/ops/take/take.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/take.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<Take::schema> &Take::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Execute: look up the kernel for the current device and run it.
+void Take::execute(Tensor output, Tensor input, Tensor indices) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, indices);
+}
+
+Tensor take(Tensor input, Tensor indices) {
+    // Note: the output shape follows indices, while dtype and device follow input.
+    auto output = Tensor::empty(indices->shape(), input->dtype(), input->device());
+
+    take_(output, input, indices);
+    return output;
+}
+
+void take_(Tensor output, Tensor input, Tensor indices) {
+    Take::execute(output, input, indices);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/take/take_infiniop.cc b/src/infinicore/ops/take/take_infiniop.cc
new file mode 100644
index 000000000..5ab506973
--- /dev/null
+++ b/src/infinicore/ops/take/take_infiniop.cc
@@ -0,0 +1,58 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/take.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::take_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopTakeDescriptor_t> caches(
+    100, // capacity
+    [](infiniopTakeDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyTakeDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor indices) {
+    size_t seed = hash_combine(output, input, indices);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopTakeDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor from the output, input, and indices descriptors.
+        INFINICORE_CHECK_ERROR(infiniopCreateTakeDescriptor(
+            context::getInfiniopHandle(output->device()), &desc,
+            output->desc(), input->desc(), indices->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetTakeWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // Run the kernel with the input and indices data pointers.
+    INFINICORE_CHECK_ERROR(infiniopTake(
+        desc,
+        workspace->data(), workspace_size,
+        output->data(), input->data(), indices->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    // Register this backend with the Take dispatcher.
+    Take::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::take_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..c4625c4c0 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -6,6 +6,11 @@
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
+#include "ops/hypot.hpp"
+#include "ops/index_add.hpp"
+#include "ops/index_copy.hpp"
+#include "ops/smooth_l1_loss.hpp"
+#include "ops/take.hpp"
 #include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
@@ -28,6 +33,11 @@ inline void bind(py::module &m) {
     bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
+    bind_hypot(m);
+    bind_index_add(m);
+    bind_index_copy(m);
+    bind_smooth_l1_loss(m);
+    bind_take(m);
     bind_rearrange(m);
     bind_rms_norm(m);
     bind_silu(m);
diff --git a/src/infinicore/pybind11/ops/hypot.hpp b/src/infinicore/pybind11/ops/hypot.hpp
new file mode 100644
index 000000000..b8de2d0b9
--- /dev/null
+++ b/src/infinicore/pybind11/ops/hypot.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/hypot.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_hypot(py::module &m) {
+    // Out-of-place binding: output = hypot(input, other).
+    m.def("hypot",
+          &op::hypot,
+          py::arg("input"),
+          py::arg("other"),
+          R"doc(Computes the hypotenuse of input and other, i.e. sqrt(input^2 + other^2).)doc");
+
+    // Explicit-output binding: hypot_(output, input, other).
+    m.def("hypot_",
+          &op::hypot_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("other"),
+          R"doc(In-place hypot operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/index_add.hpp b/src/infinicore/pybind11/ops/index_add.hpp
new file mode 100644
index 000000000..3731559ec
--- /dev/null
+++ b/src/infinicore/pybind11/ops/index_add.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/index_add.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_index_add(py::module &m) {
+    m.def("index_add",
+          &op::index_add,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          py::arg("alpha") = 1.0f,
+          R"doc(Accumulates elements of source into input at the positions given in index.
+          Formula: output[index[i]] = input[index[i]] + alpha * source[i])doc");
+    m.def("index_add_",
+          &op::index_add_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          py::arg("alpha") = 1.0f,
+          R"doc(In-place version of index_add. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/index_copy.hpp b/src/infinicore/pybind11/ops/index_copy.hpp
new file mode 100644
index 000000000..39712e700
--- /dev/null
+++ b/src/infinicore/pybind11/ops/index_copy.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/index_copy.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_index_copy(py::module &m) {
+    // Out-of-place version (returns a new tensor).
+    m.def("index_copy",
+          &op::index_copy,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          R"doc(Copies elements of source into input at the positions given in index.
+          Formula: output[index[i]] = source[i])doc");
+    m.def("index_copy_",
+          &op::index_copy_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("index"),
+          py::arg("source"),
+          R"doc(In-place version of index_copy. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/smooth_l1_loss.hpp b/src/infinicore/pybind11/ops/smooth_l1_loss.hpp
new file mode 100644
index 000000000..eac332683
--- /dev/null
+++ b/src/infinicore/pybind11/ops/smooth_l1_loss.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/smooth_l1_loss.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_smooth_l1_loss(py::module &m) {
+    // Out-of-place binding: output = smooth_l1_loss(input, target, beta, reduction).
+    m.def("smooth_l1_loss",
+          &op::smooth_l1_loss,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("beta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Computes the Smooth L1 loss between input and target.
+
+          Args:
+              input (Tensor): Predicted values.
+              target (Tensor): Ground truth values.
+              beta (float, optional): The threshold at which to change between L1 and L2 loss. Default: 1.0.
+              reduction (int, optional): The reduction to apply to the output: 0=none, 1=mean, 2=sum. Default: 1.
+          )doc");
+    m.def("smooth_l1_loss_",
+          &op::smooth_l1_loss_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("beta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Explicit-output Smooth L1 loss operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/take.hpp b/src/infinicore/pybind11/ops/take.hpp
new file mode 100644
index 000000000..c62596451
--- /dev/null
+++ b/src/infinicore/pybind11/ops/take.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/take.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_take(py::module &m) {
+    m.def("take",
+          &op::take,
+          py::arg("input"),
+          py::arg("indices"),
+          R"doc(Extracts elements from the input tensor at the given indices.
+The input tensor is treated as a flattened 1-D array.)doc");
+    m.def("take_",
+          &op::take_,
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("indices"),
+          R"doc(Explicit-output take operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.cc b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc
new file mode 100644
index 000000000..766eda851
--- /dev/null
+++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc
@@ -0,0 +1,63 @@
+// Hypot-specific CPU implementation.
+#include "hypot_cpu.h"
+
+namespace op::hypot::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+    const auto &output_shape = out_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    // Both input shapes must match the output shape.
+    CHECK_SAME_SHAPE(output_shape, input_a_desc->shape());
+    CHECK_SAME_SHAPE(output_shape, input_b_desc->shape());
+
+    // Create the CPU elementwise descriptor.
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<HypotOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<HypotOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<HypotOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<HypotOp, double>(_info, output, inputs, stream);
+    default:
+        // Integer and other unsupported dtypes are rejected.
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hypot::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.h b/src/infiniop/ops/hypot/cpu/hypot_cpu.h
new file mode 100644
index 000000000..63575d331
--- /dev/null
+++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __HYPOT_CPU_H__
+#define __HYPOT_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(hypot, cpu)
+
+#include <cmath>
+#include <type_traits>
+
+namespace op::hypot::cpu {
+
+struct HypotOp {
+public:
+    // Hypot is a binary operator: sqrt(x^2 + y^2).
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &x, const T &y) const {
+        if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+            return std::hypot(x, y);
+        } else {
+            // fp16 / bf16: compute in float and convert back.
+            return static_cast<T>(std::hypot(static_cast<float>(x), static_cast<float>(y)));
+        }
+    }
+};
+
+} // namespace op::hypot::cpu
+
+#endif // __HYPOT_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh
new file mode 100644
index 000000000..377c557c0
--- /dev/null
+++ b/src/infiniop/ops/hypot/cuda/kernel.cuh
@@ -0,0 +1,64 @@
+#ifndef __HYPOT_CUDA_H__
+#define __HYPOT_CUDA_H__
+
+#include <cstddef>
+#include <type_traits>
+#if ENABLE_METAX_API
+    // Assumed MACA half/bfloat16 intrinsics headers.
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat16 = __maca_bfloat16;
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+namespace op::hypot::cuda {
+
+struct HypotOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const T &y) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return sqrtf(fmaf(x, x, y * y));
+        } else if constexpr (std::is_same_v<T, half2>) {
+            half2 sq_sum = __hfma2(x, x, __hmul2(y, y));
+            return h2sqrt(sq_sum);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hsqrt(__hfma(x, x, __hmul(y, y)));
+        } else if constexpr (std::is_same_v<T, nv_bfloat162>) {
+            // Unpack, compute in float, repack.
+            float f0_x = __bfloat162float(__low2bfloat16(x));
+            float f1_x = __bfloat162float(__high2bfloat16(x));
+            float f0_y = __bfloat162float(__low2bfloat16(y));
+            float f1_y = __bfloat162float(__high2bfloat16(y));
+            float res0 = sqrtf(fmaf(f0_x, f0_x, f0_y * f0_y));
+            float res1 = sqrtf(fmaf(f1_x, f1_x, f1_y * f1_y));
+
+            return __floats2bfloat162_rn(res0, res1);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            float fx = __bfloat162float(x);
+            float fy = __bfloat162float(y);
+            return __float2bfloat16(sqrtf(fmaf(fx, fx, fy * fy)));
+        } else if constexpr (std::is_same_v<T, double>) {
+            return sqrt(fma(x, x, y * y));
+        } else {
+            return static_cast<T>(sqrt(fma(static_cast<double>(x), static_cast<double>(x), static_cast<double>(y) * static_cast<double>(y))));
+        }
+    }
+};
+
+} // namespace op::hypot::cuda
+
+#endif // __HYPOT_CUDA_H__
\ No newline at end of file
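A note on the two formulations that appear across these kernels: `sqrtf(fmaf(x, x, y * y))` is fast but squares the operands first, so it can overflow (or underflow) for extreme magnitudes, whereas a true `hypot` implementation rescales internally. A quick illustration in Python, where floats are IEEE doubles, so the same effect shows up at larger magnitudes:

```python
import math

x = y = 1e200
print(math.sqrt(x * x + y * y))  # inf: x*x overflows before the sqrt
print(math.hypot(x, y))          # 1.4142...e+200: rescaled, no overflow
```

For fp16/bf16 inputs the kernels compute in float, so the naive form is safe there; the float/double fast paths trade this edge-case robustness for speed, while the Moore kernel below uses `::hypotf` and keeps it.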
diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.h b/src/infiniop/ops/hypot/metax/hypot_metax.h
new file mode 100644
index 000000000..a7a4e7d11
--- /dev/null
+++ b/src/infiniop/ops/hypot/metax/hypot_metax.h
@@ -0,0 +1,9 @@
+#ifndef __HYPOT_METAX_API_H__
+#define __HYPOT_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+// Declare the elementwise descriptor for hypot on the metax backend.
+ELEMENTWISE_DESCRIPTOR(hypot, metax)
+
+#endif // __HYPOT_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/metax/hypot_metax.maca b/src/infiniop/ops/hypot/metax/hypot_metax.maca
new file mode 100644
index 000000000..ed9d6548a
--- /dev/null
+++ b/src/infiniop/ops/hypot/metax/hypot_metax.maca
@@ -0,0 +1,78 @@
+#include "hypot_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::hypot::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Hypot is a binary operator: exactly two inputs are required.
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+
+    const auto &out_shape = out_desc->shape();
+    const auto &in_a_shape = input_a_desc->shape();
+    const auto &in_b_shape = input_b_desc->shape();
+
+    // Hypot supports floating-point dtypes only.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // Shape consistency checks. If the framework ever supports broadcasting,
+    // these strict equality checks would need to be relaxed; for now this
+    // mirrors the other elementwise ops.
+    CHECK_SAME_SHAPE(out_shape, in_a_shape);
+    CHECK_SAME_SHAPE(out_shape, in_b_shape);
+
+    // Create the metax elementwise descriptor; the macro consumes all input
+    // descriptors in input_desc_vec.
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Dispatch on dtype using the shared cuda::HypotOp functor,
+    // which computes sqrt(x*x + y*y).
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HypotOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    // Hypot returns floating-point results; integer inputs would need to be
+    // converted to a floating dtype first, so the supported list stays float-only.
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hypot::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore.h b/src/infiniop/ops/hypot/moore/hypot_moore.h
new file mode 100644
index 000000000..026c6d691
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore.h
@@ -0,0 +1,7 @@
+#ifndef __HYPOT_MOORE_API_H__
+#define __HYPOT_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+ELEMENTWISE_DESCRIPTOR(hypot, moore)
+
+#endif // __HYPOT_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore.mu b/src/infiniop/ops/hypot/moore/hypot_moore.mu
new file mode 100644
index 000000000..cc128b0ad
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore.mu
@@ -0,0 +1,77 @@
+#include "hypot_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+#include "hypot_moore_kernel.h"
+
+namespace op::hypot::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Hypot is a binary operator (z = hypot(x, y) = sqrt(x^2 + y^2)),
+    // so exactly two inputs are required.
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const auto &in_desc_0 = input_desc_vec.at(0);
+    const auto &in_desc_1 = input_desc_vec.at(1);
+
+    const auto &out_shape = out_desc->shape();
+    const auto &in_shape_0 = in_desc_0->shape();
+    const auto &in_shape_1 = in_desc_1->shape();
+
+    // Hypot supports floating-point dtypes.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // Check that the output shape matches both input shapes.
+    CHECK_SAME_SHAPE(out_shape, in_shape_0);
+    CHECK_SAME_SHAPE(out_shape, in_shape_1);
+
+    // Create the MOORE elementwise descriptor.
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Safety check for the input count in the calculate phase.
+    if (inputs.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    // Use the moore::HypotOp template defined in hypot_moore_kernel.h.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::HypotOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hypot::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h b/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h
new file mode 100644
index 000000000..2f2cb8e9b
--- /dev/null
+++ b/src/infiniop/ops/hypot/moore/hypot_moore_kernel.h
@@ -0,0 +1,60 @@
+#ifndef __HYPOT_MOORE_KERNEL_H__
+#define __HYPOT_MOORE_KERNEL_H__
+
+// Assumed MUSA half/bfloat16 intrinsics headers.
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <type_traits>
+
+namespace op::hypot::moore {
+
+struct HypotOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, const T &y) const {
+        // -----------------------------------------------------------------
+        // 1. Half2
+        // -----------------------------------------------------------------
+        if constexpr (std::is_same_v<T, half2>) {
+            float x_low = __low2float(x);
+            float x_high = __high2float(x);
+            float y_low = __low2float(y);
+            float y_high = __high2float(y);
+            return __floats2half2_rn(::hypotf(x_low, y_low), ::hypotf(x_high, y_high));
+        }
+        // -----------------------------------------------------------------
+        // 2. Half
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, half>) {
+            return __float2half(::hypotf(__half2float(x), __half2float(y)));
+        }
+        // -----------------------------------------------------------------
+        // 3. Bfloat16 (__mt_bfloat16)
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            // Convert __mt_bfloat16 to float for the computation...
+            float x_f = __bfloat162float(x);
+            float y_f = __bfloat162float(y);
+
+            // ...and convert the result back to __mt_bfloat16.
+            return __float2bfloat16(::hypotf(x_f, y_f));
+        }
+        // -----------------------------------------------------------------
+        // 4. Float32
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, float>) {
+            return ::hypotf(x, y);
+        }
+        // -----------------------------------------------------------------
+        // 5. Double / Other
+        // -----------------------------------------------------------------
+        else {
+            return ::hypot(x, y);
+        }
+    }
+};
+
+} // namespace op::hypot::moore
+
+#endif // __HYPOT_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu
new file mode 100644
index 000000000..a8a9bc198
--- /dev/null
+++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu
@@ -0,0 +1,78 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "hypot_nvidia.cuh"
+
+namespace op::hypot::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    if (input_desc_vec.size() != 2) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    const auto &input_a_desc = input_desc_vec.at(0);
+    const auto &input_b_desc = input_desc_vec.at(1);
+    const auto &output_shape = out_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_a_desc->shape());
+    CHECK_SAME_SHAPE(output_shape, input_b_desc->shape());
+
+    // Create the CUDA elementwise descriptor.
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // -----------------------------------------------------------
+    // Dispatch on dtype using cuda::HypotOp (the functor defined
+    // in ../cuda/kernel.cuh).
+    // -----------------------------------------------------------
+    switch (_dtype) {
+    // === Floating-point dtypes ===
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HypotOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HypotOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HypotOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HypotOp, double>(_info, workspace, output, inputs, stream);
+
+    // Integer dtypes are intentionally not supported.
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hypot::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh
new file mode 100644
index 000000000..7ff5a11b1
--- /dev/null
+++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __HYPOT_NVIDIA_CUH__
+#define __HYPOT_NVIDIA_CUH__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(hypot, nvidia)
+
+#endif // __HYPOT_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/hypot/operator.cc b/src/infiniop/ops/hypot/operator.cc
new file mode 100644
index 000000000..f056ea8d8
--- /dev/null
+++ b/src/infiniop/ops/hypot/operator.cc
@@ -0,0 +1,167 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/hypot.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/hypot_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hypot_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/hypot_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hypot_moore.h" +#endif + +extern "C" { +__C infiniStatus_t infiniopCreateHypotDescriptor( + infiniopHandle_t handle, + infiniopHypotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input_x, + infiniopTensorDescriptor_t input_y) { + + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hypot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + output, \ + {input_x, input_y}) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + + +__C infiniStatus_t infiniopGetHypotWorkspaceSize(infiniopHypotDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + + +__C infiniStatus_t infiniopHypot( + infiniopHypotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input_x, + const void *input_y, + void *stream) { + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input_x, input_y}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + + +__C infiniStatus_t infiniopDestroyHypotDescriptor(infiniopHypotDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + 
#endif
+    #ifdef ENABLE_METAX_API
+    DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cpu/index_add_cpu.cc b/src/infiniop/ops/index_add/cpu/index_add_cpu.cc
new file mode 100644
index 000000000..8b7551561
--- /dev/null
+++ b/src/infiniop/ops/index_add/cpu/index_add_cpu.cc
@@ -0,0 +1,151 @@
+#include "index_add_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include
+#include
+#include
+#include // required for std::conditional
+
+namespace op::index_add::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc,
+    float alpha) {
+
+    auto handle = reinterpret_cast(handle_);
+
+    auto result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // workspace size
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename TData, typename TIdx>
+void calculate_cpu_impl(
+    const IndexAddInfo &info,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source) {
+
+    // Accumulate in double for F64 data, otherwise in float
+    using CalcType = typename std::conditional<std::is_same<TData, double>::value, double, float>::type;
+
+    // Convert alpha (utils::cast handles the custom fp16/bf16 types)
+    CalcType alpha_val = utils::cast<CalcType>(info.alpha());
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    auto out_ptr = reinterpret_cast<TData *>(output);
+    auto src_ptr = reinterpret_cast<const TData *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(index);
+
+    for (size_t o = 0; o < outer_size; ++o) {
+        for (size_t i = 0; i < index_len; ++i) {
+
+            TIdx idx = idx_ptr[i];
+
+            // Handle negative indices
+            if (idx < 0) idx += static_cast<TIdx>(dim_size);
+
+            // Bounds check
+            if (idx < 0 || static_cast<size_t>(idx) >= dim_size) {
+                continue;
+            }
+
+            // Compute the offsets
+            size_t src_offset = o * index_len * inner_size + i * inner_size;
+            size_t out_offset = o * dim_size * inner_size + static_cast<size_t>(idx) * inner_size;
+
+            // Loop over the inner dimension
+            for (size_t in = 0; in < inner_size; ++in) {
+                CalcType src_val = utils::cast<CalcType>(src_ptr[src_offset + in]);
+                CalcType out_old_val = utils::cast<CalcType>(out_ptr[out_offset + in]);
+
+                // 2. Compute: out = out + src * alpha
+                CalcType result_val = out_old_val + src_val * alpha_val;
+
+                // 3. Cast back to TData via utils::cast and store
+                out_ptr[out_offset + in] = utils::cast<TData>(result_val);
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    #define DISPATCH(TDATA, TIDX) \
+        calculate_cpu_impl<TDATA, TIDX>(_info, output, input, index, source); \
+        return INFINI_STATUS_SUCCESS
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int32_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int32_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int32_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int32_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int32_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int32_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int64_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int64_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int64_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int64_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int64_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int64_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+
+    #undef DISPATCH
+}
+
+} // namespace op::index_add::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cpu/index_add_cpu.h b/src/infiniop/ops/index_add/cpu/index_add_cpu.h
new file mode 100644
index 000000000..e916399b1
--- /dev/null
+++ b/src/infiniop/ops/index_add/cpu/index_add_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __INDEX_ADD_CPU_H__
+#define __INDEX_ADD_CPU_H__
+
+#include "../index_add.h"
+DESCRIPTOR(cpu)
+
+#endif // __INDEX_ADD_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/cuda/kernel.cuh b/src/infiniop/ops/index_add/cuda/kernel.cuh
new file mode 100644
index 000000000..be5b7ce05
--- /dev/null
+++ b/src/infiniop/ops/index_add/cuda/kernel.cuh
@@ -0,0 +1,200 @@
+#ifndef __INDEX_ADD_CUDA_H__
+#define __INDEX_ADD_CUDA_H__
+
+#if ENABLE_METAX_API
+#include
+#include
+#define __nv_bfloat16 __maca_bfloat16
+#define __nv_bfloat162 __maca_bfloat162
+#else
+    #include
+    #include
+    #include
+#endif
+#include
+
+namespace op::index_add::cuda {
+
+__device__ __forceinline__ void atomic_add_custom(__half* address, __half val) {
+#if __CUDA_ARCH__ >= 700
+    atomicAdd(address, val);
+#else
+    // Fallback for older architectures (< Volta): emulate the 16-bit add
+    // with a CAS loop on the aligned 32-bit word that contains the value
+    unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        __half old_val = *reinterpret_cast<__half*>(&old_val_raw);
+
+        __half new_val = old_val + val;
+        unsigned short new_val_raw = *reinterpret_cast<unsigned short*>(&new_val);
+
+        unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
+                                                   : (old & 0xffff0000) | new_val_raw;
+
+        old = atomicCAS(address_as_ui, assumed, new_int);
+    } while (assumed != old);
+#endif
+}
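The word-packing in the fallback above is the standard trick for 16-bit atomics on hardware that only supports 32-bit CAS: the half lives in either the low or high lane of its aligned 32-bit container, selected by bit 1 of the address. A standalone host-side sketch (not part of the patch) of the lane-replacement step:

```cpp
#include <cstdint>
#include <cstdio>

// Replace one 16-bit lane inside an aligned 32-bit word, as the CAS loop does.
uint32_t replace_lane(uint32_t word, uint16_t lane_value, bool high_lane) {
    return high_lane ? (word & 0x0000ffffu) | (uint32_t(lane_value) << 16)
                     : (word & 0xffff0000u) | lane_value;
}

int main() {
    uint32_t word = 0xAAAABBBBu;
    std::printf("%08x\n", replace_lane(word, 0x1234, /*high_lane=*/true));  // 1234bbbb
    std::printf("%08x\n", replace_lane(word, 0x1234, /*high_lane=*/false)); // aaaa1234
    return 0;
}
```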
+__device__ __forceinline__ void atomic_add_custom(__nv_bfloat16* address, __nv_bfloat16 val) {
+#if __CUDA_ARCH__ >= 800
+    atomicAdd(address, val);
+#else
+    // Fallback for older architectures (< Ampere)
+    unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short old_val_raw = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        __nv_bfloat16 old_val = *reinterpret_cast<__nv_bfloat16*>(&old_val_raw);
+
+        __nv_bfloat16 new_val = old_val + val;
+        unsigned short new_val_raw = *reinterpret_cast<unsigned short*>(&new_val);
+
+        unsigned int new_int = (size_t)address & 2 ? (old & 0xffff) | (new_val_raw << 16)
+                                                   : (old & 0xffff0000) | new_val_raw;
+
+        old = atomicCAS(address_as_ui, assumed, new_int);
+    } while (assumed != old);
+#endif
+}
+
+__device__ __forceinline__ void atomic_add_custom(int64_t* address, int64_t val) {
+    atomicAdd(reinterpret_cast<unsigned long long*>(address), static_cast<unsigned long long>(val));
+}
+
+// --- Generic template (float, double, int32, ...) ---
+template <typename T>
+__device__ __forceinline__ void atomic_add_custom(T* address, T val) {
+    atomicAdd(address, val);
+}
+
+// ==================================================================
+// 2. Aligned vectorized pack
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 3. Scalar kernel (generic fallback)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void index_add_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size, // product of the dims left of dim
+    size_t inner_size, // product of the dims right of dim
+    size_t dim_size,   // extent of output along dim
+    size_t index_len,  // index length (= source extent along dim)
+    size_t num_source, // total number of source elements
+    float alpha        // scale factor
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    T alpha_val = static_cast<T>(alpha);
+
+    // Grid-stride loop over the source tensor
+    for (size_t i = tid; i < num_source; i += stride) {
+        // 1. Convert the linear index i to logical coordinates (outer, idx_idx, inner)
+        //    Source shape: [Outer, IndexLen, Inner]
+        size_t inner_idx = i % inner_size;
+        size_t tmp = i / inner_size;
+        size_t idx_idx = tmp % index_len;
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the index value
+        TIdx target_dim_idx = indices[idx_idx];
+
+        // 3. Handle negative indices (defensive)
+        if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+        // 4. Bounds check and atomic accumulate
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Compute the linear output offset
+            // Output shape: [Outer, DimSize, Inner]
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // Use the custom atomic op
+            atomic_add_custom(&output[out_offset], source[i] * alpha_val);
+        }
+    }
+}
+
+// ==================================================================
+// 4. Vectorized kernel (optimizes read bandwidth)
+// ==================================================================
+template <typename T, typename TIdx, int PackSize>
+__global__ void index_add_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size,
+    size_t inner_size,
+    size_t dim_size,
+    size_t index_len,
+    size_t num_packs, // number of packs in source
+    float alpha
+) {
+    // Reinterpret source as a Pack pointer for vectorized loads
+    using PackType = Pack<T, PackSize>;
+    const PackType *src_vec = reinterpret_cast<const PackType *>(source);
+
+    T alpha_val = static_cast<T>(alpha);
+
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        // Vectorized load (LDG.128)
+        PackType reg_pack = src_vec[i];
+
+        // Starting linear index of this pack within source
+        size_t base_idx = i * PackSize;
+
+        // Unrolled loop over every element in the pack
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            size_t curr_src_idx = base_idx + k;
+
+            // 1. Coordinate transform
+            size_t inner_idx = curr_src_idx % inner_size;
+            size_t tmp = curr_src_idx / inner_size;
+            size_t idx_idx = tmp % index_len;
+            size_t outer_idx = tmp / index_len;
+
+            // 2. Read the index
+            TIdx target_dim_idx = indices[idx_idx];
+
+            if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+            // 3. Atomic accumulate
+            if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+                size_t out_offset = outer_idx * (dim_size * inner_size)
+                                  + static_cast<size_t>(target_dim_idx) * inner_size
+                                  + inner_idx;
+
+                // Use the custom atomic op
+                atomic_add_custom(&output[out_offset], reg_pack.val[k] * alpha_val);
+            }
+        }
+    }
+}
+
+} // namespace op::index_add::cuda
+
+#endif // __INDEX_ADD_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/index_add.h b/src/infiniop/ops/index_add/index_add.h
new file mode 100644
index 000000000..1622b1231
--- /dev/null
+++ b/src/infiniop/ops/index_add/index_add.h
@@ -0,0 +1,52 @@
+#ifndef __INDEX_ADD_H__
+#define __INDEX_ADD_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the IndexAddInfo definition (dim, alpha, geometry)
+#define DESCRIPTOR(NAMESPACE)                                 \
+                                                              \
+    namespace op::index_add::NAMESPACE {                      \
+    class Descriptor final : public InfiniopDescriptor {      \
+        struct Opaque;                                        \
+        Opaque *_opaque;                                      \
+        IndexAddInfo _info;                                   \
+        size_t _workspace_size;                               \
+                                                              \
+        Descriptor(                                           \
+            Opaque *opaque,                                   \
+            IndexAddInfo info,                                \
+            size_t workspace_size,                            \
+            infiniDevice_t device_type,                       \
+            int device_id)                                    \
+            : InfiniopDescriptor{device_type, device_id},     \
+              _opaque(opaque),                                \
+              _info(info),                                    \
+              _workspace_size(workspace_size) {}              \
+                                                              \
+    public:                                                   \
+        ~Descriptor();                                        \
+                                                              \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                              \
+        static infiniStatus_t create(                         \
+            infiniopHandle_t handle,                          \
+            Descriptor **desc_ptr,                            \
+            infiniopTensorDescriptor_t out_desc,              \
+            infiniopTensorDescriptor_t in_desc,               \
+            int64_t dim,                                      \
+            infiniopTensorDescriptor_t index_desc,            \
+            infiniopTensorDescriptor_t source_desc,           \
+            float alpha);                                     \
+                                                              \
+        infiniStatus_t calculate(                             \
+            void *workspace,                                  \
+            size_t workspace_size,                            \
+            void *output,                                     \
+            const void *input,                                \
+            const void *index,                                \
+            const void *source,                               \
+            void *stream) const;                              \
+    };                                                        \
+    }
+
+#endif // __INDEX_ADD_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/info.h b/src/infiniop/ops/index_add/info.h
new file mode 100644
index 000000000..66602818f
--- /dev/null
+++ b/src/infiniop/ops/index_add/info.h
@@ -0,0 +1,141 @@
+#ifndef __INDEX_ADD_INFO_H__
+#define __INDEX_ADD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::index_add {
+
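+// A concrete example of the geometry captured below: for input shape
+// [4, 5, 6] with dim = 1 and an index of length 3, outer_size = 4,
+// dim_size = 5, inner_size = 6, index_len = 3, and source must have
+// shape [4, 3, 6]; source element (o, i, in) accumulates into output
+// offset o * (5 * 6) + index[i] * 6 + in.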
+class IndexAddInfo {
+    IndexAddInfo() = default;
+
+public:
+    int _dtype;         // data type (input/output/source)
+    int _idx_dtype;     // index type (int32, int64)
+    int64_t _dim;       // dimension the op works along
+    float _alpha;       // scale factor
+
+    // [Added] geometry used to compute memory offsets
+    size_t _outer_size; // product of the dims left of dim
+    size_t _inner_size; // product of the dims right of dim
+    size_t _dim_size;   // extent of input/output along dim
+    size_t _index_len;  // length of the index tensor
+
+    // [Changed] constructor initializing the new members
+    IndexAddInfo(int dtype, int idx_dtype, int64_t dim, float alpha,
+                 size_t outer_size, size_t inner_size, size_t dim_size, size_t index_len)
+        : _dtype(dtype), _idx_dtype(idx_dtype), _dim(dim), _alpha(alpha),
+          _outer_size(outer_size), _inner_size(inner_size), _dim_size(dim_size), _index_len(index_len) {}
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    int64_t dim() const { return _dim; }
+    float alpha() const { return _alpha; }
+
+    // [Added] getters
+    size_t outer_size() const { return _outer_size; }
+    size_t inner_size() const { return _inner_size; }
+    size_t dim_size() const { return _dim_size; }
+    size_t index_len() const { return _index_len; }
+
+    static utils::Result<IndexAddInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        int64_t dim,
+        infiniopTensorDescriptor_t index_desc,
+        infiniopTensorDescriptor_t source_desc,
+        float alpha) {
+
+        // 1. Check dtype consistency (output vs input vs source)
+        int dtype = in_desc->dtype();
+        if (out_desc->dtype() != dtype || source_desc->dtype() != dtype) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Check the index dtype
+        int idx_dtype = index_desc->dtype();
+        if (idx_dtype != INFINI_DTYPE_I32 && idx_dtype != INFINI_DTYPE_I64) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Check the dimension is valid
+        int64_t ndim = static_cast<int64_t>(in_desc->ndim());
+        if (dim < 0 || dim >= ndim) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        // 4. Check the index shape
+        if (index_desc->ndim() != 1) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        // [Added] compute the geometry
+        const auto &in_shape = in_desc->shape();
+
+        // outer_size: product of all dims before dim
+        size_t outer_size = 1;
+        for (int64_t i = 0; i < dim; ++i) {
+            outer_size *= in_shape[i];
+        }
+
+        // inner_size: product of all dims after dim (i.e. the stride)
+        size_t inner_size = 1;
+        for (int64_t i = dim + 1; i < ndim; ++i) {
+            inner_size *= in_shape[i];
+        }
+
+        // dim_size
+        size_t dim_size = in_shape[dim];
+
+        // index_len
+        size_t index_len = index_desc->shape()[0];
+
+        // 5. Check source shape consistency
+        //    Rule: [Outer, IndexLen, Inner]
+        if (source_desc->ndim() != in_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &src_shape = source_desc->shape();
+
+        for (int64_t i = 0; i < ndim; ++i) {
+            if (i == dim) {
+                if (src_shape[i] != index_len) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            } else {
+                if (src_shape[i] != in_shape[i]) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            }
+        }
+
+        // 6. Check that the output and input shapes match
+        if (out_desc->ndim() != in_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        const auto &out_shape = out_desc->shape();
+        for (int64_t i = 0; i < ndim; ++i) {
+            if (out_shape[i] != in_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 7. Return the Info object (with the precomputed geometry)
+        return utils::Result<IndexAddInfo>(IndexAddInfo{
+            dtype,
+            idx_dtype,
+            dim,
+            alpha,
+            outer_size,
+            inner_size,
+            dim_size,
+            index_len});
+    }
+};
+
+} // namespace op::index_add
+
+#endif // __INDEX_ADD_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.h b/src/infiniop/ops/index_add/metax/index_add_metax.h
new file mode 100644
index 000000000..99e2dee3e
--- /dev/null
+++ b/src/infiniop/ops/index_add/metax/index_add_metax.h
@@ -0,0 +1,8 @@
+#ifndef __INDEX_ADD_METAX_H__
+#define __INDEX_ADD_METAX_H__
+
+#include "../index_add.h"
+
+DESCRIPTOR(metax)
+
+#endif // __INDEX_ADD_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/metax/index_add_metax.maca b/src/infiniop/ops/index_add/metax/index_add_metax.maca
new file mode 100644
index 000000000..ac7fbaf48
--- /dev/null
+++ b/src/infiniop/ops/index_add/metax/index_add_metax.maca
@@ -0,0 +1,345 @@
+#include "index_add_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::index_add::metax {
+
+// ==================================================================
+// Atomic Helpers
+// ==================================================================
+
+template <typename T>
+__device__ __forceinline__ void gpuAtomicAdd(T* address, T val) {
+    atomicAdd(address, val);
+}
+
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(
+    int64_t* address,
+    int64_t val)
+{
+    atomicAdd(
+        reinterpret_cast<unsigned long long*>(address),
+        static_cast<unsigned long long>(val));
+}
+
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(
+    __maca_bfloat16* address,
+    __maca_bfloat16 val)
+{
+    unsigned int* addr =
+        (unsigned int*)((char*)address - ((size_t)address & 2));
+
+    unsigned int old = *addr;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+
+        unsigned short old_val =
+            ((size_t)address & 2)
+                ? (assumed >> 16)
+                : (assumed & 0xFFFF);
+
+        __maca_bfloat16 sum =
+            (__maca_bfloat16)(
+                (float)*reinterpret_cast<__maca_bfloat16*>(&old_val)
+                + (float)val);
+
+        unsigned short res =
+            *reinterpret_cast<unsigned short*>(&sum);
+
+        old = atomicCAS(
+            addr,
+            assumed,
+            ((size_t)address & 2) ?
((assumed & 0xFFFF) | (res << 16)) + : ((assumed & 0xFFFF0000) | res)); + } while (assumed != old); +} + +// ================================================================== +// Kernel +// ================================================================== + +template +__global__ void index_add_kernel( + T* output, + const T* source, + const TIdx* indices, + int outer_size, + int inner_size, + int index_size, + int dim_size, + T alpha) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_source = outer_size * index_size * inner_size; + + if (idx >= total_source) return; + + int inner = idx % inner_size; + int temp = idx / inner_size; + int i = temp % index_size; + int outer = temp / index_size; + + TIdx idx_pos = indices[i]; + if (idx_pos < 0) idx_pos += dim_size; + + if (idx_pos >= 0 && idx_pos < dim_size) { + int out_offset = + outer * (dim_size * inner_size) + + idx_pos * inner_size + + inner; + + gpuAtomicAdd( + output + out_offset, + static_cast(source[idx] * alpha)); + } +} + +// ================================================================== +// Kernel Launcher +// ================================================================== + +template +void launch_kernel_impl( + void* output, + const void* source, + const void* indices, + int outer, + int inner, + int idx_size, + int dim_s, + float alpha, + void* stream) +{ + auto hc_stream = reinterpret_cast(stream); + + size_t total = (size_t)outer * idx_size * inner; + size_t block = 256; + size_t grid = (total + block - 1) / block; + + index_add_kernel + <<>>( + reinterpret_cast(output), + reinterpret_cast(source), + reinterpret_cast(indices), + outer, + inner, + idx_size, + dim_s, + static_cast(alpha)); +} + +// ================================================================== +// Utilities +// ================================================================== + +static size_t get_element_size(int dtype) { + if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8; + if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4; + return 2; +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; + float alpha; + int64_t dim; + int outer_size; + int inner_size; + int index_size; + int dim_size; + size_t total_bytes; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + int64_t dim, + infiniopTensorDescriptor_t index_desc, + infiniopTensorDescriptor_t source_desc, + float alpha) +{ + auto handle = + reinterpret_cast(handle_); + + auto info_result = + IndexAddInfo::create( + out_desc, in_desc, dim, index_desc, source_desc, alpha); + + if (!info_result) { + return info_result.status(); + } + + auto out_d = + reinterpret_cast(out_desc); + auto idx_d = + reinterpret_cast(index_desc); + + int ndim = out_d->ndim(); + int64_t real_dim = dim < 0 ? 
dim + ndim : dim; + + int outer = 1; + for (int i = 0; i < real_dim; ++i) { + outer *= out_d->shape()[i]; + } + + int inner = 1; + for (int i = real_dim + 1; i < ndim; ++i) { + inner *= out_d->shape()[i]; + } + + int dim_s = out_d->shape()[real_dim]; + + int idx_s = 1; + for (int i = 0; i < idx_d->ndim(); ++i) { + idx_s *= idx_d->shape()[i]; + } + + size_t bytes = + (size_t)outer * dim_s * inner * + get_element_size(out_d->dtype()); + + auto opaque = + new Opaque{ + handle->internal(), + alpha, + dim, + outer, + inner, + idx_s, + dim_s, + bytes}; + + *desc_ptr = new Descriptor( + opaque, + info_result.take(), + 0, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + const void *source, + void *stream) const +{ + auto hc_stream = reinterpret_cast(stream); + + hcMemcpyAsync( + output, + input, + _opaque->total_bytes, + hcMemcpyDeviceToDevice, + hc_stream); + + auto dtype = _info.dtype(); + auto idx_dtype = _info.idx_dtype(); + + int outer = _opaque->outer_size; + int inner = _opaque->inner_size; + int dim_s = _opaque->dim_size; + int idx_sz = _opaque->index_size; + float alpha = _opaque->alpha; + +#define LAUNCH(T, TIdx) \ + launch_kernel_impl( \ + output, source, index, \ + outer, inner, idx_sz, dim_s, alpha, stream) + + if (idx_dtype == INFINI_DTYPE_I32) { + switch (dtype) { + + case INFINI_DTYPE_F16: + LAUNCH(__half, int32_t); + break; + + case INFINI_DTYPE_BF16: + LAUNCH(__maca_bfloat16, int32_t); + break; + + case INFINI_DTYPE_F32: + LAUNCH(float, int32_t); + break; + + case INFINI_DTYPE_F64: + LAUNCH(double, int32_t); + break; + + case INFINI_DTYPE_I32: + LAUNCH(int32_t, int32_t); + break; + + case INFINI_DTYPE_I64: + LAUNCH(int64_t, int32_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + } else if (idx_dtype == INFINI_DTYPE_I64) { + switch (dtype) { + + case INFINI_DTYPE_F16: + LAUNCH(__half, int64_t); + break; + + case INFINI_DTYPE_BF16: + LAUNCH(__maca_bfloat16, int64_t); + break; + + case INFINI_DTYPE_F32: + LAUNCH(float, int64_t); + break; + + case INFINI_DTYPE_F64: + LAUNCH(double, int64_t); + break; + + case INFINI_DTYPE_I32: + LAUNCH(int32_t, int64_t); + break; + + case INFINI_DTYPE_I64: + LAUNCH(int64_t, int64_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + +#undef LAUNCH + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::index_add::metax diff --git a/src/infiniop/ops/index_add/moore/index_add_moore.h b/src/infiniop/ops/index_add/moore/index_add_moore.h new file mode 100644 index 000000000..fd26e62a1 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore.h @@ -0,0 +1,11 @@ +#ifndef __INDEX_ADD_MOORE_API_H__ +#define __INDEX_ADD_MOORE_API_H__ + +// 引入上层定义的 Descriptor 宏和基础类 +#include "../index_add.h" + +// 使用 index_add.h 中定义的 DESCRIPTOR 宏 +// 这将自动生成 op::index_add::moore::Descriptor 类定义 +DESCRIPTOR(moore) + +#endif // __INDEX_ADD_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_add/moore/index_add_moore.mu b/src/infiniop/ops/index_add/moore/index_add_moore.mu new file mode 100644 index 000000000..3b69d35d1 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore.mu @@ -0,0 +1,203 @@ +#include "index_add_moore.h" +#include "index_add_moore_kernel.h" // 包含 IndexAddOp Functor 定义 + +#include +#include +#include +#include + +#include 
"../../../devices/moore/moore_handle.h" + +namespace op::index_add::moore { + +// ================================================================== +// 1. Kernel Wrapper +// ================================================================== + +// 这是一个 Global Kernel 包装器,它调用 index_add_moore_kernel.h 中的 IndexAddOp Functor +template +__global__ void index_add_kernel( + const size_t num_elements, // Source 的总元素数量 (线程任务总量) + const size_t index_len, // Index 向量长度 + const size_t inner_size, // stride + const size_t dim_size, // Output 在 dim 维度的长度 + const float alpha, // 缩放因子 + const T *source, + const TIdx *indices, + T *output) { + + // idx 对应 Source 张量的线性索引 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_elements) { + // 使用 Functor + IndexAddOp op; + op(idx, index_len, inner_size, dim_size, alpha, source, indices, output); + } +} + +// ================================================================== +// 2. Launcher Implementation +// ================================================================== + +template +void index_add_moore_launch( + const IndexAddInfo &info, + T *output, + const T *input, + const T *source, + const void *indices, // void* 传入,内部强转 + void *stream) { + + auto musa_stream = (musaStream_t)stream; + const TIdx *indices_ptr = static_cast(indices); + + // -------------------------------------------------------------- + // 步骤 1: Copy Input -> Output + // -------------------------------------------------------------- + // Output 初始化为 Input 的值。 + // 计算总元素数量: Output 形状与 Input 一致 + size_t total_out_elements = info.outer_size() * info.dim_size() * info.inner_size(); + + // 如果 input 和 output 指针不同,则执行拷贝 + if (output != input) { + musaMemcpyAsync(output, input, total_out_elements * sizeof(T), musaMemcpyDeviceToDevice, musa_stream); + } + + // -------------------------------------------------------------- + // 步骤 2: Scatter Add (Source -> Output) + // -------------------------------------------------------------- + // 线程并行度取决于 Source 的大小 + // Source 逻辑形状: [Outer, IndexLen, Inner] + size_t num_src_elements = info.outer_size() * info.index_len() * info.inner_size(); + + if (num_src_elements == 0) { + return; //以此避免空 Kernel Launch + } + + int threads = 256; + int blocks = (num_src_elements + threads - 1) / threads; + + index_add_kernel<<>>( + num_src_elements, + info.index_len(), + info.inner_size(), + info.dim_size(), + info.alpha(), + source, + indices_ptr, + output + ); +} + +// ================================================================== +// 3. 
Descriptor Implementation +// ================================================================== + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + int64_t dim, + infiniopTensorDescriptor_t index_desc, + infiniopTensorDescriptor_t source_desc, + float alpha) { + + auto handle = reinterpret_cast(handle_); + + // 使用 Info 类校验形状和类型,并预计算几何参数 + auto info_result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha); + + if (!info_result) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor( + nullptr, + *info_result, + 0, // No workspace needed + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + const void *source, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // -------------------------------------------------------------- + // 定义分发宏:解决 Data Type x Index Type 的组合爆炸 + // -------------------------------------------------------------- + #define LAUNCH_KERNEL(T) \ + do { \ + if (_info.idx_dtype() == INFINI_DTYPE_I32) { \ + index_add_moore_launch( \ + _info, \ + static_cast(output), \ + static_cast(input), \ + static_cast(source), \ + index, \ + stream); \ + } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \ + index_add_moore_launch( \ + _info, \ + static_cast(output), \ + static_cast(input), \ + static_cast(source), \ + index, \ + stream); \ + } else { \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } while (0) + + // -------------------------------------------------------------- + // 根据数据类型分发 + // -------------------------------------------------------------- + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + LAUNCH_KERNEL(half); + break; + + case INFINI_DTYPE_BF16: + LAUNCH_KERNEL(__mt_bfloat16); + break; + + case INFINI_DTYPE_F32: + LAUNCH_KERNEL(float); + break; + + case INFINI_DTYPE_F64: + LAUNCH_KERNEL(double); + break; + + // 如果需要支持整数类型的 AtomicAdd,需要确保 Kernel 中有对应特化 + // 这里仅示例浮点类型 + case INFINI_DTYPE_I32: + LAUNCH_KERNEL(int32_t); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + #undef LAUNCH_KERNEL + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::index_add::moore \ No newline at end of file diff --git a/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h b/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h new file mode 100644 index 000000000..e7460d711 --- /dev/null +++ b/src/infiniop/ops/index_add/moore/index_add_moore_kernel.h @@ -0,0 +1,130 @@ +#ifndef __INDEX_ADD_MOORE_KERNEL_H__ +#define __INDEX_ADD_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::index_add::moore { + + +template +__device__ __forceinline__ void atomic_add_func(T* address, T val) { + atomicAdd(address, val); +} + +template <> +__device__ __forceinline__ void atomic_add_func(half* address, half val) { + // 将地址重解释为 unsigned short* 以便进行位操作 + unsigned short* address_as_us = reinterpret_cast(address); + unsigned short old = *address_as_us; + unsigned short assumed; + + do { + assumed = old; + half sum = __float2half(__half2float(*reinterpret_cast(&assumed)) + + __half2float(val)); + + unsigned short sum_as_us = *reinterpret_cast(&sum); + + + old = atomicCAS(address_as_us, assumed, 
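+// The specializations below emulate atomicAdd for 16-bit types with an
+// atomicCAS retry loop: read the current 16-bit value, add in float,
+// and write back only if the slot has not changed in the meantime.
+// (This assumes the MUSA toolchain provides a 16-bit atomicCAS overload.)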
+template <>
+__device__ __forceinline__ void atomic_add_func(half* address, half val) {
+    // Reinterpret the address as unsigned short* for the bit-level CAS
+    unsigned short* address_as_us = reinterpret_cast<unsigned short*>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+
+    do {
+        assumed = old;
+        half sum = __float2half(__half2float(*reinterpret_cast<half*>(&assumed))
+                                + __half2float(val));
+
+        unsigned short sum_as_us = *reinterpret_cast<unsigned short*>(&sum);
+
+        old = atomicCAS(address_as_us, assumed, sum_as_us);
+
+    } while (assumed != old);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func<__mt_bfloat16>(__mt_bfloat16* address, __mt_bfloat16 val) {
+    unsigned short* address_as_us = reinterpret_cast<unsigned short*>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+
+    do {
+        assumed = old;
+        // BF16 -> Float -> Add -> BF16
+        float sum_f = __bfloat162float(*reinterpret_cast<__mt_bfloat16*>(&assumed))
+                    + __bfloat162float(val);
+
+        __mt_bfloat16 sum_bf = __float2bfloat16(sum_f);
+        unsigned short sum_as_us = *reinterpret_cast<unsigned short*>(&sum_bf);
+
+        old = atomicCAS(address_as_us, assumed, sum_as_us);
+
+    } while (assumed != old);
+}
+
+// ==================================================================
+// 2. Kernel Functor
+// ==================================================================
+
+typedef struct IndexAddOp {
+public:
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,   // Flattened index for Source
+        const size_t index_len,  // Length of Index tensor
+        const size_t inner_size, // Stride of inner dims
+        const size_t dim_size,   // Size of target dim in Output
+        const float alpha,       // Scale factor
+        const T* source,         // Source Tensor
+        const TIdx* indices,     // Index Tensor
+        T* output                // Output Tensor
+    ) const {
+
+        // --- 1. Decompose the linear index ---
+        size_t inner_idx = curr_idx % inner_size;
+        size_t tmp = curr_idx / inner_size;
+        size_t idx_in_indices = tmp % index_len; // which entry of the index tensor this element maps to
+        size_t outer_idx = tmp / index_len;
+
+        // --- 2. Read source and apply alpha ---
+        T src_val = source[curr_idx];
+        float val_f;
+
+        // Do the multiply in float for all types
+        if constexpr (std::is_same_v<T, half>) {
+            val_f = __half2float(src_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            val_f = __bfloat162float(src_val);
+        } else {
+            val_f = static_cast<float>(src_val);
+        }
+
+        val_f *= alpha;
+
+        // Convert back to T
+        T add_val;
+        if constexpr (std::is_same_v<T, half>) {
+            add_val = __float2half(val_f);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            add_val = __float2bfloat16(val_f);
+        } else {
+            add_val = static_cast<T>(val_f);
+        }
+
+        // --- 3. Read the index and compute the output offset ---
+        TIdx target_dim_idx = indices[idx_in_indices];
+
+        // Handle Python-style negative indices
+        if (target_dim_idx < 0) {
+            target_dim_idx += static_cast<TIdx>(dim_size);
+        }
+
+        // --- 4. Bounds check & atomic accumulate ---
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // output_offset = outer * (dim_size * inner) + target_idx * inner + inner_idx
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+            atomic_add_func(output + out_offset, add_val);
+        }
+    }
+
+} IndexAddOp;
+
+} // namespace op::index_add::moore
+
+#endif // __INDEX_ADD_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
new file mode 100644
index 000000000..619523f47
--- /dev/null
+++ b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cu
@@ -0,0 +1,164 @@
+#include "index_add_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include
+
+#include
+#include
+
+namespace op::index_add::nvidia {
+
+inline bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *source,
+    const void *indices,
+    const IndexAddInfo &info,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto src_ptr = reinterpret_cast<const T *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // Fetch the geometry
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+    float alpha = info.alpha();
+
+    // Total number of source elements
+    size_t num_source = outer_size * index_len * inner_size;
+
+    // --- Vectorization parameters ---
+    // Goal: each thread reads 128 bits (16 bytes) of source data
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_source % PackSize == 0) &&
+                         is_aligned(source, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized-load kernel ===
+        size_t num_packs = num_source / PackSize;
+
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::index_add::cuda::index_add_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_packs, alpha);
+    } else {
+        // === Path B: scalar kernel ===
+        size_t block_size = 256;
+        size_t grid_size = (num_source + block_size - 1) / block_size;
+
+        op::index_add::cuda::index_add_kernel<T, TIdx>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_source, alpha);
+    }
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc,
+    float alpha) {
+
+    // Build the Info
+    auto info_result = IndexAddInfo::create(out_desc, in_desc, dim, index_desc, source_desc, alpha);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    #define LAUNCH_BY_SIZE(T_STORAGE)                                                \
+        switch (idx_dtype) {                                                         \
+        case INFINI_DTYPE_I32:                                                       \
+            launch_kernel<T_STORAGE, int32_t>(output, source, index, _info, stream); \
+            break;                                                                   \
+        case INFINI_DTYPE_I64:                                                       \
+            launch_kernel<T_STORAGE, int64_t>(output, source, index, _info, stream); \
+            break;                                                                   \
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;                              \
+        }
+
+    switch (dtype) {
+    // 32-bit float
+    case INFINI_DTYPE_F32:
+        LAUNCH_BY_SIZE(float);
+        break;
+    // 64-bit float
+    case INFINI_DTYPE_F64:
+        LAUNCH_BY_SIZE(double);
+        break;
+    // 16-bit half (fp16) -> uses __half
+    case INFINI_DTYPE_F16:
+        LAUNCH_BY_SIZE(__half);
+        break;
+    // 16-bit bfloat16 (bf16) -> uses __nv_bfloat16
+    case INFINI_DTYPE_BF16:
+        LAUNCH_BY_SIZE(__nv_bfloat16);
+        break;
+    // Integers
+    case INFINI_DTYPE_I32:
+        LAUNCH_BY_SIZE(int32_t);
+        break;
+    case INFINI_DTYPE_I64:
+        LAUNCH_BY_SIZE(int64_t);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_add::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh
new file mode 100644
index 000000000..6fe6b7ef9
--- /dev/null
+++ b/src/infiniop/ops/index_add/nvidia/index_add_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __INDEX_ADD_NVIDIA_CUH__
+#define __INDEX_ADD_NVIDIA_CUH__
+
+#include "../index_add.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INDEX_ADD_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_add/operator.cc b/src/infiniop/ops/index_add/operator.cc
new file mode 100644
index 000000000..44b228c97
--- /dev/null
+++ b/src/infiniop/ops/index_add/operator.cc
@@ -0,0 +1,190 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/index_add.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/index_add_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/index_add_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/index_add_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/index_add_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateIndexAddDescriptor(
+    infiniopHandle_t handle,
+    infiniopIndexAddDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int64_t dim,
+    infiniopTensorDescriptor_t index,
+    infiniopTensorDescriptor_t source,
+    float alpha) {
+
+    #define CREATE(CASE, NAMESPACE)                            \
+    case CASE:                                                 \
+        return op::index_add::NAMESPACE::Descriptor::create(   \
+            handle,                                            \
+            reinterpret_cast(desc_ptr),                        \
+            output,                                            \
+            input,                                             \
+            dim,                                               \
+            index,                                             \
+            source,                                            \
+            alpha)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetIndexAddWorkspaceSize(infiniopIndexAddDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                   \
+    case CASE:                                                     \
+        *size = reinterpret_cast(desc)->workspaceSize();           \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [Key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopIndexAdd(
+    infiniopIndexAddDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                                    \
+    case CASE:                                                                            \
+        return reinterpret_cast(desc)                                                     \
+            ->calculate(workspace, workspace_size, output, input, index, source, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [Key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyIndexAddDescriptor(infiniopIndexAddDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)  \
+    case CASE:                       \
+        delete reinterpret_cast(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc
new file mode 100644
index 000000000..e70863527
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.cc
@@ -0,0 +1,154 @@
+#include "index_copy_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include
+#include
+#include
+
+namespace op::index_copy::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) { // note: no float alpha parameter
+
+    auto handle = reinterpret_cast(handle_);
+
+    // Build the Info object
+    auto result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // workspace size
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
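The serial loop that follows assigns rather than accumulates, so duplicate indices resolve to the last write, unlike index_add where they sum. A standalone sketch (not part of the patch) contrasting the two semantics:

```cpp
#include <cstdio>

int main() {
    float src[3] = {10, 20, 30};
    int   idx[3] = {1, 1, 3};

    // index_copy: plain assignment, the last write to a duplicate wins
    float out[4] = {0, 0, 0, 0};
    for (int i = 0; i < 3; ++i) out[idx[i]] = src[i];
    std::printf("copy: %g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 20 0 30

    // index_add: duplicates accumulate instead
    float acc[4] = {0, 0, 0, 0};
    for (int i = 0; i < 3; ++i) acc[idx[i]] += src[i];
    std::printf("add:  %g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]); // 0 30 0 30
    return 0;
}
```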
+// ==================================================================
+// Core computation (serial implementation)
+// ==================================================================
+template <typename TData, typename TIdx>
+void calculate_cpu_impl(
+    const IndexCopyInfo &info,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source) {
+
+    // IndexCopy needs neither alpha nor a widened compute type; it is a straight copy
+
+    // 1. Fetch the geometry
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    auto out_ptr = reinterpret_cast<TData *>(output);
+    auto src_ptr = reinterpret_cast<const TData *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(index);
+
+    // -----------------------------------------------------------
+    // Serial loop
+    // -----------------------------------------------------------
+    for (size_t o = 0; o < outer_size; ++o) {
+        for (size_t i = 0; i < index_len; ++i) {
+
+            TIdx idx = idx_ptr[i];
+
+            // Handle negative indices
+            if (idx < 0) idx += static_cast<TIdx>(dim_size);
+
+            // Bounds check
+            if (idx < 0 || static_cast<size_t>(idx) >= dim_size) {
+                continue;
+            }
+
+            // Compute the offsets
+            size_t src_offset = o * index_len * inner_size + i * inner_size;
+            size_t out_offset = o * dim_size * inner_size + static_cast<size_t>(idx) * inner_size;
+
+            // Loop over the inner dimension
+            for (size_t in = 0; in < inner_size; ++in) {
+                // [Core logic]
+                // IndexCopy: output[idx] = source[i]
+                // Plain assignment; no utils::cast widening needed
+                out_ptr[out_offset + in] = src_ptr[src_offset + in];
+            }
+        }
+    }
+}
+
+// ==================================================================
+// Execution (dispatch logic)
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    #define DISPATCH(TDATA, TIDX) \
+        calculate_cpu_impl<TDATA, TIDX>(_info, output, input, index, source); \
+        return INFINI_STATUS_SUCCESS
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int32_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int32_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int32_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int32_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int32_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int32_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F32:
+            DISPATCH(float, int64_t);
+        case INFINI_DTYPE_F64:
+            DISPATCH(double, int64_t);
+        case INFINI_DTYPE_F16:
+            DISPATCH(fp16_t, int64_t);
+        case INFINI_DTYPE_BF16:
+            DISPATCH(bf16_t, int64_t);
+        case INFINI_DTYPE_I32:
+            DISPATCH(int32_t, int64_t);
+        case INFINI_DTYPE_I64:
+            DISPATCH(int64_t, int64_t);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+
+    #undef DISPATCH
+}
+
+} // namespace op::index_copy::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h
new file mode 100644
index 000000000..a13369ed9
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cpu/index_copy_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __INDEX_COPY_CPU_H__
+#define __INDEX_COPY_CPU_H__
+
+#include "../index_copy.h"
+DESCRIPTOR(cpu)
+
+#endif // __INDEX_COPY_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/cuda/kernel.cuh b/src/infiniop/ops/index_copy/cuda/kernel.cuh
new file mode 100644
index 000000000..f022ffdca
--- /dev/null
+++ b/src/infiniop/ops/index_copy/cuda/kernel.cuh
@@ -0,0 +1,136 @@
+#ifndef __INDEX_COPY_CUDA_H__
+#define __INDEX_COPY_CUDA_H__
+
+//#include
+#if defined(__MACA__) || defined(__MACACC__)
+    #include
+    #include
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include
+    #include
+#endif
+
+#include
+
+namespace op::index_copy::cuda {
+
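For reference, each DISPATCH(...) case in the CPU calculate above is a thin wrapper; with the template arguments restored as assumed in this edit, DISPATCH(float, int32_t) expands to:

```cpp
calculate_cpu_impl<float, int32_t>(_info, output, input, index, source);
return INFINI_STATUS_SUCCESS;
```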
+// ==================================================================
+// 1. Aligned vectorized pack
+// ==================================================================
+// Same as in IndexAdd, used for vectorized loads
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 2. Scalar kernel (generic fallback)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size, // product of the dims left of dim
+    size_t inner_size, // product of the dims right of dim
+    size_t dim_size,   // extent of output along dim
+    size_t index_len,  // index length (= source extent along dim)
+    size_t num_source  // total number of source elements
+    // note: float alpha removed
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    // Grid-stride loop over the source tensor
+    for (size_t i = tid; i < num_source; i += stride) {
+        // 1. Convert the linear index i to logical coordinates (outer, idx_idx, inner)
+        //    Source shape: [Outer, IndexLen, Inner]
+        size_t inner_idx = i % inner_size;
+        size_t tmp = i / inner_size;
+        size_t idx_idx = tmp % index_len;
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the index value
+        TIdx target_dim_idx = indices[idx_idx];
+
+        // 3. Handle negative indices (defensive)
+        if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+        // 4. Bounds check and assignment
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Compute the linear output offset
+            // Output shape: [Outer, DimSize, Inner]
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // [Key change]
+            // IndexCopy needs no atomics: plain assignment. If several indices
+            // point at the same slot, the result depends on execution order
+            // (a race), which matches the operator's intended semantics.
+            output[out_offset] = source[i];
+        }
+    }
+}
+
+// ==================================================================
+// 3. Vectorized kernel (optimizes read bandwidth)
+// ==================================================================
+template <typename T, typename TIdx, int PackSize>
+__global__ void index_copy_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ source,
+    const TIdx * __restrict__ indices,
+    size_t outer_size,
+    size_t inner_size,
+    size_t dim_size,
+    size_t index_len,
+    size_t num_packs // number of packs in source
+    // note: float alpha removed
+) {
+    // Reinterpret source as a Pack pointer for vectorized loads
+    using PackType = Pack<T, PackSize>;
+    const PackType *src_vec = reinterpret_cast<const PackType *>(source);
+
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        // Vectorized load (LDG.128)
+        PackType reg_pack = src_vec[i];
+
+        // Starting linear index of this pack within source
+        size_t base_idx = i * PackSize;
+
+        // Unrolled loop over every element in the pack
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            size_t curr_src_idx = base_idx + k;
+
+            // 1. Coordinate transform
+            size_t inner_idx = curr_src_idx % inner_size;
+            size_t tmp = curr_src_idx / inner_size;
+            size_t idx_idx = tmp % index_len;
+            size_t outer_idx = tmp / index_len;
+
+            // 2. Read the index
+            TIdx target_dim_idx = indices[idx_idx];
+
+            if (target_dim_idx < 0) target_dim_idx += static_cast<TIdx>(dim_size);
+
+            // 3. Assignment
+            if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+                size_t out_offset = outer_idx * (dim_size * inner_size)
+                                  + static_cast<size_t>(target_dim_idx) * inner_size
+                                  + inner_idx;
+
+                // [Key change] direct assignment
+                output[out_offset] = reg_pack.val[k];
+            }
+        }
+    }
+}
+
+} // namespace op::index_copy::cuda
+
+#endif // __INDEX_COPY_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/index_copy.h b/src/infiniop/ops/index_copy/index_copy.h
new file mode 100644
index 000000000..e2ef85f01
--- /dev/null
+++ b/src/infiniop/ops/index_copy/index_copy.h
@@ -0,0 +1,51 @@
+#ifndef __INDEX_COPY_H__
+#define __INDEX_COPY_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the IndexCopyInfo definition
+#define DESCRIPTOR(NAMESPACE)                                 \
+                                                              \
+    namespace op::index_copy::NAMESPACE {                     \
+    class Descriptor final : public InfiniopDescriptor {      \
+        struct Opaque;                                        \
+        Opaque *_opaque;                                      \
+        IndexCopyInfo _info;                                  \
+        size_t _workspace_size;                               \
+                                                              \
+        Descriptor(                                           \
+            Opaque *opaque,                                   \
+            IndexCopyInfo info,                               \
+            size_t workspace_size,                            \
+            infiniDevice_t device_type,                       \
+            int device_id)                                    \
+            : InfiniopDescriptor{device_type, device_id},     \
+              _opaque(opaque),                                \
+              _info(info),                                    \
+              _workspace_size(workspace_size) {}              \
+                                                              \
+    public:                                                   \
+        ~Descriptor();                                        \
+                                                              \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                              \
+        static infiniStatus_t create(                         \
+            infiniopHandle_t handle,                          \
+            Descriptor **desc_ptr,                            \
+            infiniopTensorDescriptor_t out_desc,              \
+            infiniopTensorDescriptor_t in_desc,               \
+            int64_t dim,                                      \
+            infiniopTensorDescriptor_t index_desc,            \
+            infiniopTensorDescriptor_t source_desc); /* note: no alpha */ \
+                                                              \
+        infiniStatus_t calculate(                             \
+            void *workspace,                                  \
+            size_t workspace_size,                            \
+            void *output,                                     \
+            const void *input,                                \
+            const void *index,                                \
+            const void *source,                               \
+            void *stream) const;                              \
+    };                                                        \
+    }
+
+#endif // __INDEX_COPY_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/info.h b/src/infiniop/ops/index_copy/info.h
new file mode 100644
index 000000000..3276752af
--- /dev/null
+++ b/src/infiniop/ops/index_copy/info.h
@@ -0,0 +1,120 @@
+#ifndef __INDEX_COPY_INFO_H__
+#define __INDEX_COPY_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::index_copy {
+
+class IndexCopyInfo {
+    IndexCopyInfo() = default;
+
+public:
+    int _dtype;
+    int _idx_dtype;
+    int64_t _dim;
+
+    size_t _outer_size;
+    size_t _inner_size;
+    size_t _dim_size;
+    size_t _index_len;
+
+    IndexCopyInfo(int dtype, int idx_dtype, int64_t dim,
+                  size_t outer_size, size_t inner_size, size_t dim_size, size_t index_len)
+        : _dtype(dtype), _idx_dtype(idx_dtype), _dim(dim),
+          _outer_size(outer_size), _inner_size(inner_size), _dim_size(dim_size), _index_len(index_len) {}
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    int64_t dim() const { return _dim; }
+
+    size_t outer_size() const { return _outer_size; }
+    size_t inner_size() const { return _inner_size; }
+    size_t dim_size() const { return _dim_size; }
+    size_t index_len() const { return _index_len; }
+
+    static utils::Result<IndexCopyInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        int64_t dim,
+        infiniopTensorDescriptor_t index_desc,
+        infiniopTensorDescriptor_t source_desc) {
+
+        int dtype = in_desc->dtype();
+        if (out_desc->dtype() != dtype || source_desc->dtype() != dtype) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        int idx_dtype = index_desc->dtype();
+        if (idx_dtype != INFINI_DTYPE_I32 && idx_dtype != INFINI_DTYPE_I64) {
+            return
INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + int64_t ndim = static_cast(in_desc->ndim()); + if (dim < 0 || dim >= ndim) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (index_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto &in_shape = in_desc->shape(); + + size_t outer_size = 1; + for (int64_t i = 0; i < dim; ++i) { + outer_size *= in_shape[i]; + } + + size_t inner_size = 1; + for (int64_t i = dim + 1; i < ndim; ++i) { + inner_size *= in_shape[i]; + } + + size_t dim_size = in_shape[dim]; + size_t index_len = index_desc->shape()[0]; + + if (source_desc->ndim() != in_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto &src_shape = source_desc->shape(); + + for (int64_t i = 0; i < ndim; ++i) { + if (i == dim) { + if (src_shape[i] != index_len) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } else { + if (src_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (out_desc->ndim() != in_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + const auto &out_shape = out_desc->shape(); + for (int64_t i = 0; i < ndim; ++i) { + if (out_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + return utils::Result(IndexCopyInfo{ + dtype, + idx_dtype, + dim, + outer_size, + inner_size, + dim_size, + index_len + }); + } +}; + +} // namespace op::index_copy + +#endif // __INDEX_COPY_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.h b/src/infiniop/ops/index_copy/metax/index_copy_metax.h new file mode 100644 index 000000000..d42339a9e --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_METAX_H__ +#define __INDEX_COPY_METAX_H__ + +#include "../index_copy.h" + +DESCRIPTOR(metax) + +#endif // __INDEX_COPY_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca new file mode 100644 index 000000000..01704f436 --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca @@ -0,0 +1,159 @@ +#include "index_copy_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include +#if defined(__MACA__) || defined(__MACACC__) + #include + #include +#endif +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" + +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" + +namespace op::index_copy::metax { + +template +__global__ void index_copy_kernel( + T* output, const T* source, const TIdx* indices, + int outer_size, int inner_size, int index_size, int dim_size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_source = outer_size * index_size * inner_size; + if (idx >= total_source) return; + + int inner = idx % inner_size; + int temp = idx / inner_size; + int i = temp % index_size; + int outer = temp / index_size; + + TIdx idx_pos = indices[i]; + if (idx_pos < 0) idx_pos += dim_size; + + if (idx_pos >= 0 && idx_pos < dim_size) { + int out_offset = outer * (dim_size * inner_size) + idx_pos * inner_size + inner; + output[out_offset] = source[idx]; + } +} + +template +void launch_kernel_impl( + void* output, const void* source, const void* indices, + int outer_size, int inner_size, int index_size, int dim_size, void* stream) +{ + auto hc_stream = reinterpret_cast(stream); + size_t total_elements = (size_t)outer_size * index_size * inner_size; + 
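As a concrete example of the geometry IndexCopyInfo::create derives: for input shape [4, 5, 6], dim = 1 and an index of length 3, source must be [4, 3, 6], and the kernels see outer_size = 4, inner_size = 6, dim_size = 5, index_len = 3. A hedged sketch of the same fold (illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
#include <vector>
struct Geometry { size_t outer = 1, inner = 1, dim_size = 0; };
Geometry fold(const std::vector<size_t> &shape, int64_t dim) {
    Geometry g;
    for (int64_t i = 0; i < dim; ++i) g.outer *= shape[i];                          // dims left of `dim`
    g.dim_size = shape[dim];
    for (int64_t i = dim + 1; i < (int64_t)shape.size(); ++i) g.inner *= shape[i];  // dims right of `dim`
    return g;
}
// fold({4, 5, 6}, 1) -> outer = 4, dim_size = 5, inner = 6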
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.h b/src/infiniop/ops/index_copy/metax/index_copy_metax.h new file mode 100644 index 000000000..d42339a9e --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.h @@ -0,0 +1,8 @@
+#ifndef __INDEX_COPY_METAX_H__
+#define __INDEX_COPY_METAX_H__
+
+#include "../index_copy.h"
+
+DESCRIPTOR(metax)
+
+#endif // __INDEX_COPY_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/metax/index_copy_metax.maca b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca new file mode 100644 index 000000000..01704f436 --- /dev/null +++ b/src/infiniop/ops/index_copy/metax/index_copy_metax.maca @@ -0,0 +1,159 @@
+#include "index_copy_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <memory>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+#include <algorithm>
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h> // assumed MACA fp16/bf16 header names
+    #include <maca_bf16.h>
+#endif
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::index_copy::metax {
+
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    T *output, const T *source, const TIdx *indices,
+    int outer_size, int inner_size, int index_size, int dim_size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total_source = outer_size * index_size * inner_size;
+    if (idx >= total_source) return;
+
+    int inner = idx % inner_size;
+    int temp = idx / inner_size;
+    int i = temp % index_size;
+    int outer = temp / index_size;
+
+    TIdx idx_pos = indices[i];
+    if (idx_pos < 0) idx_pos += dim_size;
+
+    if (idx_pos >= 0 && idx_pos < dim_size) {
+        int out_offset = outer * (dim_size * inner_size) + idx_pos * inner_size + inner;
+        output[out_offset] = source[idx];
+    }
+}
+
+template <typename T, typename TIdx>
+void launch_kernel_impl(
+    void *output, const void *source, const void *indices,
+    int outer_size, int inner_size, int index_size, int dim_size, void *stream) {
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    size_t total_elements = (size_t)outer_size * index_size * inner_size;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+    index_copy_kernel<T, TIdx><<<grid_size, block_size, 0, hc_stream>>>(
+        reinterpret_cast<T *>(output), reinterpret_cast<const T *>(source), reinterpret_cast<const TIdx *>(indices),
+        outer_size, inner_size, index_size, dim_size);
+}
+
+static size_t get_element_size(int dtype) {
+    if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8;
+    if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4;
+    return 2;
+}
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    int64_t dim;
+    int outer_size, inner_size, index_size, dim_size;
+    size_t total_bytes;
+};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+// [fix 1] create signature matches the header: dim precedes index_desc
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    // Info creation order is out, in, dim, index, source
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+    if (!info_result) return info_result.status();
+
+    auto out_d = reinterpret_cast<InfiniopTensorDescriptor *>(out_desc);
+    auto idx_d = reinterpret_cast<InfiniopTensorDescriptor *>(index_desc);
+
+    int ndim = out_d->ndim();
+    int64_t real_dim = dim < 0 ? dim + ndim : dim;
+
+    int outer = 1;
+    for (int i = 0; i < real_dim; ++i) outer *= out_d->shape()[i];
+    int inner = 1;
+    for (int i = real_dim + 1; i < ndim; ++i) inner *= out_d->shape()[i];
+    int dim_s = out_d->shape()[real_dim];
+    int idx_s = 1;
+    for (int i = 0; i < (int)idx_d->ndim(); ++i) idx_s *= idx_d->shape()[i];
+
+    size_t bytes = (size_t)outer * dim_s * inner * get_element_size(out_d->dtype());
+
+    auto opaque = new Opaque{handle->internal(), dim, outer, inner, idx_s, dim_s, bytes};
+    *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// [fix 2] calculate signature matches the header: explicit input, index, source parameters
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    hcMemcpyAsync(output, input, _opaque->total_bytes, hcMemcpyDeviceToDevice, hc_stream);
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    int outer = _opaque->outer_size;
+    int inner = _opaque->inner_size;
+    int dim_s = _opaque->dim_size;
+    int idx_sz = _opaque->index_size;
+
+    #define LAUNCH(T, TIdx) launch_kernel_impl<T, TIdx>(output, source, index, outer, inner, idx_sz, dim_s, stream)
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+        case INFINI_DTYPE_F16: LAUNCH(__half, int32_t); break;
+        case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+            LAUNCH(__maca_bfloat16, int32_t);
+#endif
+            break;
+        case INFINI_DTYPE_F32: LAUNCH(float, int32_t); break;
+        case INFINI_DTYPE_F64: LAUNCH(double, int32_t); break;
+        case INFINI_DTYPE_I32: LAUNCH(int32_t, int32_t); break;
+        case INFINI_DTYPE_I64: LAUNCH(int64_t, int32_t); break;
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+        case INFINI_DTYPE_F16: LAUNCH(__half, int64_t); break;
+        case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+            LAUNCH(__maca_bfloat16, int64_t);
+#endif
+            break;
+        case INFINI_DTYPE_F32: LAUNCH(float, int64_t); break;
+        case INFINI_DTYPE_F64: LAUNCH(double, int64_t); break;
+        case INFINI_DTYPE_I32: LAUNCH(int32_t, int64_t); break;
+        case INFINI_DTYPE_I64: LAUNCH(int64_t, int64_t); break;
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; }
+    #undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::index_copy::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore.h b/src/infiniop/ops/index_copy/moore/index_copy_moore.h new file mode 100644 index 000000000..101a9f3f7 --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore.h @@ -0,0 +1,11 @@
+#ifndef __INDEX_COPY_MOORE_API_H__
+#define __INDEX_COPY_MOORE_API_H__
+
+// Pull in the Descriptor macro and base classes defined one level up
+#include "../index_copy.h"
+
+// Uses the DESCRIPTOR macro from index_copy.h, which generates
+// the op::index_copy::moore::Descriptor class definition
+DESCRIPTOR(moore)
+
+#endif // __INDEX_COPY_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore.mu b/src/infiniop/ops/index_copy/moore/index_copy_moore.mu new file mode 100644 index 000000000..cae6c2b94 --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore.mu @@ -0,0 +1,207 @@
+#include "index_copy_moore.h"
+#include "index_copy_moore_kernel.h" // contains the IndexCopyOp functor definition
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::index_copy::moore {
+
+// ==================================================================
+// 1. Kernel Wrapper Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+__global__ void index_copy_kernel(
+    const size_t num_elements, // total number of source elements (thread workload)
+    const size_t index_len,    // length of index
+    const size_t inner_size,   // stride
+    const size_t dim_size,     // length of output along dim
+    const T *source,
+    const TIdx *indices,
+    T *output) {
+
+    // idx is the linear index into the source tensor
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_elements) {
+        // use the functor defined in index_copy_moore_kernel.h
+        IndexCopyOp op;
+        op(idx, index_len, inner_size, dim_size, source, indices, output);
+    }
+}
+
+// ==================================================================
+// 2. Launcher Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+void index_copy_moore_launch(
+    const IndexCopyInfo &info,
+    T *output,
+    const T *input,
+    const T *source,
+    const void *indices, // passed as void*, cast internally
+    void *stream) {
+
+    auto musa_stream = (musaStream_t)stream;
+    const TIdx *indices_ptr = static_cast<const TIdx *>(indices);
+
+    // --------------------------------------------------------------
+    // Step 1: copy input -> output
+    // --------------------------------------------------------------
+    // output is initialized with the values of input
+    size_t total_out_elements = info.outer_size() * info.dim_size() * info.inner_size();
+
+    // only copy when the two pointers differ
+    if (output != input) {
+        musaMemcpyAsync(output, input, total_out_elements * sizeof(T), musaMemcpyDeviceToDevice, musa_stream);
+    }
+
+    // --------------------------------------------------------------
+    // Step 2: scatter (source -> output)
+    // --------------------------------------------------------------
+    // Parallelism is determined by the size of source,
+    // whose logical shape is [Outer, IndexLen, Inner]
+    size_t num_src_elements = info.outer_size() * info.index_len() * info.inner_size();
+
+    if (num_src_elements == 0) {
+        return;
+    }
+
+    int threads = 256;
+    int blocks = (num_src_elements + threads - 1) / threads;
+
+    index_copy_kernel<T, TIdx><<<blocks, threads, 0, musa_stream>>>(
+        num_src_elements,
+        info.index_len(),
+        info.inner_size(),
+        info.dim_size(),
+        source,
+        indices_ptr,
+        output);
+}
+
+// ==================================================================
+// 3. Descriptor Implementation
+// ==================================================================
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // validate shapes and dtypes via the Info class
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc);
+
+    if (!info_result) {
+        return info_result.status();
+    }
+
+    *desc_ptr = new Descriptor(
+        nullptr,
+        *info_result,
+        0, // no workspace needed
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // --------------------------------------------------------------
+    // Dispatch macro: data type x index type
+    // --------------------------------------------------------------
+    #define LAUNCH_KERNEL(T) \
+    do { \
+        if (_info.idx_dtype() == INFINI_DTYPE_I32) { \
+            index_copy_moore_launch<T, int32_t>( \
+                _info, \
+                static_cast<T *>(output), \
+                static_cast<const T *>(input), \
+                static_cast<const T *>(source), \
+                index, \
+                stream); \
+        } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \
+            index_copy_moore_launch<T, int64_t>( \
+                _info, \
+                static_cast<T *>(output), \
+                static_cast<const T *>(input), \
+                static_cast<const T *>(source), \
+                index, \
+                stream); \
+        } else { \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        } \
+    } while (0)
+
+    // --------------------------------------------------------------
+    // Dispatch on data type
+    // --------------------------------------------------------------
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        LAUNCH_KERNEL(half);
+        break;
+
+    case INFINI_DTYPE_BF16:
+        LAUNCH_KERNEL(__mt_bfloat16);
+        break;
+
+    case INFINI_DTYPE_F32:
+        LAUNCH_KERNEL(float);
+        break;
+
+    case INFINI_DTYPE_F64:
+        LAUNCH_KERNEL(double);
+        break;
+
+    case INFINI_DTYPE_I32:
+        LAUNCH_KERNEL(int32_t);
+        break;
+
+    case INFINI_DTYPE_I64:
+        LAUNCH_KERNEL(int64_t);
+        break;
+
+    case INFINI_DTYPE_I8:
+        LAUNCH_KERNEL(int8_t);
+        break;
+
+    case INFINI_DTYPE_U8:
+        LAUNCH_KERNEL(uint8_t);
+        break;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_KERNEL
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_copy::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h b/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h new file mode 100644 index 000000000..223d9196c --- /dev/null +++ b/src/infiniop/ops/index_copy/moore/index_copy_moore_kernel.h @@ -0,0 +1,73 @@
+#ifndef __INDEX_COPY_MOORE_KERNEL_H__
+#define __INDEX_COPY_MOORE_KERNEL_H__
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h> // required for the __mt_bfloat16 definition
+
+#include <type_traits> // for std::is_same_v
+
+namespace op::index_copy::moore {
+
+typedef struct IndexCopyOp {
+public:
+    // IndexCopy involves two types: the data type (T) and the index type (TIdx).
+    // Logic: Output[..., indices[i], ...] = Source[..., i, ...]
+
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,    // linear index of the source element handled by this thread
+
+        // geometry (from IndexCopyInfo)
+        const size_t index_len,   // length of the index vector (= source length along dim)
+        const size_t inner_size,  // stride of the dims right of dim
+        const size_t dim_size,    // length of output along dim (for offsets and bounds checks)
+
+        // pointers
+        const T *source_data,     // source data (input source)
+        const TIdx *indices_data, // index data
+        T *output_data            // output data (in/out, normally pre-initialized from input)
+    ) const {
+
+        // 1. Coordinate mapping (flat index -> multi-dim index).
+        //    Source logical shape: [Outer, IndexLen, Inner];
+        //    decompose curr_idx into (outer, idx_in_indices, inner)
+        size_t inner_idx = curr_idx % inner_size;
+        size_t tmp = curr_idx / inner_size;
+        size_t idx_in_indices = tmp % index_len; // which entry of the indices tensor is being processed
+        size_t outer_idx = tmp / index_len;
+
+        // 2. Read the source value
+        T src_val = source_data[curr_idx];
+
+        // 3. Fetch the target index
+        TIdx target_dim_idx = indices_data[idx_in_indices];
+
+        // 4. Handle negative indices (Python style: e.g. -1 means the last element)
+        if (target_dim_idx < 0) {
+            target_dim_idx += static_cast<TIdx>(dim_size);
+        }
+
+        // 5. Bounds check and assignment
+        if (target_dim_idx >= 0 && target_dim_idx < static_cast<TIdx>(dim_size)) {
+            // Linear offset into output.
+            // Output logical shape: [Outer, DimSize, Inner];
+            // offset = outer part + dim part + inner part
+            size_t out_offset = outer_idx * (dim_size * inner_size)
+                              + static_cast<size_t>(target_dim_idx) * inner_size
+                              + inner_idx;
+
+            // Plain store (scatter).
+            // On the MUSA platform half/bf16/float all support operator=.
+            // With duplicate indices the result is non-deterministic (last write wins),
+            // so no atomics are needed.
+            output_data[out_offset] = src_val;
+        }
+        // Out-of-range indices are silently ignored, as IndexCopy conventionally does.
+    }
+
+} IndexCopyOp;
+
+} // namespace op::index_copy::moore
+
+#endif // __INDEX_COPY_MOORE_KERNEL_H__
\ No newline at end of file
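Step 4's wrap-around mirrors Python-style indexing: with dim_size = 5, an index of -1 lands on row 4, while -6 remains out of bounds and is skipped. A tiny sketch of the same normalization (illustrative, not part of the patch):

#include <cstdint>
inline int64_t normalize(int64_t idx, int64_t dim_size) {
    if (idx < 0) idx += dim_size;                    // -1 -> dim_size - 1
    return (idx >= 0 && idx < dim_size) ? idx : -1;  // -1 here signals "skip"
}
// normalize(-1, 5) == 4; normalize(-6, 5) == -1 (out of bounds, ignored)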
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu new file mode 100644 index 000000000..27f574697 --- /dev/null +++ b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cu @@ -0,0 +1,179 @@
+#include "index_copy_nvidia.cuh"
+#include "../cuda/kernel.cuh" // assumed path of the shared kernel header (aka index_copy_cuda.h)
+#include "../../../handle.h"
+#include <cstdint>
+
+// [key] CUDA floating-point type definitions
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+namespace op::index_copy::nvidia {
+
+// ==================================================================
+// Helper: check pointer alignment
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *source,
+    const void *indices,
+    const IndexCopyInfo &info,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto src_ptr = reinterpret_cast<const T *>(source);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // geometry (no alpha)
+    size_t outer_size = info.outer_size();
+    size_t inner_size = info.inner_size();
+    size_t dim_size = info.dim_size();
+    size_t index_len = info.index_len();
+
+    // total number of source elements
+    size_t num_source = outer_size * index_len * inner_size;
+
+    // --- Vectorization setup ---
+    // Goal: each thread loads 128 bits (16 bytes) of source data.
+    // IndexCopy reads source contiguously, which suits vectorized loads.
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+
+    // Eligibility for the vectorized path:
+    // 1. PackSize > 1
+    // 2. the source element count divides evenly by PackSize (no tail handling)
+    // 3. the source pointer is aligned (required for vectorized loads)
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_source % PackSize == 0) &&
+                         is_aligned<T>(source, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized-load kernel ===
+        size_t num_packs = num_source / PackSize;
+
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::index_copy::cuda::index_copy_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_packs
+                // note: alpha removed
+            );
+    } else {
+        // === Path B: scalar kernel ===
+        size_t block_size = 256;
+        size_t grid_size = (num_source + block_size - 1) / block_size;
+
+        op::index_copy::cuda::index_copy_kernel<T, TIdx>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, src_ptr, idx_ptr,
+                outer_size, inner_size, dim_size, index_len,
+                num_source
+                // note: alpha removed
+            );
+    }
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    int64_t dim,
+    infiniopTensorDescriptor_t index_desc,
+    infiniopTensorDescriptor_t source_desc) { // note: the alpha parameter was removed
+
+    // build the Info
+    auto info_result = IndexCopyInfo::create(out_desc, in_desc, dim, index_desc, source_desc); // no alpha
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    // Macro: instantiate launch_kernel for T_STORAGE,
+    // which will be float, double, int32_t, __half, __nv_bfloat16
+    #define LAUNCH_BY_SIZE(T_STORAGE) \
+    switch (idx_dtype) { \
+    case INFINI_DTYPE_I32: \
+        launch_kernel<T_STORAGE, int32_t>(output, source, index, _info, stream); \
+        break; \
+    case INFINI_DTYPE_I64: \
+        launch_kernel<T_STORAGE, int64_t>(output, source, index, _info, stream); \
+        break; \
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+    }
+
+    switch (dtype) {
+    // 32-bit float
+    case INFINI_DTYPE_F32:
+        LAUNCH_BY_SIZE(float);
+        break;
+    // 64-bit float
+    case INFINI_DTYPE_F64:
+        LAUNCH_BY_SIZE(double);
+        break;
+    // 16-bit half (fp16) -> __half
+    case INFINI_DTYPE_F16:
+        LAUNCH_BY_SIZE(__half);
+        break;
+    // 16-bit bfloat16 (bf16) -> __nv_bfloat16
+    case INFINI_DTYPE_BF16:
+        LAUNCH_BY_SIZE(__nv_bfloat16);
+        break;
+    // integers
+    case INFINI_DTYPE_I32:
+        LAUNCH_BY_SIZE(int32_t);
+        break;
+    case INFINI_DTYPE_I64:
+        LAUNCH_BY_SIZE(int64_t);
+        break;
+
+    // add cases here if more integer types (I8, U8, ...) are needed
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::index_copy::nvidia
\ No newline at end of file
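To make the eligibility rule concrete: with T = float a pack holds 4 elements, so num_source = 1024 and a 16-byte-aligned source take path A with 256 packs; half-precision tensors pack 8 elements. The same check in isolation (illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
template <typename T>
bool can_use_128bit_packs(const void *src, size_t num_source) {
    constexpr size_t kPack = 16 / sizeof(T);                // elements per 128-bit pack
    return kPack > 1 && num_source % kPack == 0             // no tail to handle
        && reinterpret_cast<uintptr_t>(src) % 16 == 0;      // aligned loads required
}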
diff --git a/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh new file mode 100644 index 000000000..296737294 --- /dev/null +++ b/src/infiniop/ops/index_copy/nvidia/index_copy_nvidia.cuh @@ -0,0 +1,8 @@
+#ifndef __INDEX_COPY_NVIDIA_CUH__
+#define __INDEX_COPY_NVIDIA_CUH__
+
+#include "../index_copy.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INDEX_COPY_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/index_copy/operator.cc b/src/infiniop/ops/index_copy/operator.cc new file mode 100644 index 000000000..213189cf7 --- /dev/null +++ b/src/infiniop/ops/index_copy/operator.cc @@ -0,0 +1,189 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/index_copy.h"
+
+// --- backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/index_copy_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/index_copy_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/index_copy_metax.h"
+#endif
+
+// [new] Moore backend header
+#ifdef ENABLE_MOORE_API
+#include "moore/index_copy_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateIndexCopyDescriptor(
+    infiniopHandle_t handle,
+    infiniopIndexCopyDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int64_t dim,
+    infiniopTensorDescriptor_t index,
+    infiniopTensorDescriptor_t source) {
+
+    #define CREATE(CASE, NAMESPACE) \
+    case CASE: \
+        return op::index_copy::NAMESPACE::Descriptor::create( \
+            handle, \
+            reinterpret_cast<op::index_copy::NAMESPACE::Descriptor **>(desc_ptr), \
+            output, \
+            input, \
+            dim, \
+            index, \
+            source)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetIndexCopyWorkspaceSize(infiniopIndexCopyDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE) \
+    case CASE: \
+        *size = reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Execute (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopIndexCopy(
+    infiniopIndexCopyDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *index,
+    const void *source,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, index, source, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyIndexCopyDescriptor(infiniopIndexCopyDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::index_copy::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [key fix] enable the Moore branch
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
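A hedged sketch of the full call sequence this C API enables (illustrative; the handle, tensor descriptors, device buffers and stream are assumed created elsewhere, and CHECK is a hypothetical status-checking macro defined here for the sketch):

#include "infiniop.h"
#define CHECK(call) do { if ((call) != INFINI_STATUS_SUCCESS) return; } while (0)
void run_index_copy(infiniopHandle_t handle,
                    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t in_desc,
                    infiniopTensorDescriptor_t index_desc, infiniopTensorDescriptor_t source_desc,
                    void *d_out, const void *d_in, const void *d_index, const void *d_source,
                    void *stream) {
    infiniopIndexCopyDescriptor_t desc;
    CHECK(infiniopCreateIndexCopyDescriptor(handle, &desc, out_desc, in_desc, /*dim=*/1, index_desc, source_desc));
    size_t ws_size = 0;
    CHECK(infiniopGetIndexCopyWorkspaceSize(desc, &ws_size)); // 0 for all the backends above
    CHECK(infiniopIndexCopy(desc, /*workspace=*/nullptr, ws_size, d_out, d_in, d_index, d_source, stream));
    CHECK(infiniopDestroyIndexCopyDescriptor(desc));
}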
diff --git a/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc new file mode 100644 index 000000000..c8ccced6f --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.cc @@ -0,0 +1,148 @@
+#include "smooth_l1_loss_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::smooth_l1_loss::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc,
+    float beta,
+    int reduction) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Core computation
+// ==================================================================
+template <typename T>
+void calculate_cpu_impl(
+    const SmoothL1LossInfo &info,
+    void *output,
+    const void *input,
+    const void *target) {
+
+    size_t numel = info.numel();
+    float beta = info.beta();
+    int reduction = info.reduction();
+
+    float inv_beta = (beta > 0) ? (1.0f / beta) : 0.0f;
+    float half_beta = 0.5f * beta;
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto tar_ptr = reinterpret_cast<const T *>(target);
+
+    // ----------------------------------------------------
+    // Mode A: elementwise (none)
+    // ----------------------------------------------------
+    if (reduction == 0) {
+        #pragma omp parallel for schedule(static)
+        for (size_t i = 0; i < numel; ++i) {
+            float in_val = utils::cast<float>(in_ptr[i]);
+            float tar_val = utils::cast<float>(tar_ptr[i]);
+
+            float diff = std::abs(in_val - tar_val);
+            float loss;
+
+            if (diff < beta) {
+                loss = 0.5f * diff * diff * inv_beta;
+            } else {
+                loss = diff - half_beta;
+            }
+
+            // [key] compute in float, then cast back to T for storage
+            out_ptr[i] = utils::cast<T>(loss);
+        }
+    }
+    // ----------------------------------------------------
+    // Mode B: reduction (mean / sum)
+    // ----------------------------------------------------
+    else {
+        double total_sum = 0.0;
+
+        #pragma omp parallel for reduction(+:total_sum) schedule(static)
+        for (size_t i = 0; i < numel; ++i) {
+            float in_val = utils::cast<float>(in_ptr[i]);
+            float tar_val = utils::cast<float>(tar_ptr[i]);
+
+            float diff = std::abs(in_val - tar_val);
+            float loss;
+
+            if (diff < beta) {
+                loss = 0.5f * diff * diff * inv_beta;
+            } else {
+                loss = diff - half_beta;
+            }
+
+            total_sum += static_cast<double>(loss);
+        }
+
+        if (reduction == 1) { // mean
+            total_sum /= static_cast<double>(numel);
+        }
+
+        // the reduced result is written to position 0
+        out_ptr[0] = utils::cast<T>(static_cast<float>(total_sum));
+    }
+}
+
+// ==================================================================
+// Dispatch
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    #define DISPATCH_TYPE(T) \
+        calculate_cpu_impl<T>(_info, output, input, target); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        DISPATCH_TYPE(float);
+    case INFINI_DTYPE_F64:
+        DISPATCH_TYPE(double);
+    case INFINI_DTYPE_F16:
+        DISPATCH_TYPE(fp16_t);
+    case INFINI_DTYPE_BF16:
+        DISPATCH_TYPE(bf16_t);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    #undef DISPATCH_TYPE
+}
+
+} // namespace op::smooth_l1_loss::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h new file mode 100644 index 000000000..ee113c858 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cpu/smooth_l1_loss_cpu.h @@ -0,0 +1,9 @@
+#ifndef __SMOOTH_L1_LOSS_CPU_H__
+#define __SMOOTH_L1_LOSS_CPU_H__
+
+#include "../smooth_l1_loss.h"
+
+// Instantiate the op::smooth_l1_loss::cpu::Descriptor declaration via the macro
+DESCRIPTOR(cpu)
+
+#endif // __SMOOTH_L1_LOSS_CPU_H__
\ No newline at end of file
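Numerically, with beta = 1: |diff| = 0.5 falls in the quadratic zone and yields 0.5 * 0.25 = 0.125, while |diff| = 2 is in the linear zone and yields 2 - 0.5 = 1.5. A self-contained check of the piecewise formula (illustrative, not part of the patch):

#include <cassert>
#include <cmath>
static float smooth_l1(float x, float y, float beta) {
    float d = std::fabs(x - y);
    return d < beta ? 0.5f * d * d / beta : d - 0.5f * beta;
}
int main() {
    assert(std::fabs(smooth_l1(0.5f, 0.0f, 1.0f) - 0.125f) < 1e-6f); // quadratic zone
    assert(std::fabs(smooth_l1(2.0f, 0.0f, 1.0f) - 1.5f)   < 1e-6f); // linear zone
}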
diff --git a/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh new file mode 100644 index 000000000..29d8d0acf --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/cuda/kernel.cuh @@ -0,0 +1,148 @@
+#ifndef __SMOOTH_L1_LOSS_CUDA_CUH__
+#define __SMOOTH_L1_LOSS_CUDA_CUH__
+
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h> // assumed MACA header names
+    #include <maca_bf16.h>
+#else
+    #include <cuda_runtime.h>
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstddef>
+
+namespace op::smooth_l1_loss::cuda {
+
+// ==================================================================
+// Basic definitions
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// Reduction helpers (warp & block)
+// ==================================================================
+__device__ __forceinline__ float warpReduceSum(float val) {
+    unsigned int mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+        val += __shfl_down_sync(mask, val, offset);
+    return val;
+}
+
+__device__ __forceinline__ float blockReduceSum(float val) {
+    static __shared__ float shared[32]; // max 1024 threads / 32 warps
+    int lane = threadIdx.x % warpSize;
+    int wid = threadIdx.x / warpSize;
+
+    val = warpReduceSum(val);
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f;
+    if (wid == 0) val = warpReduceSum(val);
+    return val;
+}
+
+// ==================================================================
+// Functor
+// ==================================================================
+struct SmoothL1LossFunctor {
+    float beta;
+    float inv_beta;
+    float half_beta;
+
+    __host__ __device__ SmoothL1LossFunctor(float beta_val)
+        : beta(beta_val), inv_beta(1.0f / beta_val), half_beta(0.5f * beta_val) {}
+
+    template <typename T>
+    __device__ __forceinline__ float compute(const T &input, const T &target) const {
+        float in_f = static_cast<float>(input);
+        float tg_f = static_cast<float>(target);
+        float diff = in_f - tg_f;
+        float abs_diff = fabsf(diff);
+
+        if (abs_diff < beta) {
+            return 0.5f * diff * diff * inv_beta;
+        } else {
+            return abs_diff - half_beta;
+        }
+    }
+};
+
+// ==================================================================
+// 1. Elementwise kernels (reduction='none')
+// ==================================================================
+template <typename T>
+__global__ void smooth_l1_loss_kernel(
+    T * __restrict__ output, const T * __restrict__ input, const T * __restrict__ target,
+    size_t numel, SmoothL1LossFunctor functor) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        output[idx] = static_cast<T>(functor.compute(input[idx], target[idx]));
+    }
+}
+
+template <typename T, int PackSize>
+__global__ void smooth_l1_loss_kernel_vectorized(
+    T * __restrict__ output, const T * __restrict__ input, const T * __restrict__ target,
+    size_t num_packs, SmoothL1LossFunctor functor) {
+    using PackType = Pack<T, PackSize>;
+    auto out_vec = reinterpret_cast<PackType *>(output);
+    auto in_vec = reinterpret_cast<const PackType *>(input);
+    auto tar_vec = reinterpret_cast<const PackType *>(target);
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_packs) {
+        PackType in_pack = in_vec[idx];
+        PackType tar_pack = tar_vec[idx];
+        PackType out_pack;
+        #pragma unroll
+        for (int i = 0; i < PackSize; ++i) {
+            out_pack.val[i] = static_cast<T>(functor.compute(in_pack.val[i], tar_pack.val[i]));
+        }
+        out_vec[idx] = out_pack;
+    }
+}
+
+// ==================================================================
+// 2. Reduction kernel (reduction='mean' / 'sum')
+// ==================================================================
+// Simple atomicAdd-based global reduction
+template <typename T>
+__global__ void smooth_l1_loss_reduce_kernel(
+    float *output, // accumulate in float to avoid overflow / precision loss
+    const T * __restrict__ input,
+    const T * __restrict__ target,
+    size_t numel,
+    SmoothL1LossFunctor functor,
+    float scale // pass 1/N in mean mode, 1.0 in sum mode
+) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+    float local_sum = 0.0f;
+
+    // grid-stride loop
+    for (size_t i = idx; i < numel; i += stride) {
+        local_sum += functor.compute(input[i], target[i]);
+    }
+
+    // block reduction
+    float block_sum = blockReduceSum(local_sum);
+
+    // global atomic add
+    if (threadIdx.x == 0) {
+        atomicAdd(output, block_sum * scale);
+    }
+}
+
+// Helper kernel: cast the float result back to T and write it to output
+template <typename T>
+__global__ void cast_float_to_t(T *output, const float *src) {
+    *output = static_cast<T>(*src);
+}
+
+} // namespace op::smooth_l1_loss::cuda
+
+#endif // __SMOOTH_L1_LOSS_CUDA_CUH__
\ No newline at end of file
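A minimal standalone demo of the same warp-shuffle + atomicAdd reduction pattern (illustrative sketch, not part of the patch; assumes blockDim.x is a multiple of warpSize, as with 256 above):

#include <cuda_runtime.h>
#include <cstdio>
__device__ float warp_sum(float v) {
    for (int o = warpSize / 2; o > 0; o /= 2) v += __shfl_down_sync(0xffffffff, v, o);
    return v;
}
__global__ void sum_kernel(const float *x, float *out, size_t n) {
    __shared__ float smem[32];
    float v = 0.f;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)blockDim.x * gridDim.x) v += x[i];
    v = warp_sum(v);                                               // intra-warp tree
    if (threadIdx.x % warpSize == 0) smem[threadIdx.x / warpSize] = v;
    __syncthreads();
    if (threadIdx.x < warpSize) {                                  // warp 0 folds the partials
        v = (threadIdx.x < blockDim.x / warpSize) ? smem[threadIdx.x] : 0.f;
        v = warp_sum(v);
        if (threadIdx.x == 0) atomicAdd(out, v);                   // one atomic per block
    }
}
int main() {
    const size_t n = 1 << 20;
    float *x, *out;
    cudaMallocManaged(&x, n * sizeof(float));
    cudaMallocManaged(&out, sizeof(float));
    for (size_t i = 0; i < n; ++i) x[i] = 1.0f;
    *out = 0.0f;
    sum_kernel<<<256, 256>>>(x, out, n);
    cudaDeviceSynchronize();
    std::printf("sum = %.0f (expect %zu)\n", *out, n);
    cudaFree(x); cudaFree(out);
}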
diff --git a/src/infiniop/ops/smooth_l1_loss/info.h b/src/infiniop/ops/smooth_l1_loss/info.h new file mode 100644 index 000000000..475dd7a95 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/info.h @@ -0,0 +1,93 @@
+#ifndef __SMOOTH_L1_LOSS_INFO_H__
+#define __SMOOTH_L1_LOSS_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::smooth_l1_loss {
+
+class SmoothL1LossInfo {
+    SmoothL1LossInfo() = default;
+
+public:
+    int _dtype;     // data type (float, half, etc.)
+    size_t _numel;  // number of elements in the computation (input.numel())
+    float _beta;    // smoothing threshold
+    int _reduction; // reduction mode (0: none, 1: mean, 2: sum)
+
+    SmoothL1LossInfo(int dtype, size_t numel, float beta, int reduction)
+        : _dtype(dtype), _numel(numel), _beta(beta), _reduction(reduction) {}
+
+    int dtype() const { return _dtype; }
+    size_t numel() const { return _numel; }
+    float beta() const { return _beta; }
+    int reduction() const { return _reduction; }
+
+    static utils::Result<SmoothL1LossInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        infiniopTensorDescriptor_t target_desc,
+        float beta,
+        int reduction) {
+
+        // 1. Input and target must share a dtype
+        if (input_desc->dtype() != target_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Output dtype must match input dtype
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Input and target shapes must match exactly
+        //    (SmoothL1Loss is elementwise over input/target)
+        if (input_desc->ndim() != target_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &in_shape = input_desc->shape();
+        const auto &tar_shape = target_desc->shape();
+        size_t numel = input_desc->numel();
+
+        for (size_t i = 0; i < input_desc->ndim(); ++i) {
+            if (in_shape[i] != tar_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 4. Output shape must agree with the reduction mode
+        //    (reduction enum: 0 = none, 1 = mean, 2 = sum)
+        if (reduction == 0) {
+            // none -> output shape must equal input shape
+            if (out_desc->ndim() != input_desc->ndim()) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+            const auto &out_shape = out_desc->shape();
+            for (size_t i = 0; i < input_desc->ndim(); ++i) {
+                if (out_shape[i] != in_shape[i]) {
+                    return INFINI_STATUS_BAD_TENSOR_SHAPE;
+                }
+            }
+        } else {
+            // mean/sum -> the output is a scalar,
+            // which may be ndim = 0 or simply numel = 1
+            if (out_desc->numel() != 1) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 5. Validate beta (must be non-negative)
+        if (beta < 0) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+        return utils::Result<SmoothL1LossInfo>(SmoothL1LossInfo{
+            input_desc->dtype(), // _dtype
+            numel,               // _numel
+            beta,                // _beta
+            reduction            // _reduction
+        });
+    }
+};
+
+} // namespace op::smooth_l1_loss
+
+#endif // __SMOOTH_L1_LOSS_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h new file mode 100644 index 000000000..cf731d377 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.h @@ -0,0 +1,8 @@
+#ifndef __SMOOTH_L1_LOSS_METAX_H__
+#define __SMOOTH_L1_LOSS_METAX_H__
+
+#include "../smooth_l1_loss.h"
+
+DESCRIPTOR(metax)
+
+#endif // __SMOOTH_L1_LOSS_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca new file mode 100644 index 000000000..a1ba08bfe --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/metax/smooth_l1_loss_metax.maca @@ -0,0 +1,207 @@
+#include "smooth_l1_loss_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <memory>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+#include <algorithm>
+
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+namespace op::smooth_l1_loss::metax {
+
+// ==================================================================
+// Atomic Helpers
+// ==================================================================
+
+template <typename T>
+__device__ __forceinline__ void gpuAtomicAdd(T *address, T val) { atomicAdd(address, val); }
+// Float atomic add is usually native, but explicitly defined just in case
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(float *address, float val) { atomicAdd(address, val); }
+
+// ==================================================================
+// Stride Helper
+// ==================================================================
+struct TensorShape {
+    int ndim;
+    int dims[4];
+    int strides[4];
+};
+
+__device__ inline size_t get_offset(int idx, const TensorShape &shape) {
+    if (shape.ndim == 0) return 0;
+    size_t offset = 0;
+    int rem = idx;
+    #pragma unroll
+    for (int i = shape.ndim - 1; i >= 0; --i) {
+        int dim_sz = shape.dims[i];
+        int pos = rem % dim_sz;
+        rem /= dim_sz;
+        offset += pos * shape.strides[i];
+    }
+    return offset;
+}
+
+// ==================================================================
+// Conversion Kernel
+// ==================================================================
+// Cast the float accumulator back to the target type T and write it to output
+template <typename T>
+__global__ void cast_float_to_output(T *dest, const float *src) {
+    *dest = static_cast<T>(*src);
+}
+
+// ==================================================================
+// Main Kernel
+// ==================================================================
+
+// AccT: accumulator type; forced to float in reduction mode to preserve precision.
+template <typename T, typename AccT, int ReductionMode>
+__global__ void smooth_l1_loss_kernel(
+    AccT *output, // float* (workspace) when reducing, T* (output) in none mode
+    const T *input,
+    const T *target,
+    size_t n,
+    float beta,
+    float scale,
+    TensorShape in_s, TensorShape tg_s, TensorShape out_s) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    size_t in_off = get_offset(idx, in_s);
+    size_t tg_off = get_offset(idx, tg_s);
+
+    // compute entirely in float for accuracy
+    float val_in = static_cast<float>(input[in_off]);
+    float val_tg = static_cast<float>(target[tg_off]);
+    float diff = val_in - val_tg;
+    float abs_diff = fabsf(diff);
+    float loss = (abs_diff < beta) ? (0.5f * diff * diff / beta) : (abs_diff - 0.5f * beta);
+
+    if constexpr (ReductionMode == 0) { // none
+        size_t out_off = get_offset(idx, out_s);
+        output[out_off] = static_cast<AccT>(loss);
+    } else { // sum or mean
+        loss *= scale;
+        // atomic accumulation in AccT (float)
+        gpuAtomicAdd(output, static_cast<AccT>(loss));
+    }
+}
+
+// ==================================================================
+// Launcher
+// ==================================================================
+
+template <typename T>
+void launch_kernel_impl(
+    void *output, void *workspace,
+    const void *input, const void *target,
+    size_t n, float beta, int reduction,
+    const TensorShape &in_s, const TensorShape &tg_s, const TensorShape &out_s,
+    void *stream) {
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+    size_t grid = (n + 255) / 256;
+    float scale = (reduction == 1) ? (1.0f / static_cast<float>(n)) : 1.0f;
+
+    if (reduction == 0) { // none
+        // write straight to output (T*)
+        smooth_l1_loss_kernel<T, T, 0><<<grid, 256, 0, mc_stream>>>(
+            (T *)output, (const T *)input, (const T *)target, n, beta, scale, in_s, tg_s, out_s);
+    } else { // reduction
+        // 1. accumulate into the float workspace to avoid fp16 precision loss
+        smooth_l1_loss_kernel<T, float, 1><<<grid, 256, 0, mc_stream>>>(
+            (float *)workspace, (const T *)input, (const T *)target, n, beta, scale, in_s, tg_s, out_s);
+
+        // 2. cast the float result from the workspace into output (T*)
+        cast_float_to_output<<<1, 1, 0, mc_stream>>>((T *)output, (const float *)workspace);
+    }
+}
+
+// ==================================================================
+// Descriptor
+// ==================================================================
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    float beta;
+    int reduction;
+    size_t total_elements;
+    TensorShape in_shape;
+    TensorShape tg_shape;
+    TensorShape out_shape;
+};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+static TensorShape extract_shape(infiniopTensorDescriptor_t desc) {
+    auto d = reinterpret_cast<InfiniopTensorDescriptor *>(desc);
+    TensorShape s;
+    s.ndim = d->ndim();
+    if (s.ndim > 4) s.ndim = 4;
+    for (int i = 0; i < s.ndim; ++i) {
+        s.dims[i] = d->shape()[i];
+        s.strides[i] = d->strides()[i];
+    }
+    if (d->ndim() == 0) { s.ndim = 1; s.dims[0] = 1; s.strides[0] = 1; }
+    return s;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc, float beta, int reduction) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    auto in_d = reinterpret_cast<InfiniopTensorDescriptor *>(input_desc);
+    size_t total = 1;
+    for (int i = 0; i < (int)in_d->ndim(); ++i) total *= in_d->shape()[i];
+
+    auto opaque = new Opaque{
+        handle->internal(), beta, reduction, total,
+        extract_shape(input_desc), extract_shape(target_desc), extract_shape(out_desc)};
+
+    // [key] reduction mode needs a 4-byte workspace to hold the float accumulator
+    size_t ws_size = (reduction != 0) ? sizeof(float) : 0;
+
+    *desc_ptr = new Descriptor(opaque, info_result.take(), ws_size, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size, void *output,
+    const void *input, const void *target, void *stream) const {
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    if (_opaque->reduction != 0) {
+        // reduction mode: zero the float workspace (not output)
+        // so that accumulation starts from 0.0f
+        mcMemsetAsync(workspace, 0, sizeof(float), mc_stream);
+    }
+
+    size_t n = _opaque->total_elements;
+    float beta = _opaque->beta;
+    int reduction = _opaque->reduction;
+
+    #define LAUNCH(T) launch_kernel_impl<T>(output, workspace, input, target, n, beta, reduction, _opaque->in_shape, _opaque->tg_shape, _opaque->out_shape, stream)
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16: LAUNCH(__half); break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        LAUNCH(__maca_bfloat16);
+#endif
+        break;
+    case INFINI_DTYPE_F32: LAUNCH(float); break;
+    case INFINI_DTYPE_F64: LAUNCH(double); break;
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    #undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::smooth_l1_loss::metax
\ No newline at end of file
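The float workspace exists because accumulating a long sum in a narrow type drifts; the effect is far larger for fp16, hence the float accumulator plus a single final cast. A host-side analogy using float vs. double (illustrative, not part of the patch):

#include <cstdio>
int main() {
    float  f = 0.0f;
    double d = 0.0;
    for (int i = 0; i < 10000000; ++i) { f += 0.1f; d += 0.1; }
    std::printf("float sum = %.1f, double sum = %.1f\n", f, d); // the float sum drifts visibly
}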
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h new file mode 100644 index 000000000..c7358e518 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.h @@ -0,0 +1,11 @@
+#ifndef __SMOOTH_L1_LOSS_MOORE_API_H__
+#define __SMOOTH_L1_LOSS_MOORE_API_H__
+
+// Pull in the Descriptor macro and base classes defined one level up
+#include "../smooth_l1_loss.h"
+
+// Uses the DESCRIPTOR macro from smooth_l1_loss.h, which generates
+// the op::smooth_l1_loss::moore::Descriptor class definition
+DESCRIPTOR(moore)
+
+#endif // __SMOOTH_L1_LOSS_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu new file mode 100644 index 000000000..5b7dbcb0a --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore.mu @@ -0,0 +1,104 @@
+#include "smooth_l1_loss_moore.h"
+#include "smooth_l1_loss_moore_kernel.h"
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::smooth_l1_loss::moore {
+
+template <typename T>
+void smooth_l1_loss_moore_launch(
+    const SmoothL1LossInfo &info,
+    T *output,
+    const T *input,
+    const T *target,
+    void *stream) {
+
+    auto musa_stream = (musaStream_t)stream;
+    size_t numel = info.numel();
+    int reduction = info.reduction(); // 0: none, 1: mean, 2: sum
+
+    int threads = 256;
+    // Calculate blocks
+    int blocks = (numel + threads - 1) / threads;
+    // Cap blocks to avoid overhead for huge tensors (standard practice)
+    if (blocks > 1024) blocks = 1024;
+
+    if (reduction == 0) {
+        // --- None (elementwise) ---
+        // Just use simple grid mapping
+        int simple_blocks = (numel + threads - 1) / threads;
+        smooth_l1_loss_elementwise_kernel<<<simple_blocks, threads, 0, musa_stream>>>(
+            numel, info.beta(), input, target, output);
+    } else {
+        // --- Mean / Sum (reduction) ---
+
+        // 1. Zero out output (atomic accumulation destination)
+        musaMemsetAsync(output, 0, sizeof(T), musa_stream);
+
+        // 2. Launch reduction kernel.
+        //    Shared memory size: threads * sizeof(float), because we accumulate in float32
+        size_t smem_size = threads * sizeof(float);
+
+        smooth_l1_loss_reduce_kernel<<<blocks, threads, smem_size, musa_stream>>>(
+            numel, info.beta(), input, target, output);
+
+        // 3. Post-processing for mean
+        if (reduction == 1) {
+            avg_scaling_kernel<<<1, 1, 0, musa_stream>>>(output, numel);
+        }
+    }
+}
+
+// ... (Descriptor implementation remains unchanged) ...
+// Ensure Descriptor::~Descriptor(), create, calculate are still there as before.
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t target_desc,
+    float beta,
+    int reduction) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(nullptr, *info_result, 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        smooth_l1_loss_moore_launch<half>(_info, static_cast<half *>(output), static_cast<const half *>(input), static_cast<const half *>(target), stream); break;
+    case INFINI_DTYPE_BF16:
+        smooth_l1_loss_moore_launch<__mt_bfloat16>(_info, static_cast<__mt_bfloat16 *>(output), static_cast<const __mt_bfloat16 *>(input), static_cast<const __mt_bfloat16 *>(target), stream); break;
+    case INFINI_DTYPE_F32:
+        smooth_l1_loss_moore_launch<float>(_info, static_cast<float *>(output), static_cast<const float *>(input), static_cast<const float *>(target), stream); break;
+    case INFINI_DTYPE_F64:
+        smooth_l1_loss_moore_launch<double>(_info, static_cast<double *>(output), static_cast<const double *>(input), static_cast<const double *>(target), stream); break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::smooth_l1_loss::moore
\ No newline at end of file
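Note a design difference from the NVIDIA path below: here block partial sums are cast to T and atomically accumulated directly into output, so fp16/bf16 reductions round once per block rather than once at the end. The 1024-block cap bounds atomic traffic while the grid-stride loop still covers every element; the per-thread workload is easy to estimate (illustrative arithmetic, not part of the patch):

#include <cstdio>
int main() {
    size_t numel = 10000000, threads = 256, blocks = 1024;
    size_t per_thread = (numel + threads * blocks - 1) / (threads * blocks);
    std::printf("each thread folds ~%zu elements\n", per_thread); // ~39 for 1e7 elements
}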
diff --git a/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h new file mode 100644 index 000000000..3209e2c59 --- /dev/null +++ b/src/infiniop/ops/smooth_l1_loss/moore/smooth_l1_loss_moore_kernel.h @@ -0,0 +1,182 @@
+#ifndef __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
+#define __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
+
+#include <musa_runtime.h> // assumed MUSA header names
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cstdint>
+#include <type_traits> // for std::is_same_v
+
+namespace op::smooth_l1_loss::moore {
+
+// ==================================================================
+// 1. Type Converter & Atomic Add (keep existing helpers)
+// ==================================================================
+template <typename T>
+union TypeConverter;
+
+template <>
+union TypeConverter<half> {
+    half val;
+    unsigned short bits;
+};
+
+template <>
+union TypeConverter<__mt_bfloat16> {
+    __mt_bfloat16 val;
+    unsigned short bits;
+};
+
+template <typename T>
+__device__ __forceinline__ void atomic_add_func(T *address, T val) {
+    atomicAdd(address, val);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func(half *address, half val) {
+    unsigned short *address_as_us = reinterpret_cast<unsigned short *>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+    do {
+        assumed = old;
+        TypeConverter<half> old_converter; old_converter.bits = assumed;
+        float sum_f = __half2float(old_converter.val) + __half2float(val);
+        TypeConverter<half> new_converter; new_converter.val = __float2half(sum_f);
+        old = atomicCAS(address_as_us, assumed, new_converter.bits);
+    } while (assumed != old);
+}
+
+template <>
+__device__ __forceinline__ void atomic_add_func<__mt_bfloat16>(__mt_bfloat16 *address, __mt_bfloat16 val) {
+    unsigned short *address_as_us = reinterpret_cast<unsigned short *>(address);
+    unsigned short old = *address_as_us;
+    unsigned short assumed;
+    do {
+        assumed = old;
+        TypeConverter<__mt_bfloat16> old_converter; old_converter.bits = assumed;
+        float sum_f = __bfloat162float(old_converter.val) + __bfloat162float(val);
+        TypeConverter<__mt_bfloat16> new_converter; new_converter.val = __float2bfloat16(sum_f);
+        old = atomicCAS(address_as_us, assumed, new_converter.bits);
+    } while (assumed != old);
+}
+
+// ==================================================================
+// 2. Math Functor
+// ==================================================================
+struct SmoothL1LossMath {
+    template <typename T>
+    __device__ __forceinline__ float operator()(T x_val, T y_val, float beta) const {
+        float x_f, y_f;
+        if constexpr (std::is_same_v<T, half>) {
+            x_f = __half2float(x_val); y_f = __half2float(y_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            x_f = __bfloat162float(x_val); y_f = __bfloat162float(y_val);
+        } else {
+            x_f = static_cast<float>(x_val); y_f = static_cast<float>(y_val);
+        }
+
+        float diff = x_f - y_f;
+        float abs_diff = ::fabsf(diff);
+        if (abs_diff < beta) {
+            return 0.5f * diff * diff / beta;
+        } else {
+            return abs_diff - 0.5f * beta;
+        }
+    }
+};
+
+// ==================================================================
+// 3. Optimized Kernels
+// ==================================================================
+
+// ------------------------------------------------------------------
+// Kernel A: Elementwise (No Reduction)
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void smooth_l1_loss_elementwise_kernel(
+    const size_t numel,
+    const float beta,
+    const T *input,
+    const T *target,
+    T *output) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        float loss_f = SmoothL1LossMath()(input[idx], target[idx], beta);
+
+        if constexpr (std::is_same_v<T, half>) output[idx] = __float2half(loss_f);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) output[idx] = __float2bfloat16(loss_f);
+        else output[idx] = static_cast<T>(loss_f);
+    }
+}
+
+// ------------------------------------------------------------------
+// Kernel B: Block Reduction (For Mean/Sum) - High Precision
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void smooth_l1_loss_reduce_kernel(
+    const size_t numel,
+    const float beta,
+    const T *input,
+    const T *target,
+    T *output) {
+
+    // 1. Thread-local accumulation (in float32)
+    float thread_sum = 0.0f;
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = idx; i < numel; i += stride) {
+        thread_sum += SmoothL1LossMath()(input[i], target[i], beta);
+    }
+
+    // 2. Shared-memory reduction.
+    //    Declare dynamic shared memory (size determined at launch)
+    extern __shared__ float sdata[];
+    unsigned int tid = threadIdx.x;
+    sdata[tid] = thread_sum;
+    __syncthreads();
+
+    // Standard tree reduction
+    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sdata[tid] += sdata[tid + s];
+        }
+        __syncthreads();
+    }
+
+    // 3. Block leader writes to global memory
+    if (tid == 0) {
+        float block_sum = sdata[0];
+        // Convert to T only ONCE per block
+        T val_to_add;
+        if constexpr (std::is_same_v<T, half>) val_to_add = __float2half(block_sum);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) val_to_add = __float2bfloat16(block_sum);
+        else val_to_add = static_cast<T>(block_sum);
+
+        atomic_add_func(output, val_to_add);
+    }
+}
+
+// ------------------------------------------------------------------
+// Kernel C: Mean Scaling
+// ------------------------------------------------------------------
+template <typename T>
+__global__ void avg_scaling_kernel(T *output, size_t numel) {
+    if (threadIdx.x == 0) {
+        float sum_f;
+        if constexpr (std::is_same_v<T, half>) sum_f = __half2float(output[0]);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) sum_f = __bfloat162float(output[0]);
+        else sum_f = static_cast<float>(output[0]);
+
+        float mean_f = sum_f / static_cast<float>(numel);
+
+        if constexpr (std::is_same_v<T, half>) output[0] = __float2half(mean_f);
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) output[0] = __float2bfloat16(mean_f);
+        else output[0] = static_cast<T>(mean_f);
+    }
+}
+
+} // namespace op::smooth_l1_loss::moore
+
+#endif // __SMOOTH_L1_LOSS_MOORE_KERNEL_H__
\ No newline at end of file
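The 16-bit atomic adds above emulate read-modify-write with a compare-and-swap loop, since the hardware only exposes atomicCAS at that width. The same idiom on the host, for reference (illustrative sketch with std::atomic, not part of the patch):

#include <atomic>
inline void atomic_add_cas(std::atomic<float> &a, float v) {
    float old = a.load();
    // compare_exchange_weak refreshes `old` on failure, so we retry
    // against the freshly observed value until the swap succeeds
    while (!a.compare_exchange_weak(old, old + v)) {
    }
}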
diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu
new file mode 100644
index 000000000..5a5fd9dfe
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cu
@@ -0,0 +1,143 @@
+#include "smooth_l1_loss_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <algorithm>
+#include <cstdint>
+
+namespace op::smooth_l1_loss::nvidia {
+
+// ==================================================================
+// Helpers
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output, const void *input, const void *target, void *workspace,
+    size_t numel, float beta, int reduction,
+    void *stream) {
+
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto tar_ptr = reinterpret_cast<const T *>(target);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+    op::smooth_l1_loss::cuda::SmoothL1LossFunctor functor(beta);
+
+    // ------------------------------------------
+    // Mode 1: Elementwise (None)
+    // ------------------------------------------
+    if (reduction == 0) {
+        auto out_ptr = reinterpret_cast<T *>(output);
+        constexpr int TotalBytes = 16;
+        constexpr int PackSize = TotalBytes / sizeof(T);
+        bool can_vectorize = (PackSize > 1) && (numel % PackSize == 0) &&
+                             is_aligned<T>(output, TotalBytes) &&
+                             is_aligned<T>(input, TotalBytes) &&
+                             is_aligned<T>(target, TotalBytes);
+
+        if (can_vectorize) {
+            size_t num_packs = numel / PackSize;
+            size_t block_size = 256;
+            size_t grid_size = (num_packs + block_size - 1) / block_size;
+            op::smooth_l1_loss::cuda::smooth_l1_loss_kernel_vectorized<T, PackSize>
+                <<<grid_size, block_size, 0, cuda_stream>>>(out_ptr, in_ptr, tar_ptr, num_packs, functor);
+        } else {
+            size_t block_size = 256;
+            size_t grid_size = (numel + block_size - 1) / block_size;
+            op::smooth_l1_loss::cuda::smooth_l1_loss_kernel
+                <<<grid_size, block_size, 0, cuda_stream>>>(out_ptr, in_ptr, tar_ptr, numel, functor);
+        }
+    }
+    // ------------------------------------------
+    // Mode 2: Reduction (Mean / Sum)
+    // ------------------------------------------
+    else {
+        // Use the workspace as a temporary float accumulator
+        // (higher precision, and convenient for atomicAdd)
+        float *acc_ptr = reinterpret_cast<float *>(workspace);
+
+        // 1. Zero the accumulator
+        cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream);
+
+        // 2. Launch the reduction kernel
+        float scale = (reduction == 1) ? (1.0f / numel) : 1.0f; // 1 = mean, 2 = sum
+        size_t block_size = 256;
+        // Cap the grid size so not too many blocks contend on the atomicAdd
+        size_t grid_size = std::min((numel + block_size - 1) / block_size, static_cast<size_t>(1024));
+
+        op::smooth_l1_loss::cuda::smooth_l1_loss_reduce_kernel
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                acc_ptr, in_ptr, tar_ptr, numel, functor, scale
+            );
+
+        // 3. Convert the result in the float workspace back to the target dtype
+        //    and write it to output (the output has exactly one element)
+        op::smooth_l1_loss::cuda::cast_float_to_t
+            <<<1, 1, 0, cuda_stream>>>(reinterpret_cast<T *>(output), acc_ptr);
+    }
+}
+
+// ==================================================================
+// Descriptor
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() { if (_opaque) delete _opaque; }
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t target_desc,
+    float beta, int reduction) {
+
+    auto info_result = SmoothL1LossInfo::create(out_desc, input_desc, target_desc, beta, reduction);
+    if (!info_result) return info_result.status();
+
+    // [Key] In reduction mode we need a 4-byte workspace to hold the float intermediate result
+    size_t workspace_size = 0;
+    if (reduction != 0) {
+        workspace_size = sizeof(float);
+    }
+
+    *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size, void *output,
+    const void *input, const void *target, void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto numel = _info.numel();
+    float beta = _info.beta();
+    int reduction = _info.reduction();
+
+    // Check that the workspace is large enough
+    if (reduction != 0 && workspace_size < sizeof(float)) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__nv_bfloat16>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, target, workspace, numel, beta, reduction, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::smooth_l1_loss::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh
new file mode 100644
index 000000000..cd6492105
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/nvidia/smooth_l1_loss_nvidia.cuh
@@ -0,0 +1,9 @@
+#ifndef __SMOOTH_L1_LOSS_NVIDIA_CUH__
+#define __SMOOTH_L1_LOSS_NVIDIA_CUH__
+
+#include "../smooth_l1_loss.h"
+
+// The macro instantiates the op::smooth_l1_loss::nvidia::Descriptor class declaration
+DESCRIPTOR(nvidia)
+
+#endif // __SMOOTH_L1_LOSS_NVIDIA_CUH__
\ No newline at end of file
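Together with the public header, the expected call sequence for this operator's C API is roughly the following. This is a sketch: `handle`, the tensor descriptors, the device buffers and `stream` are assumed to exist, and `device_malloc` is a placeholder allocator, not a real API.

    // Hedged usage sketch of the smooth_l1_loss C API wired up in operator.cc below.
    infiniopSmoothL1LossDescriptor_t desc = nullptr;
    infiniopCreateSmoothL1LossDescriptor(handle, &desc, out_desc, in_desc, tgt_desc,
                                         /*beta=*/1.0f, /*reduction=*/1); // 0=none, 1=mean, 2=sum
    size_t ws_size = 0;
    infiniopGetSmoothL1LossWorkspaceSize(desc, &ws_size);   // 4 bytes whenever reduction != 0
    void *ws = ws_size ? device_malloc(ws_size) : nullptr;  // device_malloc: placeholder
    infiniopSmoothL1Loss(desc, ws, ws_size, output, input, target, stream);
    infiniopDestroySmoothL1LossDescriptor(desc);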
+#include "../../handle.h" +#include "infiniop/ops/smooth_l1_loss.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/smooth_l1_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/smooth_l1_loss_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/smooth_l1_loss_metax.h" +#endif + +// 【新增】Moore 后端头文件 +#ifdef ENABLE_MOORE_API +#include "moore/smooth_l1_loss_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateSmoothL1LossDescriptor( + infiniopHandle_t handle, + infiniopSmoothL1LossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + float beta, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::smooth_l1_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + beta, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + // 【新增】Moore 分支 + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetSmoothL1LossWorkspaceSize(infiniopSmoothL1LossDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + // 【新增】Moore 分支 + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopSmoothL1Loss(
+    infiniopSmoothL1LossDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::smooth_l1_loss::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, target, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [New] Moore branch
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroySmoothL1LossDescriptor(infiniopSmoothL1LossDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::smooth_l1_loss::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    // [New] Moore branch
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h b/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h
new file mode 100644
index 000000000..85296a1d1
--- /dev/null
+++ b/src/infiniop/ops/smooth_l1_loss/smooth_l1_loss.h
@@ -0,0 +1,51 @@
+#ifndef __SMOOTH_L1_LOSS_H__
+#define __SMOOTH_L1_LOSS_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the SmoothL1LossInfo definition generated above
+
+// Macro: generates the Descriptor class for each backend namespace
+#define DESCRIPTOR(NAMESPACE) \
+    namespace op::smooth_l1_loss::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        SmoothL1LossInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            Opaque *opaque, \
+            SmoothL1LossInfo info, \
+            size_t workspace_size, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t input_desc, \
+            infiniopTensorDescriptor_t target_desc, \
+            float beta, \
+            int reduction); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *output, \
+            const void *input, \
+            const void *target, \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __SMOOTH_L1_LOSS_H__
\ No newline at end of file
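For reference, the scalar semantics all of these backends implement are the conventional smooth L1 definition; the functor bodies live in the per-backend kernel headers and are not shown in this patch, so the sketch below is an assumption based on the `torch.nn.functional.smooth_l1_loss` formula the tests compare against.

    // Reference semantics (assumed to match torch.nn.functional.smooth_l1_loss):
    #include <cmath>
    inline float smooth_l1_ref(float x, float y, float beta) {
        float d = std::fabs(x - y);
        return d < beta ? 0.5f * d * d / beta // quadratic near zero
                        : d - 0.5f * beta;    // linear tail
    }
    // reduction: 0 keeps the per-element losses, 1 averages over numel, 2 sums.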
diff --git a/src/infiniop/ops/take/cpu/take_cpu.cc b/src/infiniop/ops/take/cpu/take_cpu.cc
new file mode 100644
index 000000000..27bd2b44f
--- /dev/null
+++ b/src/infiniop/ops/take/cpu/take_cpu.cc
@@ -0,0 +1,138 @@
+#include "take_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cstdint>
+#include <type_traits>
+
+namespace op::take::cpu {
+
+Descriptor::~Descriptor() = default;
+
+// ==================================================================
+// Create the descriptor
+// ==================================================================
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    auto result = TakeInfo::create(out_desc, in_desc, indices_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // Workspace Size
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Core computation (template implementation)
+// ==================================================================
+template <typename T, typename TIdx>
+void calculate_cpu_impl(
+    const TakeInfo &info,
+    void *output,
+    const void *input,
+    const void *indices) {
+
+    size_t num_out = info.num_out();
+    size_t num_in = info.num_in();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+
+    // Parallelize with OpenMP
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < num_out; ++i) {
+        TIdx idx = idx_ptr[i];
+
+        // Bounds check
+        if (idx >= 0 && static_cast<size_t>(idx) < num_in) {
+            out_ptr[i] = in_ptr[idx];
+        } else {
+            // Out of range: fill with 0
+            if constexpr (std::is_arithmetic_v<T>) {
+                // Standard types (float, int, ...) cast directly
+                out_ptr[i] = static_cast<T>(0);
+            } else {
+                out_ptr[i] = utils::cast<T>(0.0f);
+            }
+        }
+    }
+}
+
+// ==================================================================
+// Execution (dispatch logic)
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    // Helper macro: dispatch on idx_dtype
+    #define DISPATCH_IDX(TDATA) \
+        switch (idx_dtype) { \
+        case INFINI_DTYPE_I32: \
+            cpu::calculate_cpu_impl<TDATA, int32_t>(_info, output, input, indices); \
+            return INFINI_STATUS_SUCCESS; \
+        case INFINI_DTYPE_I64: \
+            cpu::calculate_cpu_impl<TDATA, int64_t>(_info, output, input, indices); \
+            return INFINI_STATUS_SUCCESS; \
+        default: \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        }
+
+    // Main switch: dispatch on dtype
+    switch (dtype) {
+    // Floating-point types
+    case INFINI_DTYPE_F16:
+        DISPATCH_IDX(fp16_t);
+    case INFINI_DTYPE_BF16:
+        DISPATCH_IDX(bf16_t);
+    case INFINI_DTYPE_F32:
+        DISPATCH_IDX(float);
+    case INFINI_DTYPE_F64:
+        DISPATCH_IDX(double);
+
+    // Integer types
+    case INFINI_DTYPE_I8:
+        DISPATCH_IDX(int8_t);
+    case INFINI_DTYPE_U8:
+        DISPATCH_IDX(uint8_t);
+    case INFINI_DTYPE_I16:
+        DISPATCH_IDX(int16_t);
+    case INFINI_DTYPE_U16:
+        DISPATCH_IDX(uint16_t);
+    case INFINI_DTYPE_I32:
+        DISPATCH_IDX(int32_t);
+    case INFINI_DTYPE_U32:
+        DISPATCH_IDX(uint32_t);
+    case INFINI_DTYPE_I64:
+        DISPATCH_IDX(int64_t);
+    case INFINI_DTYPE_U64:
+        DISPATCH_IDX(uint64_t);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef DISPATCH_IDX
+}
+
+} // namespace op::take::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/take/cpu/take_cpu.h b/src/infiniop/ops/take/cpu/take_cpu.h
new file mode 100644
index 000000000..e87b3975e
--- /dev/null
+++ b/src/infiniop/ops/take/cpu/take_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __TAKE_CPU_H__
+#define __TAKE_CPU_H__
+
+#include "../take.h"
+DESCRIPTOR(cpu)
+
+#endif // __TAKE_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/cuda/kernel.cuh b/src/infiniop/ops/take/cuda/kernel.cuh
new file mode 100644
index 000000000..eb0e50f15
--- /dev/null
+++ b/src/infiniop/ops/take/cuda/kernel.cuh
@@ -0,0 +1,93 @@
+#ifndef __TAKE_CUDA_H__
+#define __TAKE_CUDA_H__
+
+#include <cuda_runtime.h>
+
+namespace op::take::cuda {
+
+// ==================================================================
+// 1. Vectorized data pack (aligned)
+// ==================================================================
+template <typename T, int N>
+struct alignas(sizeof(T) * N) Pack {
+    T val[N];
+};
+
+// ==================================================================
+// 2. Scalar kernel (handles unaligned data or leftover tail elements)
+// ==================================================================
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    T * __restrict__ output,
+    const T * __restrict__ input,
+    const TIdx * __restrict__ indices,
+    size_t num_out,
+    size_t num_in
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_out; i += stride) {
+        TIdx idx = __ldg(&indices[i]);
+
+        // Scalar load
+        if (idx >= 0 && idx < static_cast<TIdx>(num_in)) {
+            output[i] = input[idx];
+        } else {
+            output[i] = static_cast<T>(0);
+        }
+    }
+}
+
+// ==================================================================
+// 3. Vectorized kernel (optimized)
+// ==================================================================
+/**
+ * @tparam PackSize elements handled per thread (target: a full 128-bit pack, e.g. float x 4)
+ */
+template <typename T, typename TIdx, int PackSize>
+__global__ void take_kernel_vectorized(
+    T * __restrict__ output,
+    const T * __restrict__ input,
+    const TIdx * __restrict__ indices,
+    size_t num_packs, // number of packs to process (num_out / PackSize)
+    size_t num_in
+) {
+    // Reinterpret output as a Pack pointer for vectorized stores
+    using PackType = Pack<T, PackSize>;
+    PackType *out_vec = reinterpret_cast<PackType *>(output);
+
+    // Grid-stride loop over packs
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = tid; i < num_packs; i += stride) {
+        PackType reg_pack;              // register staging buffer
+        size_t base_idx = i * PackSize; // starting output index of this pack
+
+        // Loop unrolling: the key optimization here.
+        // The compiler unrolls this loop into PackSize independent load
+        // instructions, using ILP (instruction-level parallelism) to hide
+        // the latency of the random input reads.
+        #pragma unroll
+        for (int k = 0; k < PackSize; ++k) {
+            // Load the index (indices are contiguous, so L1 hit rate is high).
+            // Note: TIdx may differ in size from T, hence the separate load.
+            TIdx gather_idx = indices[base_idx + k];
+
+            // Random read (gather)
+            if (gather_idx >= 0 && gather_idx < static_cast<TIdx>(num_in)) {
+                reg_pack.val[k] = input[gather_idx];
+            } else {
+                reg_pack.val[k] = static_cast<T>(0);
+            }
+        }
+
+        // Vectorized store (a single STG.128 instruction)
+        out_vec[i] = reg_pack;
+    }
+}
+
+} // namespace op::take::cuda
+
+#endif // __TAKE_CUDA_H__
\ No newline at end of file
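The `alignas(sizeof(T) * N)` on `Pack` is what lets the vectorized kernel's single assignment compile to one 128-bit transaction. An illustrative compile-time sanity check (not part of the patch):

    // Pack<float, 4> has the size and alignment of float4, so
    // `out_vec[i] = reg_pack` above is a single 16-byte store.
    static_assert(sizeof(op::take::cuda::Pack<float, 4>) == 16, "unexpected pack size");
    static_assert(alignof(op::take::cuda::Pack<float, 4>) == 16, "unexpected pack alignment");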
diff --git a/src/infiniop/ops/take/info.h b/src/infiniop/ops/take/info.h
new file mode 100644
index 000000000..ff649be04
--- /dev/null
+++ b/src/infiniop/ops/take/info.h
@@ -0,0 +1,69 @@
+#ifndef __TAKE_INFO_H__
+#define __TAKE_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::take {
+
+class TakeInfo {
+    TakeInfo() = default;
+
+public:
+    int _dtype;      // data type (float, half, etc.)
+    int _idx_dtype;  // index type (int32, int64)
+    size_t _num_out; // total number of output elements (== indices.numel())
+    size_t _num_in;  // total number of input elements (for bounds checking)
+
+    int dtype() const { return _dtype; }
+    int idx_dtype() const { return _idx_dtype; }
+    size_t num_out() const { return _num_out; }
+    size_t num_in() const { return _num_in; }
+
+    static utils::Result<TakeInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc,
+        infiniopTensorDescriptor_t indices_desc) {
+
+        // 1. Check dtype consistency (output vs input)
+        if (out_desc->dtype() != in_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Check the index dtype (indices)
+        int idx_type = indices_desc->dtype();
+        if (idx_type != INFINI_DTYPE_I32 && idx_type != INFINI_DTYPE_I64) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 3. Check shape consistency (output vs indices)
+        if (out_desc->ndim() != indices_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        const auto &out_shape = out_desc->shape();
+        const auto &idx_shape = indices_desc->shape();
+
+        for (size_t i = 0; i < out_desc->ndim(); ++i) {
+            if (out_shape[i] != idx_shape[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // 4. Extract the info
+        int dtype = in_desc->dtype();
+        size_t num_out = out_desc->numel();
+        size_t num_in = in_desc->numel();
+        return utils::Result<TakeInfo>(TakeInfo{
+            dtype,
+            idx_type,
+            num_out,
+            num_in
+        });
+    }
+};
+
+} // namespace op::take
+
+#endif // __TAKE_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/metax/take_metax.h b/src/infiniop/ops/take/metax/take_metax.h
new file mode 100644
index 000000000..e879d3dde
--- /dev/null
+++ b/src/infiniop/ops/take/metax/take_metax.h
@@ -0,0 +1,8 @@
+#ifndef __TAKE_METAX_H__
+#define __TAKE_METAX_H__
+
+#include "../take.h"
+
+DESCRIPTOR(metax)
+
+#endif // __TAKE_METAX_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/metax/take_metax.maca b/src/infiniop/ops/take/metax/take_metax.maca
new file mode 100644
index 000000000..955047ebd
--- /dev/null
+++ b/src/infiniop/ops/take/metax/take_metax.maca
@@ -0,0 +1,264 @@
+#include "take_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+
+namespace op::take::metax {
+
+// ==================================================================
+// Kernel
+// ==================================================================
+
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    T* output,
+    const T* input,
+    const TIdx* indices,
+    int outer_size,
+    int inner_size,
+    int indices_size,
+    int input_dim_size)
+{
+    // Flattened take logic
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= static_cast<size_t>(indices_size)) return;
+
+    TIdx idx_val = indices[idx];
+
+    if (idx_val < 0) {
+        idx_val += input_dim_size;
+    }
+
+    if (idx_val >= 0 && idx_val < input_dim_size) {
+        output[idx] = input[idx_val];
+    } else {
+        output[idx] = T(0);
+    }
+}
+
+// ==================================================================
+// Kernel Launcher
+// ==================================================================
+
+template <typename T, typename TIdx>
+void launch_kernel_impl(
+    void* output,
+    const void* input,
+    const void* indices,
+    int outer_size,
+    int inner_size,
+    int indices_size,
+    int input_dim_size,
+    void* stream)
+{
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+
+    size_t total_elements =
+        static_cast<size_t>(outer_size) * indices_size * inner_size;
+
+    size_t block_size = 256;
+    size_t grid_size =
+        (total_elements + block_size - 1) / block_size;
+
+    take_kernel<T, TIdx>
+        <<<grid_size, block_size, 0, hc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            reinterpret_cast<const TIdx *>(indices),
+            outer_size,
+            inner_size,
+            indices_size,
+            input_dim_size);
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+    int outer_size;
+    int inner_size;
+    int indices_size;
+    int input_dim_size;
+    size_t total_bytes;
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+static size_t get_element_size(int dtype) {
+    if (dtype == INFINI_DTYPE_F64 || dtype == INFINI_DTYPE_I64) return 8;
+    if (dtype == INFINI_DTYPE_F32 || dtype == INFINI_DTYPE_I32) return 4;
+    return 2;
+}
+
+// ==================================================================
+// Descriptor::create
+// ==================================================================
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc)
+{
+    auto handle =
+        reinterpret_cast<device::metax::Handle *>(handle_);
+
+    auto info_result =
+        TakeInfo::create(out_desc, in_desc, indices_desc);
+    if (!info_result) {
+        return info_result.status();
+    }
+
+    auto out_d = out_desc;
+    auto in_d = in_desc;
+    auto idx_d = indices_desc;
+
+    int input_dim = 1;
+    for (size_t i = 0; i < in_d->ndim(); ++i) {
+        input_dim *= in_d->shape()[i];
+    }
+
+    int indices_numel = 1;
+    for (size_t i = 0; i < idx_d->ndim(); ++i) {
+        indices_numel *= idx_d->shape()[i];
+    }
+
+    int outer = 1;
+    int inner = 1;
+
+    size_t bytes =
+        static_cast<size_t>(indices_numel) *
+        get_element_size(out_d->dtype());
+
+    auto opaque = new Opaque{
+        handle->internal(),
+        outer,
+        inner,
+        indices_numel,
+        input_dim,
+        bytes
+    };
+
+    *desc_ptr = new Descriptor(
+        opaque,
+        info_result.take(),
+        0,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Descriptor::calculate
+// ==================================================================
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const
+{
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+
+    int outer = _opaque->outer_size;
+    int inner = _opaque->inner_size;
+    int input_dim = _opaque->input_dim_size;
+    int idx_sz = _opaque->indices_size;
+
+#define LAUNCH(T, TIdx) \
+    launch_kernel_impl<T, TIdx>( \
+        output, input, indices, \
+        outer, inner, idx_sz, input_dim, stream)
+
+    if (idx_dtype == INFINI_DTYPE_I32) {
+        switch (dtype) {
+
+        case INFINI_DTYPE_F16:
+            LAUNCH(__half, int32_t);
+            break;
+
+        case INFINI_DTYPE_BF16:
+            LAUNCH(__maca_bfloat16, int32_t);
+            break;
+
+        case INFINI_DTYPE_F32:
+            LAUNCH(float, int32_t);
+            break;
+
+        case INFINI_DTYPE_F64:
+            LAUNCH(double, int32_t);
+            break;
+
+        case INFINI_DTYPE_I32:
+            LAUNCH(int32_t, int32_t);
+            break;
+
+        case INFINI_DTYPE_I64:
+            LAUNCH(int64_t, int32_t);
+            break;
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+    } else if (idx_dtype == INFINI_DTYPE_I64) {
+        switch (dtype) {
+
+        case INFINI_DTYPE_F16:
+            LAUNCH(__half, int64_t);
+            break;
+
+        case INFINI_DTYPE_BF16:
+            LAUNCH(__maca_bfloat16, int64_t);
+            break;
+
+        case INFINI_DTYPE_F32:
+            LAUNCH(float, int64_t);
+            break;
+
+        case INFINI_DTYPE_F64:
+            LAUNCH(double, int64_t);
+            break;
+
+        case INFINI_DTYPE_I32:
+            LAUNCH(int32_t, int64_t);
+            break;
+
+        case INFINI_DTYPE_I64:
+            LAUNCH(int64_t, int64_t);
+            break;
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+#undef LAUNCH
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::metax
diff --git a/src/infiniop/ops/take/moore/take_moore.h b/src/infiniop/ops/take/moore/take_moore.h
new file mode 100644
index 000000000..3a231d8af
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore.h
@@ -0,0 +1,6 @@
+#ifndef __TAKE_MOORE_API_H__
+#define __TAKE_MOORE_API_H__
+#include "../take.h"
+DESCRIPTOR(moore)
+
+#endif // __TAKE_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/take/moore/take_moore.mu b/src/infiniop/ops/take/moore/take_moore.mu
new file mode 100644
index 000000000..10517b431
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore.mu
@@ -0,0 +1,174 @@
+#include "take_moore.h"
+#include "take_moore_kernel.h" // contains the TakeOp struct definition
+
+#include <musa_runtime.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::take::moore {
+
+// ==================================================================
+// 1. Kernel Wrapper Implementation
+// ==================================================================
+
+// A global kernel wrapper that invokes the TakeOp functor from take_moore_kernel.h
+template <typename T, typename TIdx>
+__global__ void take_kernel(
+    const size_t num_out,  // total number of output elements
+    const size_t num_in,   // total number of input elements (for bounds checking)
+    const T *input,        // input data
+    const TIdx *indices,   // index data
+    T *output) {           // output data
+
+    // idx is the linear index into both the output and the indices tensor
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_out) {
+        // Use the functor defined in take_moore_kernel.h
+        TakeOp op;
+        op(idx, num_in, input, indices, output);
+    }
+}
+
+// ==================================================================
+// 2. Launcher Implementation
+// ==================================================================
+
+template <typename T, typename TIdx>
+void take_moore_launch(
+    const TakeInfo &info,
+    T *output,
+    const T *input,
+    const void *indices, // indices arrives as void* and must be cast to TIdx*
+    void *stream) {
+
+    size_t num_out = info.num_out();
+    size_t num_in = info.num_in();
+
+    // Cast the index pointer
+    const TIdx *indices_ptr = static_cast<const TIdx *>(indices);
+
+    int threads = 256;
+    // size_t avoids overflow, but the grid dimension must be an int.
+    // For very large num_out a stride loop would be needed; here we assume
+    // the count stays within the grid limit.
+    int blocks = (num_out + threads - 1) / threads;
+
+    take_kernel<<<blocks, threads, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+        num_out,
+        num_in,
+        input,
+        indices_ptr,
+        output
+    );
+}
+
+// ==================================================================
+// 3. Descriptor Implementation
+// ==================================================================
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // Validate shapes and dtypes via TakeInfo::create
+    auto info_result = TakeInfo::create(out_desc, in_desc, indices_desc);
+
+    if (!info_result) {
+        return info_result.status();
+    }
+    *desc_ptr = new Descriptor(
+        nullptr,
+        *info_result, // unwrap the Result to get the Info object
+        0,            // take needs no workspace
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    // 1. Basic checks
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 2. Dispatch macro: resolve the data type first, then the index type.
+    //    T is the data type (float, half, etc.)
+    #define LAUNCH_TAKE_KERNEL(T) \
+        do { \
+            if (_info.idx_dtype() == INFINI_DTYPE_I32) { \
+                take_moore_launch<T, int32_t>(_info, \
+                    static_cast<T *>(output), \
+                    static_cast<const T *>(input), \
+                    indices, stream); \
+            } else if (_info.idx_dtype() == INFINI_DTYPE_I64) { \
+                take_moore_launch<T, int64_t>(_info, \
+                    static_cast<T *>(output), \
+                    static_cast<const T *>(input), \
+                    indices, stream); \
+            } else { \
+                return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+            } \
+        } while (0)
+
+    // 3. Dispatch on the data type
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        LAUNCH_TAKE_KERNEL(half);
+        break;
+
+    case INFINI_DTYPE_BF16:
+        LAUNCH_TAKE_KERNEL(__mt_bfloat16);
+        break;
+
+    case INFINI_DTYPE_F32:
+        LAUNCH_TAKE_KERNEL(float);
+        break;
+
+    case INFINI_DTYPE_F64:
+        LAUNCH_TAKE_KERNEL(double);
+        break;
+
+    case INFINI_DTYPE_I32:
+        LAUNCH_TAKE_KERNEL(int32_t);
+        break;
+
+    case INFINI_DTYPE_I64:
+        LAUNCH_TAKE_KERNEL(int64_t);
+        break;
+
+    case INFINI_DTYPE_I8:
+        LAUNCH_TAKE_KERNEL(int8_t);
+        break;
+
+    case INFINI_DTYPE_U8:
+        LAUNCH_TAKE_KERNEL(uint8_t);
+        break;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_TAKE_KERNEL
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/take/moore/take_moore_kernel.h b/src/infiniop/ops/take/moore/take_moore_kernel.h
new file mode 100644
index 000000000..70725cb3a
--- /dev/null
+++ b/src/infiniop/ops/take/moore/take_moore_kernel.h
@@ -0,0 +1,56 @@
+#ifndef __TAKE_MOORE_KERNEL_H__
+#define __TAKE_MOORE_KERNEL_H__
+
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>   // required for the __mt_bfloat16 definition
+
+#include <type_traits>   // for std::is_same_v
+
+namespace op::take::moore {
+
+typedef struct TakeOp {
+public:
+    // take involves two types: the data itself (T) and the index type (TIdx)
+    template <typename T, typename TIdx>
+    __device__ __forceinline__ void operator()(
+        const size_t curr_idx,     // linear index handled by this thread (position in output and indices)
+        const size_t num_in,       // total elements in the input tensor (for bounds checking)
+        const T* input_data,       // input data pointer
+        const TIdx* indices_data,  // index data pointer
+        T* output_data             // output data pointer
+    ) const {
+
+        // 1. Read the index value at this position.
+        //    indices and output have the same shape; curr_idx is the flattened index into both.
+        TIdx idx = indices_data[curr_idx];
+
+        // 2. Handle negative indices (Python-style, e.g. -1 means the last element).
+        //    If negative indices are not needed, drop this and simply reject idx < 0.
+        if (idx < 0) {
+            idx += static_cast<TIdx>(num_in);
+        }
+
+        // 3. Bounds check and assignment
+        if (idx >= 0 && idx < static_cast<TIdx>(num_in)) {
+            // Valid index: plain copy.
+            // half/bf16/float generally support direct assignment (=) in MUSA, no conversion needed.
+            output_data[curr_idx] = input_data[idx];
+        } else {
+            // Out of range: fill with 0.
+            // constexpr-based type handling, consistent with the avg_pool code style.
+            if constexpr (std::is_same_v<T, half>) {
+                output_data[curr_idx] = __float2half(0.0f);
+            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+                output_data[curr_idx] = __float2bfloat16(0.0f);
+            } else {
+                output_data[curr_idx] = static_cast<T>(0);
+            }
+        }
+    }
+
+} TakeOp;
+
+} // namespace op::take::moore
+
+#endif // __TAKE_MOORE_KERNEL_H__
\ No newline at end of file
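Note a behavioral split this patch ships: the Moore functor above (like the Metax kernel) wraps negative indices Python-style before the bounds check, while the CPU and generic CUDA paths zero-fill any negative index without wrapping. A host-side sketch of the wrap-then-check policy, illustrative only:

    // Reference of TakeOp's index policy (not part of the patch).
    inline long long take_ref_index(long long idx, long long num_in) {
        if (idx < 0) idx += num_in;                   // Python-style negative indexing
        return (idx >= 0 && idx < num_in) ? idx : -1; // -1 signals "zero-fill this output slot"
    }
    // e.g. num_in = 3: take_ref_index(-1, 3) == 2, take_ref_index(3, 3) == -1.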
diff --git a/src/infiniop/ops/take/nvidia/take_nvidia.cu b/src/infiniop/ops/take/nvidia/take_nvidia.cu
new file mode 100644
index 000000000..6f167110c
--- /dev/null
+++ b/src/infiniop/ops/take/nvidia/take_nvidia.cu
@@ -0,0 +1,152 @@
+#include "take_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cstdint>
+
+namespace op::take::nvidia {
+
+// ==================================================================
+// Helper: check pointer alignment
+// ==================================================================
+template <typename T>
+bool is_aligned(const void *ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+}
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T, typename TIdx>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const void *indices,
+    size_t num_out,
+    size_t num_in,
+    void *stream) {
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto idx_ptr = reinterpret_cast<const TIdx *>(indices);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // --- Vectorization parameters ---
+    // Goal: each thread moves 128 bits (16 bytes) to maximize bandwidth
+    constexpr int TotalBytes = 16;
+    constexpr int PackSize = TotalBytes / sizeof(T);
+
+    // Vectorization preconditions:
+    // 1. PackSize > 1: packing only pays off with at least 2 elements (2 for double, 4 for float)
+    // 2. Divisibility: num_out must be divisible by PackSize (simplifies kernel bounds handling)
+    // 3. Alignment: the output pointer must be 16-byte aligned
+    bool can_vectorize = (PackSize > 1) &&
+                         (num_out % PackSize == 0) &&
+                         is_aligned<T>(output, TotalBytes);
+
+    if (can_vectorize) {
+        // === Path A: vectorized kernel (high performance) ===
+        size_t num_packs = num_out / PackSize;
+
+        // Block/grid configuration
+        size_t block_size = 256;
+        size_t grid_size = (num_packs + block_size - 1) / block_size;
+
+        op::take::cuda::take_kernel_vectorized<T, TIdx, PackSize>
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, in_ptr, idx_ptr, num_packs, num_in
+            );
+    } else {
+        // === Path B: scalar kernel (fallback/compatibility) ===
+        size_t block_size = 256;
+        size_t grid_size = (num_out + block_size - 1) / block_size;
+
+        op::take::cuda::take_kernel
+            <<<grid_size, block_size, 0, cuda_stream>>>(
+                out_ptr, in_ptr, idx_ptr, num_out, num_in
+            );
+    }
+}
+
+// ==================================================================
+// Descriptor boilerplate (same pattern as the other backends)
+// ==================================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc,
+    infiniopTensorDescriptor_t indices_desc) {
+
+    auto info_result = TakeInfo::create(out_desc, in_desc, indices_desc);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(), info_result.take(), 0, handle->device, handle->device_id
+    );
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==================================================================
+// Calculate: combines byte-size dispatch with vectorization
+// ==================================================================
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto idx_dtype = _info.idx_dtype();
+    auto num_out = _info.num_out();
+    auto num_in = _info.num_in();
+
+    // Index-type dispatch macro
+    #define LAUNCH_BY_SIZE(T_STORAGE) \
+        switch (idx_dtype) { \
+        case INFINI_DTYPE_I32: \
+            launch_kernel<T_STORAGE, int32_t>(output, input, indices, num_out, num_in, stream); \
+            break; \
+        case INFINI_DTYPE_I64: \
+            launch_kernel<T_STORAGE, int64_t>(output, input, indices, num_out, num_in, stream); \
+            break; \
+        default: return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+        }
+
+    // Group dtypes by element byte width
+    size_t element_size = 0;
+    switch (dtype) {
+    case INFINI_DTYPE_I8: case INFINI_DTYPE_U8:
+        element_size = 1; break;
+    case INFINI_DTYPE_F16: case INFINI_DTYPE_BF16: case INFINI_DTYPE_I16: case INFINI_DTYPE_U16:
+        element_size = 2; break;
+    case INFINI_DTYPE_F32: case INFINI_DTYPE_I32: case INFINI_DTYPE_U32:
+        element_size = 4; break;
+    case INFINI_DTYPE_F64: case INFINI_DTYPE_I64: case INFINI_DTYPE_U64:
+        element_size = 8; break;
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // Dispatch to the matching storage type
+    switch (element_size) {
+    case 1: LAUNCH_BY_SIZE(uint8_t); break;  // PackSize = 16
+    case 2: LAUNCH_BY_SIZE(uint16_t); break; // PackSize = 8
+    case 4: LAUNCH_BY_SIZE(uint32_t); break; // PackSize = 4 (float4)
+    case 8: LAUNCH_BY_SIZE(uint64_t); break; // PackSize = 2 (double2)
+    default: return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    #undef LAUNCH_BY_SIZE
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::take::nvidia
\ No newline at end of file
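Dispatching on element width rather than dtype works because `take` never performs arithmetic on the payload; it only moves bits, so a single `uint16_t` instantiation serves F16, BF16, I16 and U16 alike, cutting the number of compiled kernels by a factor of three. An illustrative check of the property this relies on:

    // Why reinterpretation is sound here: the kernels only copy T, never compute
    // with it, so any trivially-copyable stand-in of equal size reproduces the
    // payload exactly.
    #include <cstdint>
    #include <type_traits>
    static_assert(std::is_trivially_copyable_v<std::uint16_t>);
    static_assert(sizeof(std::uint16_t) == 2 && sizeof(std::uint32_t) == 4 && sizeof(std::uint64_t) == 8);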
"../take.h" + +DESCRIPTOR(nvidia) + +#endif // __TAKE_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/take/operator.cc b/src/infiniop/ops/take/operator.cc new file mode 100644 index 000000000..8e99d8195 --- /dev/null +++ b/src/infiniop/ops/take/operator.cc @@ -0,0 +1,176 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/take.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/take_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/take_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/take_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/take_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateTakeDescriptor( + infiniopHandle_t handle, + infiniopTakeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices) { + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::take::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + indices) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetTakeWorkspaceSize(infiniopTakeDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopTake(
+    infiniopTakeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *indices,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::take::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, indices, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyTakeDescriptor(infiniopTakeDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::take::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/take/take.h b/src/infiniop/ops/take/take.h
new file mode 100644
index 000000000..d45e867ac
--- /dev/null
+++ b/src/infiniop/ops/take/take.h
@@ -0,0 +1,50 @@
+#ifndef __TAKE_H__
+#define __TAKE_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the TakeInfo definition generated above
+
+// Macro: generates the Descriptor class for each backend namespace
+#define DESCRIPTOR(NAMESPACE) \
+ \
+    namespace op::take::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        TakeInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            Opaque *opaque, \
+            TakeInfo info, \
+            size_t workspace_size, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t in_desc, \
+            infiniopTensorDescriptor_t indices_desc); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *output, \
+            const void *input, \
+            const void *indices, \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __TAKE_H__
\ No newline at end of file
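The Python tests below reach these entry points through `infinicore.take`; at the C level the equivalent sequence is roughly the following sketch (`handle`, the descriptors, the buffers and `stream` are assumed to exist):

    // Hedged sketch of the take C-API flow defined above.
    infiniopTakeDescriptor_t desc = nullptr;
    infiniopCreateTakeDescriptor(handle, &desc, out_desc, in_desc, indices_desc); // output shape must equal indices shape
    size_t ws = 0;
    infiniopGetTakeWorkspaceSize(desc, &ws); // 0 for every backend in this patch
    infiniopTake(desc, /*workspace=*/nullptr, ws, output, input, indices, stream);
    infiniopDestroyTakeDescriptor(desc);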
diff --git a/test/infinicore/ops/hypot.py b/test/infinicore/ops/hypot.py
index 283214c0e..593f09b47 100644
--- a/test/infinicore/ops/hypot.py
+++ b/test/infinicore/ops/hypot.py
@@ -103,9 +103,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.hypot(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.hypot(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.hypot(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/index_add.py b/test/infinicore/ops/index_add.py
index ee136cf31..531e04097 100644
--- a/test/infinicore/ops/index_add.py
+++ b/test/infinicore/ops/index_add.py
@@ -91,9 +91,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.index_add(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.index_add(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.index_add(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/index_copy.py b/test/infinicore/ops/index_copy.py
index f38403320..d2e34b7c3 100644
--- a/test/infinicore/ops/index_copy.py
+++ b/test/infinicore/ops/index_copy.py
@@ -89,9 +89,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.index_copy(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.index_copy(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.index_copy(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/smooth_l1_loss.py b/test/infinicore/ops/smooth_l1_loss.py
index 7f0ca670c..76dae9496 100644
--- a/test/infinicore/ops/smooth_l1_loss.py
+++ b/test/infinicore/ops/smooth_l1_loss.py
@@ -67,9 +67,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.smooth_l1_loss(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.smooth_l1_loss(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.smooth_l1_loss(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/take.py b/test/infinicore/ops/take.py
index 5ca28b86e..22b9e39ef 100644
--- a/test/infinicore/ops/take.py
+++ b/test/infinicore/ops/take.py
@@ -67,9 +67,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.take(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.take(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.take(*args, **kwargs)
 
 
 def main():