amd · jgmelber · Mar 7, 2026 · Mar 7, 2026
@@ -20,3 +20,4 @@ id_ed25519.pub
 *.model
 .cline_storage
 *.egg-info
+CLAUDE.md
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2.
+// Same structure as AIE2+ variant but uses LUT-based getTanhBf16.
+
+#define NOCPP
+
+#include "../aie_kernel_utils.h"
+#include "lut_based_ops.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+#include <type_traits>
+
+// Staging buffers shared by the two entry points in this file: 1024 bfloat16
+// elements each, aligned to 64 bytes. dual_gemv_matvec_bf16 writes into them
+// and dual_gemv_silu_mul_bf16 reads them back with 16-lane vector loads.
+alignas(64) static bfloat16 left_buf[1024];
+alignas(64) static bfloat16 right_buf[1024];
+
+// Vectorized matrix-vector product. When k is a multiple of r this computes
+// c[i] = sum over j of a[i * k + j] * b[j] for i in [0, m).
+//
+// Template parameter:
+//   r - number of bfloat16 lanes consumed per multiply-accumulate step.
+// Parameters:
+//   m - number of output elements to produce.
+//   k - length of `b`; the inner loop walks it in steps of r, and the loop
+//       hint below declares a minimum of two such steps per output element.
+//   a - matrix data, read r elements at a time; the read position is never
+//       rewound, so each output row continues where the previous one stopped.
+//   b - input vector, re-read from its start for every output element.
+//   c - destination, written one scalar element per output row.
+template <uint32_t r>
+void matvec_vectorized(uint32_t m,
+                       uint32_t k,
+                       const bfloat16 *__restrict a,
+                       const bfloat16 *__restrict b,
+                       bfloat16 *__restrict c)
+{
+    // Select the conv_even rounding mode before any arithmetic is issued.
+    ::aie::set_rounding(aie::rounding_mode::conv_even);
+
+    const bfloat16 *const vec_stop = b + k;
+    bfloat16 *const out_stop = c + m;
+
+    for (; c < out_stop; ++c) {
+        // One float accumulator lane per input lane; lanes are summed after the row.
+        aie::accum<accfloat, r> row_sum = aie::zeros<accfloat, r>();
+        AIE_LOOP_MIN_ITERATION_COUNT(2)
+        for (const bfloat16 *__restrict vec_pos = b; vec_pos < vec_stop; vec_pos += r) {
+            aie::vector<bfloat16, r> mat_chunk = aie::load_v<r>(a);
+            aie::vector<bfloat16, r> vec_chunk = aie::load_v<r>(vec_pos);
+            row_sum = aie::mac(row_sum, mat_chunk, vec_chunk);
+            a += r;
+        }
+        // Horizontal add of the lanes, then narrow the scalar to bfloat16.
+        const auto dot = aie::reduce_add(row_sum.template to_vector<float>());
+        *c = static_cast<bfloat16>(dot);
+    }
+}
+
+extern "C" {
+
+// Runs one GEMV pass and stores its m results in a file-static staging buffer.
+//
+//   m, k       - forwarded to matvec_vectorized<64> (output count and vector length).
+//   row_offset - element offset into the staging buffer where results start.
+//   a_in, b_in - matrix and vector operands, forwarded unchanged.
+//   phase      - 0 selects left_buf; any other value selects right_buf.
+//
+// No bounds check is performed: the caller must keep row_offset + m within
+// the 1024-element buffer.
+void dual_gemv_matvec_bf16(uint32_t m,
+                           uint32_t k,
+                           uint32_t row_offset,
+                           const bfloat16 *__restrict a_in,
+                           const bfloat16 *__restrict b_in,
+                           uint32_t phase)
+{
+    bfloat16 *staging = right_buf;
+    if (phase == 0) {
+        staging = left_buf;
+    }
+    matvec_vectorized<64>(m, k, a_in, b_in, staging + row_offset);
+}
+
+// Combines the two staged GEMV results as silu(left_buf) * right_buf and
+// writes the product to c_out, using the LUT-based getTanhBf16 for tanh.
+//
+//   c_out    - destination for the fused result.
+//   m_output - number of elements to process. Each iteration reads and writes
+//              a full 16-lane vector, so this is expected to be a multiple of
+//              16 and no larger than the 1024-element staging buffers.
+//
+// event0()/event1() bracket the computation (presumably trace markers -
+// confirm against the kernel utilities).
+void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
+{
+    event0();
+
+    aie::vector<bfloat16, 16> half_vec = aie::broadcast<bfloat16, 16>(0.5f);
+    aie::vector<bfloat16, 16> one_vec = aie::broadcast<bfloat16, 16>(1.0f);
+    AIE_PREPARE_FOR_PIPELINING
+    for (int base = 0; base < m_output; base += 16) {
+        aie::vector<bfloat16, 16> x_vec = aie::load_v<16>(left_buf + base);
+        aie::vector<bfloat16, 16> scale_vec = aie::load_v<16>(right_buf + base);
+
+        // sigmoid(x) = 0.5 * (1 + tanh(x / 2))
+        aie::vector<bfloat16, 16> x_halved = aie::mul(x_vec, half_vec);
+        aie::vector<bfloat16, 16> tanh_term = getTanhBf16(x_halved);
+        auto one_plus_tanh = aie::add(tanh_term, one_vec);
+        aie::vector<bfloat16, 16> sigmoid_vec = aie::mul(one_plus_tanh, half_vec);
+
+        // silu(x) = x * sigmoid(x), then scale by the right-hand operand.
+        auto silu_acc = aie::mul(x_vec, sigmoid_vec);
+        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale_vec);
+        aie::store_v(c_out + base, product_acc.to_vector<bfloat16>());
+    }
+
+    event1();
+}
+
+} // extern "C"
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../aie_kernel_utils.h"
+#include "lut_based_ops.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+using namespace aie;
+
+// Elementwise fused kernel: output_vector[i] = silu(silu_input[i]) * mul_input[i],
+// with sigmoid evaluated as 0.5 * (1 + tanh(x / 2)) and tanh supplied by the
+// LUT-based getTanhBf16.
+//
+//   silu_input    - operand passed through SiLU.
+//   mul_input     - operand multiplied with the SiLU result.
+//   output_vector - destination buffer.
+//   vector_size   - element count. Data is consumed 16 lanes per iteration, so
+//                   this is expected to be a multiple of 16; the loop hint
+//                   below additionally declares a minimum of 64 iterations.
+//
+// event0()/event1() bracket the computation (presumably trace markers -
+// confirm against the kernel utilities).
+void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
+                               bfloat16 *restrict mul_input,
+                               bfloat16 *restrict output_vector,
+                               const int32_t vector_size)
+{
+    event0();
+
+    auto x_it = aie::begin_restrict_vector<16>((bfloat16 *)silu_input);
+    auto scale_it = aie::begin_restrict_vector<16>((bfloat16 *)mul_input);
+    auto dst_it = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+
+    aie::vector<bfloat16, 16> half_vec = aie::broadcast<bfloat16, 16>(0.5f);
+    aie::vector<bfloat16, 16> one_vec = aie::broadcast<bfloat16, 16>(1.0f);
+    AIE_PREPARE_FOR_PIPELINING
+    AIE_LOOP_MIN_ITERATION_COUNT(64)
+    for (int done = 0; done < vector_size; done += 16) {
+        // Fetch the next 16 lanes of each operand.
+        aie::vector<bfloat16, 16> x_vec = *x_it++;
+        aie::vector<bfloat16, 16> scale_vec = *scale_it++;
+
+        // sigmoid(x) = 0.5 * (1 + tanh(x / 2))
+        aie::vector<bfloat16, 16> x_halved = aie::mul(x_vec, half_vec);
+        aie::vector<bfloat16, 16> tanh_term = getTanhBf16(x_halved);
+        auto one_plus_tanh = aie::add(tanh_term, one_vec);
+        aie::vector<bfloat16, 16> sigmoid_vec = aie::mul(one_plus_tanh, half_vec);
+
+        // silu(x) = x * sigmoid(x), then scale by the second operand.
+        auto silu_acc = aie::mul(x_vec, sigmoid_vec);
+        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale_vec);
+
+        // Emit the 16 results.
+        *dst_it++ = product_acc.to_vector<bfloat16>();
+    }
+
+    event1();
+}
+
+extern "C" {
+
+// C-linkage entry point for the fused SiLU-multiply kernel. Forwards all four
+// arguments unchanged to silu_mul_tanh_approx_bf16:
+//   silu_input - operand passed through SiLU.
+//   mul_input  - operand multiplied with the SiLU result.
+//   output     - destination buffer.
+//   input_size - element count handed to the implementation.
+void silu_mul_bf16(bfloat16 *restrict silu_input,
+                   bfloat16 *restrict mul_input,
+                   bfloat16 *restrict output,
+                   int input_size)
+{
+    silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
+}
+
+} // extern "C"
@@ -0,0 +1,90 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2+.
+//
+// Computes: output = silu(W1 @ x) * (W2 @ x)
+//
+// Two entry points called from the NPU design's core body:
+//   1. dual_gemv_matvec_bf16: GEMV writing to static left_buf/right_buf (selected by phase) at row_offset
+//   2. dual_gemv_silu_mul_bf16: reads from static left_buf/right_buf, writes to FIFO c_out
+//
+// The static buffers are written via scalar stores (from matvec) and read
+// via aie::load_v in the silu_mul phase. Aligned to 64 bytes for safe vector access.
+
+#define NOCPP
+
+#include "../aie_kernel_utils.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+#include <type_traits>
+
+// Staging buffers shared by the two entry points in this file: 1024 bfloat16
+// elements each, aligned to 64 bytes. dual_gemv_matvec_bf16 writes into them
+// and dual_gemv_silu_mul_bf16 reads them back with 16-lane vector loads.
+alignas(64) static bfloat16 left_buf[1024];
+alignas(64) static bfloat16 right_buf[1024];
+
+// Vectorized matrix-vector product. When k is a multiple of r this computes
+// c[i] = sum over j of a[i * k + j] * b[j] for i in [0, m).
+//
+// Template parameter:
+//   r - number of bfloat16 lanes consumed per multiply-accumulate step.
+// Parameters:
+//   m - number of output elements to produce.
+//   k - length of `b`; the inner loop walks it in steps of r, and the loop
+//       hint below declares a minimum of two such steps per output element.
+//   a - matrix data, read r elements at a time; the read position is never
+//       rewound, so each output row continues where the previous one stopped.
+//   b - input vector, re-read from its start for every output element.
+//   c - destination, written one scalar element per output row.
+template <uint32_t r>
+void matvec_vectorized(uint32_t m,
+                       uint32_t k,
+                       const bfloat16 *__restrict a,
+                       const bfloat16 *__restrict b,
+                       bfloat16 *__restrict c)
+{
+    // Select the conv_even rounding mode before any arithmetic is issued.
+    ::aie::set_rounding(aie::rounding_mode::conv_even);
+
+    const bfloat16 *const vec_stop = b + k;
+    bfloat16 *const out_stop = c + m;
+
+    for (; c < out_stop; ++c) {
+        // One float accumulator lane per input lane; lanes are summed after the row.
+        aie::accum<accfloat, r> row_sum = aie::zeros<accfloat, r>();
+        AIE_LOOP_MIN_ITERATION_COUNT(2)
+        for (const bfloat16 *__restrict vec_pos = b; vec_pos < vec_stop; vec_pos += r) {
+            aie::vector<bfloat16, r> mat_chunk = aie::load_v<r>(a);
+            aie::vector<bfloat16, r> vec_chunk = aie::load_v<r>(vec_pos);
+            row_sum = aie::mac(row_sum, mat_chunk, vec_chunk);
+            a += r;
+        }
+        // Horizontal add of the lanes, then narrow the scalar to bfloat16.
+        const auto dot = aie::reduce_add(row_sum.template to_vector<float>());
+        *c = static_cast<bfloat16>(dot);
+    }
+}
+
+extern "C" {
+
+// Phases 1 and 2: run one GEMV pass and store its m results in a file-static
+// staging buffer.
+//
+//   m, k       - forwarded to matvec_vectorized<64> (output count and vector length).
+//   row_offset - element offset into the staging buffer where results start.
+//   a_in, b_in - matrix and vector operands, forwarded unchanged.
+//   phase      - 0 selects left_buf; any other value selects right_buf.
+//
+// No bounds check is performed: the caller must keep row_offset + m within
+// the 1024-element buffer.
+void dual_gemv_matvec_bf16(uint32_t m,
+                           uint32_t k,
+                           uint32_t row_offset,
+                           const bfloat16 *__restrict a_in,
+                           const bfloat16 *__restrict b_in,
+                           uint32_t phase)
+{
+    bfloat16 *staging = right_buf;
+    if (phase == 0) {
+        staging = left_buf;
+    }
+    matvec_vectorized<64>(m, k, a_in, b_in, staging + row_offset);
+}
+
+// Phase 3: combines the two staged GEMV results as silu(left_buf) * right_buf
+// and writes the product to c_out.
+//
+//   c_out    - destination for the fused result.
+//   m_output - number of elements to process. Each iteration reads and writes
+//              a full 16-lane vector, so this is expected to be a multiple of
+//              16 and no larger than the 1024-element staging buffers.
+//
+// event0()/event1() bracket the computation (presumably trace markers -
+// confirm against the kernel utilities).
+void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
+{
+    event0();
+
+    aie::vector<bfloat16, 16> half_vec = aie::broadcast<bfloat16, 16>(0.5f);
+    aie::vector<bfloat16, 16> one_vec = aie::broadcast<bfloat16, 16>(1.0f);
+    AIE_PREPARE_FOR_PIPELINING
+    for (int base = 0; base < m_output; base += 16) {
+        aie::vector<bfloat16, 16> x_vec = aie::load_v<16>(left_buf + base);
+        aie::vector<bfloat16, 16> scale_vec = aie::load_v<16>(right_buf + base);
+
+        // sigmoid(x) = 0.5 * (1 + tanh(x / 2)); tanh is evaluated on float lanes.
+        auto x_halved = aie::mul(x_vec, half_vec);
+        auto tanh_term = aie::tanh<bfloat16>(x_halved.to_vector<float>());
+        auto one_plus_tanh = aie::add(tanh_term, one_vec);
+        aie::vector<bfloat16, 16> sigmoid_vec = aie::mul(one_plus_tanh, half_vec);
+
+        // silu(x) = x * sigmoid(x), then scale by the right-hand operand.
+        auto silu_acc = aie::mul(x_vec, sigmoid_vec);
+        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale_vec);
+        aie::store_v(c_out + base, product_acc.to_vector<bfloat16>());
+    }
+
+    event1();
+}
+
+} // extern "C"
@@ -0,0 +1,60 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../aie_kernel_utils.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+using namespace aie;
+
+// Elementwise fused kernel: output_vector[i] = silu(silu_input[i]) * mul_input[i],
+// with sigmoid evaluated as 0.5 * (1 + tanh(x / 2)) using aie::tanh.
+//
+//   silu_input    - operand passed through SiLU.
+//   mul_input     - operand multiplied with the SiLU result.
+//   output_vector - destination buffer.
+//   vector_size   - element count. Data is consumed 16 lanes per iteration, so
+//                   this is expected to be a multiple of 16; the loop hint
+//                   below additionally declares a minimum of 64 iterations.
+//
+// event0()/event1() bracket the computation (presumably trace markers -
+// confirm against the kernel utilities).
+void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
+                               bfloat16 *restrict mul_input,
+                               bfloat16 *restrict output_vector,
+                               const int32_t vector_size)
+{
+    event0();
+
+    auto x_it = aie::begin_restrict_vector<16>((bfloat16 *)silu_input);
+    auto scale_it = aie::begin_restrict_vector<16>((bfloat16 *)mul_input);
+    auto dst_it = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+
+    aie::vector<bfloat16, 16> half_vec = aie::broadcast<bfloat16, 16>(0.5f);
+    aie::vector<bfloat16, 16> one_vec = aie::broadcast<bfloat16, 16>(1.0f);
+    AIE_PREPARE_FOR_PIPELINING
+    AIE_LOOP_MIN_ITERATION_COUNT(64)
+    for (int done = 0; done < vector_size; done += 16) {
+        // Fetch the next 16 lanes of each operand.
+        aie::vector<bfloat16, 16> x_vec = *x_it++;
+        aie::vector<bfloat16, 16> scale_vec = *scale_it++;
+
+        // sigmoid(x) = 0.5 * (1 + tanh(x / 2)); tanh is evaluated on float lanes.
+        auto x_halved = aie::mul(x_vec, half_vec);
+        auto tanh_term = aie::tanh<bfloat16>(x_halved.to_vector<float>());
+        auto one_plus_tanh = aie::add(tanh_term, one_vec);
+        aie::vector<bfloat16, 16> sigmoid_vec = aie::mul(one_plus_tanh, half_vec);
+
+        // silu(x) = x * sigmoid(x), then scale by the second operand.
+        auto silu_acc = aie::mul(x_vec, sigmoid_vec);
+        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale_vec);
+
+        // Emit the 16 results.
+        *dst_it++ = product_acc.to_vector<bfloat16>();
+    }
+
+    event1();
+}
+
+extern "C" {
+
+// C-linkage entry point for the fused SiLU-multiply kernel. Forwards all four
+// arguments unchanged to silu_mul_tanh_approx_bf16:
+//   silu_input - operand passed through SiLU.
+//   mul_input  - operand multiplied with the SiLU result.
+//   output     - destination buffer.
+//   input_size - element count handed to the implementation.
+void silu_mul_bf16(bfloat16 *restrict silu_input,
+                   bfloat16 *restrict mul_input,
+                   bfloat16 *restrict output,
+                   int input_size)
+{
+    silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
+}
+
+} // extern "C"
@@ -3,6 +3,7 @@
 
 from .axpy.op import AIEAXPY
 from .dequant.op import AIEDequant
+from .dual_gemv_silu_mul.op import AIEDualGEMVSiLUMul
 from .elementwise_add.op import AIEElementwiseAdd
 from .elementwise_mul.op import AIEElementwiseMul
 from .gelu.op import AIEGELU
@@ -17,6 +18,7 @@
 from .rope.op import AIERope
 from .sigmoid.op import AIESigmoid
 from .silu.op import AIESiLU
+from .silu_mul.op import AIESiLUMul
 from .softmax.op import AIESoftmax
 from .swiglu_decode.op import AIESwiGLUDecode
 from .swiglu_prefill.op import AIESwiGLUPrefill