Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ id_ed25519.pub
*.model
.cline_storage
*.egg-info
CLAUDE.md
79 changes: 79 additions & 0 deletions aie_kernels/aie2/dual_gemv_silu_mul.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2.
// Same structure as AIE2+ variant but uses LUT-based getTanhBf16.

#define NOCPP

#include "../aie_kernel_utils.h"
#include "lut_based_ops.h"

#include <aie_api/aie.hpp>
#include <stdint.h>
#include <type_traits>

// File-local scratch buffers (1024 bfloat16 elements each, 64-byte aligned).
// dual_gemv_matvec_bf16 stores its results into left_buf (phase == 0) or
// right_buf (any other phase); dual_gemv_silu_mul_bf16 reads both back.
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
static bfloat16 right_buf[1024] __attribute__((aligned(64)));

// Vectorized matrix-vector product: writes m dot products into c, each formed
// by multiply-accumulating a run of elements from a against the vector b.
//
// Template parameter:
//   r - number of bfloat16 lanes loaded and multiply-accumulated per inner step.
// Parameters:
//   m - number of output elements written to c (one per outer-loop iteration).
//   k - number of elements of b spanned by the inner loop.
//   a - matrix operand; the pointer is advanced by r on every inner step and is
//       never rewound, so each output row consumes the elements that directly
//       follow the previous row's.
//   b - vector operand; re-read from its start for every output element.
//   c - destination for the m bfloat16 results.
//
// NOTE(review): the inner loop steps by r and is annotated with a minimum
// iteration count of 2, so this appears to assume k is a multiple of r and
// k >= 2 * r -- confirm against the callers.
template <uint32_t r>
void matvec_vectorized(uint32_t m,
uint32_t k,
const bfloat16 *__restrict a,
const bfloat16 *__restrict b,
bfloat16 *__restrict c)
{
// Select the conv_even rounding mode before any results are produced.
::aie::set_rounding(aie::rounding_mode::conv_even);
bfloat16 *c_end = c + m;
const bfloat16 *b_end = b + k;
for (; c < c_end; c++) {
// Fresh r-lane float accumulator for this output element.
aie::accum acc = aie::zeros<accfloat, r>();
AIE_LOOP_MIN_ITERATION_COUNT(2)
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
acc = aie::mac(acc, a_vec, b_vec);
}
// Collapse the r partial sums into one float, then narrow to bfloat16.
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
}
}

extern "C" {

void dual_gemv_matvec_bf16(uint32_t m,
uint32_t k,
uint32_t row_offset,
const bfloat16 *__restrict a_in,
const bfloat16 *__restrict b_in,
uint32_t phase)
{
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
dst += row_offset;
matvec_vectorized<64>(m, k, a_in, b_in, dst);
}

// Final phase of the fused kernel: for each element,
//   c_out[i] = silu(left_buf[i]) * right_buf[i]
// with silu(x) = x * sigmoid(x) and sigmoid(x) = 0.5 * (1 + tanh(x / 2)).
// tanh is evaluated with getTanhBf16.
//   c_out    - destination for the fused results.
//   m_output - number of elements to process; consumed 16 at a time.
// NOTE(review): presumably m_output is a multiple of 16 and no larger than the
// 1024-element scratch buffers -- confirm against the caller.
// event0()/event1() bracket the computation.
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
{
    event0();

    const aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    const aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    for (int offset = 0; offset < m_output; offset += 16) {
        aie::vector<bfloat16, 16> lhs = aie::load_v<16>(left_buf + offset);
        aie::vector<bfloat16, 16> rhs = aie::load_v<16>(right_buf + offset);

        // sigmoid(lhs) = 0.5 * (1 + tanh(lhs / 2))
        aie::vector<bfloat16, 16> lhs_halved = aie::mul(lhs, half);
        aie::vector<bfloat16, 16> tanh_val = getTanhBf16(lhs_halved);
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(lhs) * rhs, narrowing each product back to bfloat16
        auto silu_acc = aie::mul(lhs, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), rhs);
        aie::store_v(c_out + offset, product_acc.to_vector<bfloat16>());
    }

    event1();
}

} // extern "C"
61 changes: 61 additions & 0 deletions aie_kernels/aie2/silu_mul.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#include "../aie_kernel_utils.h"
#include "lut_based_ops.h"

#include <aie_api/aie.hpp>
#include <stdint.h>

using namespace aie;

// Elementwise fused SiLU-and-multiply:
//   output_vector[i] = silu(silu_input[i]) * mul_input[i]
// with silu(x) = x * sigmoid(x) and sigmoid(x) = 0.5 * (1 + tanh(x / 2)).
// tanh is evaluated with getTanhBf16.
//   silu_input    - values passed through SiLU.
//   mul_input     - values multiplied with the SiLU result.
//   output_vector - destination for the fused results.
//   vector_size   - number of elements to process; consumed 16 at a time.
// NOTE(review): the loop is annotated with a minimum iteration count of 64, so
// vector_size is presumably at least 1024 and a multiple of 16 -- confirm.
void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
                               bfloat16 *restrict mul_input,
                               bfloat16 *restrict output_vector,
                               const int32_t vector_size)
{
    event0();

    auto src_silu = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(silu_input));
    auto src_mul = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(mul_input));
    auto dst = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(output_vector));

    const aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    const aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(64)
    for (int processed = 0; processed < vector_size; processed += 16) {
        // Fetch the next 16 lanes of each operand and advance the iterators.
        aie::vector<bfloat16, 16> x = *src_silu;
        ++src_silu;
        aie::vector<bfloat16, 16> scale = *src_mul;
        ++src_mul;

        // sigmoid(x) = 0.5 * (1 + tanh(x / 2))
        aie::vector<bfloat16, 16> x_halved = aie::mul(x, half);
        aie::vector<bfloat16, 16> tanh_val = getTanhBf16(x_halved);
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(x) * scale, narrowing each product back to bfloat16
        auto silu_acc = aie::mul(x, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale);

        *dst = product_acc.to_vector<bfloat16>();
        ++dst;
    }

    event1();
}

extern "C" {

// C-linkage entry point for the fused SiLU-and-multiply kernel.
// Forwards all four arguments unchanged to silu_mul_tanh_approx_bf16;
// input_size is passed as that function's vector_size.
void silu_mul_bf16(bfloat16 *restrict silu_input,
bfloat16 *restrict mul_input,
bfloat16 *restrict output,
int input_size)
{
silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
}

} // extern "C"
90 changes: 90 additions & 0 deletions aie_kernels/aie2p/dual_gemv_silu_mul.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2+.
//
// Computes: output = silu(W1 @ x) * (W2 @ x)
//
// Two entry points called from the NPU design's core body:
// 1. dual_gemv_matvec_bf16: GEMV writing to static left_buf/right_buf (selected by phase) at row_offset
// 2. dual_gemv_silu_mul_bf16: reads from static left_buf/right_buf, writes to FIFO c_out
//
// The static buffers are written via scalar stores (from matvec) and read
// via aie::load_v in the silu_mul phase. Aligned to 64 bytes for safe vector access.

#define NOCPP

#include "../aie_kernel_utils.h"

#include <aie_api/aie.hpp>
#include <stdint.h>
#include <type_traits>

// File-local scratch buffers (1024 bfloat16 elements each, 64-byte aligned).
// dual_gemv_matvec_bf16 stores its results into left_buf (phase == 0) or
// right_buf (any other phase); dual_gemv_silu_mul_bf16 reads both back.
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
static bfloat16 right_buf[1024] __attribute__((aligned(64)));

// Vectorized matrix-vector product: writes m dot products into c, each formed
// by multiply-accumulating a run of elements from a against the vector b.
//
// Template parameter:
//   r - number of bfloat16 lanes loaded and multiply-accumulated per inner step.
// Parameters:
//   m - number of output elements written to c (one per outer-loop iteration).
//   k - number of elements of b spanned by the inner loop.
//   a - matrix operand; the pointer is advanced by r on every inner step and is
//       never rewound, so each output row consumes the elements that directly
//       follow the previous row's.
//   b - vector operand; re-read from its start for every output element.
//   c - destination for the m bfloat16 results.
//
// NOTE(review): the inner loop steps by r and is annotated with a minimum
// iteration count of 2, so this appears to assume k is a multiple of r and
// k >= 2 * r -- confirm against the callers.
template <uint32_t r>
void matvec_vectorized(uint32_t m,
uint32_t k,
const bfloat16 *__restrict a,
const bfloat16 *__restrict b,
bfloat16 *__restrict c)
{
// Select the conv_even rounding mode before any results are produced.
::aie::set_rounding(aie::rounding_mode::conv_even);
bfloat16 *c_end = c + m;
const bfloat16 *b_end = b + k;
for (; c < c_end; c++) {
// Fresh r-lane float accumulator for this output element.
aie::accum acc = aie::zeros<accfloat, r>();
AIE_LOOP_MIN_ITERATION_COUNT(2)
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
acc = aie::mac(acc, a_vec, b_vec);
}
// Collapse the r partial sums into one float, then narrow to bfloat16.
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
}
}

extern "C" {

// Phase 1 & 2: GEMV writing to a static buffer (left_buf or right_buf)
// phase=0 writes to left_buf, phase=1 writes to right_buf
void dual_gemv_matvec_bf16(uint32_t m,
uint32_t k,
uint32_t row_offset,
const bfloat16 *__restrict a_in,
const bfloat16 *__restrict b_in,
uint32_t phase)
{
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
dst += row_offset;
matvec_vectorized<64>(m, k, a_in, b_in, dst);
}

// Phase 3 of the fused kernel: for each element,
//   c_out[i] = silu(left_buf[i]) * right_buf[i]
// with silu(x) = x * sigmoid(x) = x * 0.5 * (1 + tanh(x / 2)).
// tanh is evaluated with aie::tanh on the float form of x / 2.
//   c_out    - destination for the fused results.
//   m_output - number of elements to process; consumed 16 at a time.
// NOTE(review): presumably m_output is a multiple of 16 and no larger than the
// 1024-element scratch buffers -- confirm against the caller.
// event0()/event1() bracket the computation.
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
{
    event0();

    const aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    const aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    for (int offset = 0; offset < m_output; offset += 16) {
        aie::vector<bfloat16, 16> lhs = aie::load_v<16>(left_buf + offset);
        aie::vector<bfloat16, 16> rhs = aie::load_v<16>(right_buf + offset);

        // sigmoid(lhs) = 0.5 * (1 + tanh(lhs / 2))
        auto lhs_halved = aie::mul(lhs, half);
        auto tanh_val = aie::tanh<bfloat16>(lhs_halved.to_vector<float>());
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(lhs) * rhs, narrowing each product back to bfloat16
        auto silu_acc = aie::mul(lhs, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), rhs);
        aie::store_v(c_out + offset, product_acc.to_vector<bfloat16>());
    }

    event1();
}

} // extern "C"
60 changes: 60 additions & 0 deletions aie_kernels/aie2p/silu_mul.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#include "../aie_kernel_utils.h"

#include <aie_api/aie.hpp>
#include <stdint.h>

using namespace aie;

// Elementwise fused SiLU-and-multiply:
//   output_vector[i] = silu(silu_input[i]) * mul_input[i]
// with silu(x) = x * sigmoid(x) and sigmoid(x) = 0.5 * (1 + tanh(x / 2)).
// tanh is evaluated with aie::tanh on the float form of x / 2.
//   silu_input    - values passed through SiLU.
//   mul_input     - values multiplied with the SiLU result.
//   output_vector - destination for the fused results.
//   vector_size   - number of elements to process; consumed 16 at a time.
// NOTE(review): the loop is annotated with a minimum iteration count of 64, so
// vector_size is presumably at least 1024 and a multiple of 16 -- confirm.
void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
                               bfloat16 *restrict mul_input,
                               bfloat16 *restrict output_vector,
                               const int32_t vector_size)
{
    event0();

    auto src_silu = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(silu_input));
    auto src_mul = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(mul_input));
    auto dst = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(output_vector));

    const aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    const aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(64)
    for (int processed = 0; processed < vector_size; processed += 16) {
        // Fetch the next 16 lanes of each operand and advance the iterators.
        aie::vector<bfloat16, 16> x = *src_silu;
        ++src_silu;
        aie::vector<bfloat16, 16> scale = *src_mul;
        ++src_mul;

        // sigmoid(x) = 0.5 * (1 + tanh(x / 2))
        auto x_halved = aie::mul(x, half);
        auto tanh_val = aie::tanh<bfloat16>(x_halved.to_vector<float>());
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(x) * scale, narrowing each product back to bfloat16
        auto silu_acc = aie::mul(x, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale);

        *dst = product_acc.to_vector<bfloat16>();
        ++dst;
    }

    event1();
}

extern "C" {

// C-linkage entry point for the fused SiLU-and-multiply kernel.
// Forwards all four arguments unchanged to silu_mul_tanh_approx_bf16;
// input_size is passed as that function's vector_size.
void silu_mul_bf16(bfloat16 *restrict silu_input,
bfloat16 *restrict mul_input,
bfloat16 *restrict output,
int input_size)
{
silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
}

} // extern "C"
2 changes: 2 additions & 0 deletions iron/operators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from .axpy.op import AIEAXPY
from .dequant.op import AIEDequant
from .dual_gemv_silu_mul.op import AIEDualGEMVSiLUMul
from .elementwise_add.op import AIEElementwiseAdd
from .elementwise_mul.op import AIEElementwiseMul
from .gelu.op import AIEGELU
Expand All @@ -17,6 +18,7 @@
from .rope.op import AIERope
from .sigmoid.op import AIESigmoid
from .silu.op import AIESiLU
from .silu_mul.op import AIESiLUMul
from .softmax.op import AIESoftmax
from .swiglu_decode.op import AIESwiGLUDecode
from .swiglu_prefill.op import AIESwiGLUPrefill
Expand Down
Loading