Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions lib/include/pulp_conv_naive_fp32.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ void dw_kernel_weight_grad(
void * matMul_DW_args
);

/**
* @brief Weight gradient kernel for Depthwise Convolution with padding and arbitrary stride.
*        In pulp_conv_dw_fp32_bw_param_grads_cl this kernel is launched through
*        pi_cl_team_fork(NUM_CORES, ...) when any of Lpad/Rpad/Upad/Dpad is non-zero or
*        stride_h/stride_w differs from 1; otherwise dw_kernel_weight_grad is launched instead.
* @param matMul_DW_args pointer to a kernel_DW_args structure, passed type-erased as void *.
*        NOTE(review): the neighbouring declarations describe this argument as a
*        "matMul_DW_args structure" - confirm which struct name is the correct one.
*/
void dw_kernel_weight_grad_padded(
void * matMul_DW_args
);

/**
* @brief Naive core kernel for Depthwise Convolution (input gradient). Parallelizes on the channels.
* @param matMul_DW_args pointer to a matMul_DW_args structure (please refer to pulp_train_utils_fp32.h)
Expand All @@ -44,6 +52,14 @@ void dw_kernel_input_grad(
void * matMul_DW_args
);

/**
* @brief Input gradient kernel for Depthwise Convolution with padding and arbitrary stride.
*        In pulp_conv_dw_fp32_bw_input_grads_cl this kernel is launched through
*        pi_cl_team_fork(NUM_CORES, ...) when any of Lpad/Rpad/Upad/Dpad is non-zero or
*        stride_h/stride_w differs from 1; otherwise dw_kernel_input_grad is launched instead.
* @param matMul_DW_args pointer to a kernel_DW_args structure, passed type-erased as void *.
*        NOTE(review): the neighbouring declarations describe this argument as a
*        "matMul_DW_args structure" - confirm which struct name is the correct one.
*/
void dw_kernel_input_grad_padded(
void * matMul_DW_args
);


/** CONV2D KERNELS **/

Expand Down
177 changes: 122 additions & 55 deletions lib/sources/pulp_conv2d_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
im2col_args.c = C2D_args->coeff;
im2col_args.output = C2D_args->output;
im2col_args.pBuffer = i2c_buffer;
im2col_args.Lpad = 0; //Lpad;
im2col_args.Rpad = 0; //Rpad;
im2col_args.Upad = 0; //Upad;
im2col_args.Dpad = 0; //Dpad;
im2col_args.Lpad = Lpad;
im2col_args.Rpad = Rpad;
im2col_args.Upad = Upad;
im2col_args.Dpad = Dpad;
im2col_args.mod = 0;
im2col_args.stride_w = stride_w;
im2col_args.stride_h = stride_h;
Expand Down Expand Up @@ -538,7 +538,7 @@ void pulp_conv2d_fp32_bw_input_grads_cl(void *Conv2D_args) {
pi_cl_team_fork(NUM_CORES, pulp_blocktransp_fp32, &bt_args);

#ifndef OPTIMIZE
pi_cl_team_fork(NUM_CORES, mm, &matMul_args);
pi_cl_team_fork(NUM_CORES, mm_add, &matMul_args);
#else
struct mm_manager_args man_args;
man_args.mm_args = &matMul_args;
Expand Down Expand Up @@ -743,70 +743,137 @@ void im2col_conv2d_fw_kernel(void *void_args) {
}


// void im2col_conv2d_param_grad_kernel(void *void_args) {
// struct mm_manager_args *man_args = (struct mm_manager_args *) void_args;
// struct matMul_args *args = man_args->mm_args;

// float *__restrict__ inData = args->A;
// float *__restrict__ coeffDiff = args->B;
// float *__restrict__ outDiff = args->C;

// float *__restrict__ biasDiff = args->bias;
// const uint32_t USE_BIASES = args->USE_BIASES;

// const uint32_t H_in = args->H;
// const uint32_t W_in = args->W;
// const uint32_t pW = args->pW;
// const uint32_t pH = args->pH;
// const uint32_t C_in = args->pCin;
// const uint32_t C_out = args->N;

// uint32_t h_str = args->stride_h;
// uint32_t w_str = args->stride_w;
// uint32_t Lpad = args->Lpad;
// uint32_t Rpad = args->Rpad;
// uint32_t Upad = args->Upad;
// uint32_t Dpad = args->Dpad;

// const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
// const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;

// const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
// const uint32_t start = pi_core_id() * blockSize;
// const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;

// const uint32_t HWC = args->HWC;

// int padding = Lpad + Rpad + Upad + Dpad;

// // Perform simple matrix multiplication
// #ifndef OPTIMIZE
// mm(args);
// #else
// mm_manager(man_args);
// #endif

// // Handle biases
// if (USE_BIASES == 1) {
// for (uint32_t co = start; co < stop; co++) {
// float temp = 0;
// for (uint32_t ho = 0; ho < pH; ho++) {
// for (uint32_t wo = 0; wo < pW; wo++) {
// temp += inData[wo + ho * pW + co * pH * pW];
// }
// }
// biasDiff[co] = temp;
// }
// }

// if (HWC != 0 && HWC != 1) {
// // Unsupported layout
// printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n",
// HWC);
// }

// if (USE_BIASES != 0 && USE_BIASES != 1) {
// printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n",
// USE_BIASES);
// }
// }

/**
 * @brief Parameter-gradient kernel for the im2col-based Conv2D path.
 *        Receives a mm_manager_args pointer (type-erased), takes its mm_args
 *        (struct matMul_args) and runs a matrix multiplication on it, followed by
 *        bias-related handling.
 * @param void_args pointer to a struct mm_manager_args
 *
 * NOTE(review): as listed here, this body interleaves pre-change and post-change lines
 * of a diff. H_in, W_in, pW, pH, C_in, C_out, h_str, w_str, Lpad, Rpad, Upad, Dpad,
 * start, stop and HWC are each declared twice, and both mm()/mm_manager() and mm_add()
 * are invoked. It will not compile in this form - reconcile against the real file
 * before relying on any comment below.
 */
void im2col_conv2d_param_grad_kernel(void *void_args) {
struct mm_manager_args *man_args = (struct mm_manager_args *) void_args;
struct matMul_args *args = man_args->mm_args;

// First set of aliases for the A/B/C matrices of the matMul_args
float *__restrict__ inData = args->A;
float *__restrict__ coeffDiff = args->B;
float *__restrict__ outDiff = args->C;
// Id of the calling core, used below for work partitioning and to print only once
const int core = pi_core_id();

// Second set of aliases for the same args->A / args->B / args->C pointers
float *__restrict__ A_dY = args->A; // outDiff in PGW caller (dy tile), shape N x K
float *__restrict__ B_i2r = args->B; // im2row(X), shape K x M (or transposed depending)
float *__restrict__ C_dW = args->C; // coeffDiff (dW), shape N x M

float *__restrict__ biasDiff = args->bias;
const uint32_t USE_BIASES = args->USE_BIASES;

// First declaration of the geometry fields read from args
const uint32_t H_in = args->H;
const uint32_t W_in = args->W;
const uint32_t pW = args->pW;
const uint32_t pH = args->pH;
const uint32_t C_in = args->pCin;
const uint32_t C_out = args->N;

uint32_t h_str = args->stride_h;
uint32_t w_str = args->stride_w;
uint32_t Lpad = args->Lpad;
uint32_t Rpad = args->Rpad;
uint32_t Upad = args->Upad;
uint32_t Dpad = args->Dpad;

// Unguarded output-size computation: unsigned arithmetic, divides by the strides
const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;
// These may be uninitialized in your current library path
// NOTE(review): second declaration of the same names as the block above
const uint32_t H_in = (uint32_t)args->H;
const uint32_t W_in = (uint32_t)args->W;
const uint32_t pW = (uint32_t)args->pW;
const uint32_t pH = (uint32_t)args->pH;
const uint32_t C_in = (uint32_t)args->pCin;
const uint32_t C_out = (uint32_t)args->N;

const uint32_t h_str = (uint32_t)args->stride_h;
const uint32_t w_str = (uint32_t)args->stride_w;
const uint32_t Lpad = (uint32_t)args->Lpad;
const uint32_t Rpad = (uint32_t)args->Rpad;
const uint32_t Upad = (uint32_t)args->Upad;
const uint32_t Dpad = (uint32_t)args->Dpad;

const uint32_t K = (uint32_t)args->K;
const uint32_t M = (uint32_t)args->M;
const uint32_t trans_B = (uint32_t)args->trans_B;
const uint32_t HWC = (uint32_t)args->HWC;

// Compute derived (may be nonsense if fields are nonsense)
// Both values stay 0 when a stride is 0 or the padded input is smaller than the kernel
uint32_t H_out_derived = 0, W_out_derived = 0;
if (h_str != 0 && w_str != 0) {
// guard underflow
if (H_in + Upad + Dpad >= pH && W_in + Lpad + Rpad >= pW) {
H_out_derived = (H_in - pH + Upad + Dpad) / h_str + 1;
W_out_derived = (W_in - pW + Lpad + Rpad) / w_str + 1;
}
}

// block partition
// Ceil-divide C_out over NUM_CORES; each core gets the range [start, stop), clamped to C_out
const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
const uint32_t start = pi_core_id() * blockSize;
const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;

const uint32_t HWC = args->HWC;
const uint32_t start = core * blockSize;
const uint32_t stop = (start + blockSize > C_out) ? C_out : (start + blockSize);

int padding = Lpad + Rpad + Upad + Dpad;

// Perform simple matrix multiplication
// --- Compute GEMM-add ---
// NOTE(review): each preprocessor branch below lists two calls (one plain, one mm_add);
// presumably only one per branch belongs to the final code - confirm.
#ifndef OPTIMIZE
mm(args);
mm_add(args); // NOTE: for param-grad you want add
#else
mm_manager(man_args);
mm_add(args); // mm_manager may call optimized; keep mm_add for debug stability
#endif

// Handle biases
// For each channel in [start, stop), sums pH * pW consecutive inData values
// starting at co * pH * pW and stores (overwrites) the result in biasDiff[co]
if (USE_BIASES == 1) {
for (uint32_t co = start; co < stop; co++) {
float temp = 0;
for (uint32_t ho = 0; ho < pH; ho++) {
for (uint32_t wo = 0; wo < pW; wo++) {
temp += inData[wo + ho * pW + co * pH * pW];
}
}
biasDiff[co] = temp;
}
}

if (HWC != 0 && HWC != 1) {
// Unsupported layout
printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n",
HWC);
}

if (USE_BIASES != 0 && USE_BIASES != 1) {
printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n",
USE_BIASES);
// Bias grad (if enabled) — keep as-is, but make it safe if pH/pW are tile size
// NOTE(review): as nested here, this condition sits inside the
// (USE_BIASES != 0 && USE_BIASES != 1) branch and so can never be true -
// likely an artifact of the interleaved diff lines.
if (USE_BIASES == 1 && biasDiff != NULL) {
// NOTE: original code sums inData as if it's [C_out, pH, pW] which is NOT true for dY in CHW param-grad.
// Leaving it unchanged can be wrong; for debug only, print warning.
if (core == 0) {
printf(" [WARN] USE_BIASES path in param-grad kernel may be incorrect for this layout. Currently skipping bias update.\n");
}
// If you actually need bias grad: biasDiff[co] += sum_{k} A_dY[co*K + k] for CHW
}
}
}
30 changes: 28 additions & 2 deletions lib/sources/pulp_conv_dw_fp32.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,20 @@ void pulp_conv_dw_fp32_bw_param_grads_cl(void *DepthWise_Conv_args) {
ker_args.weights = DW_args->coeff;
ker_args.output = DW_args->output;

pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad, &ker_args);
ker_args.stride_h = DW_args->stride_h;
ker_args.stride_w = DW_args->stride_w;

ker_args.Lpad = DW_args->Lpad;
ker_args.Rpad = DW_args->Rpad;
ker_args.Upad = DW_args->Upad;
ker_args.Dpad = DW_args->Dpad;

if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad ||
ker_args.stride_h != 1 || ker_args.stride_w != 1) {
pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad_padded, &ker_args);
} else {
pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad, &ker_args);
}
}


Expand All @@ -82,5 +95,18 @@ void pulp_conv_dw_fp32_bw_input_grads_cl(void *DepthWise_Conv_args) {
ker_args.weights = DW_args->coeff;
ker_args.output = DW_args->output;

pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args);
ker_args.stride_h = DW_args->stride_h;
ker_args.stride_w = DW_args->stride_w;

ker_args.Lpad = DW_args->Lpad;
ker_args.Rpad = DW_args->Rpad;
ker_args.Upad = DW_args->Upad;
ker_args.Dpad = DW_args->Dpad;

if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad ||
ker_args.stride_h != 1 || ker_args.stride_w != 1) {
pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad_padded, &ker_args);
} else {
pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args);
}
}
Loading