From 36ceb98144257d3fe26e86d68ef77160d9c73c90 Mon Sep 17 00:00:00 2001 From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com> Date: Mon, 12 Jan 2026 12:15:08 +0000 Subject: [PATCH 1/2] Changes for tiling --- lib/sources/pulp_conv2d_fp32.c | 169 ++++++++++++++++++++--------- lib/sources/pulp_conv_naive_fp32.c | 6 +- lib/sources/pulp_conv_pw_fp32.c | 2 +- 3 files changed, 123 insertions(+), 54 deletions(-) diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c index 339e3808..c351a576 100644 --- a/lib/sources/pulp_conv2d_fp32.c +++ b/lib/sources/pulp_conv2d_fp32.c @@ -538,7 +538,7 @@ void pulp_conv2d_fp32_bw_input_grads_cl(void *Conv2D_args) { pi_cl_team_fork(NUM_CORES, pulp_blocktransp_fp32, &bt_args); #ifndef OPTIMIZE - pi_cl_team_fork(NUM_CORES, mm, &matMul_args); + pi_cl_team_fork(NUM_CORES, mm_add, &matMul_args); #else struct mm_manager_args man_args; man_args.mm_args = &matMul_args; @@ -743,70 +743,137 @@ void im2col_conv2d_fw_kernel(void *void_args) { } +// void im2col_conv2d_param_grad_kernel(void *void_args) { +// struct mm_manager_args *man_args = (struct mm_manager_args *) void_args; +// struct matMul_args *args = man_args->mm_args; + +// float *__restrict__ inData = args->A; +// float *__restrict__ coeffDiff = args->B; +// float *__restrict__ outDiff = args->C; + +// float *__restrict__ biasDiff = args->bias; +// const uint32_t USE_BIASES = args->USE_BIASES; + +// const uint32_t H_in = args->H; +// const uint32_t W_in = args->W; +// const uint32_t pW = args->pW; +// const uint32_t pH = args->pH; +// const uint32_t C_in = args->pCin; +// const uint32_t C_out = args->N; + +// uint32_t h_str = args->stride_h; +// uint32_t w_str = args->stride_w; +// uint32_t Lpad = args->Lpad; +// uint32_t Rpad = args->Rpad; +// uint32_t Upad = args->Upad; +// uint32_t Dpad = args->Dpad; + +// const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1; +// const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1; + +// const uint32_t 
blockSize = (C_out + NUM_CORES - 1) / NUM_CORES; +// const uint32_t start = pi_core_id() * blockSize; +// const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize; + +// const uint32_t HWC = args->HWC; + +// int padding = Lpad + Rpad + Upad + Dpad; + +// // Perform simple matrix multiplication +// #ifndef OPTIMIZE +// mm(args); +// #else +// mm_manager(man_args); +// #endif + +// // Handle biases +// if (USE_BIASES == 1) { +// for (uint32_t co = start; co < stop; co++) { +// float temp = 0; +// for (uint32_t ho = 0; ho < pH; ho++) { +// for (uint32_t wo = 0; wo < pW; wo++) { +// temp += inData[wo + ho * pW + co * pH * pW]; +// } +// } +// biasDiff[co] = temp; +// } +// } + +// if (HWC != 0 && HWC != 1) { +// // Unsupported layout +// printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n", +// HWC); +// } + +// if (USE_BIASES != 0 && USE_BIASES != 1) { +// printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. 
Biases not used, even if provided!\n", +// USE_BIASES); +// } +// } + void im2col_conv2d_param_grad_kernel(void *void_args) { struct mm_manager_args *man_args = (struct mm_manager_args *) void_args; struct matMul_args *args = man_args->mm_args; - float *__restrict__ inData = args->A; - float *__restrict__ coeffDiff = args->B; - float *__restrict__ outDiff = args->C; + const int core = pi_core_id(); + + float *__restrict__ A_dY = args->A; // outDiff in PGW caller (dy tile), shape N x K + float *__restrict__ B_i2r = args->B; // im2row(X), shape K x M (or transposed depending) + float *__restrict__ C_dW = args->C; // coeffDiff (dW), shape N x M float *__restrict__ biasDiff = args->bias; const uint32_t USE_BIASES = args->USE_BIASES; - const uint32_t H_in = args->H; - const uint32_t W_in = args->W; - const uint32_t pW = args->pW; - const uint32_t pH = args->pH; - const uint32_t C_in = args->pCin; - const uint32_t C_out = args->N; - - uint32_t h_str = args->stride_h; - uint32_t w_str = args->stride_w; - uint32_t Lpad = args->Lpad; - uint32_t Rpad = args->Rpad; - uint32_t Upad = args->Upad; - uint32_t Dpad = args->Dpad; - - const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1; - const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1; + // These may be uninitialized in your current library path + const uint32_t H_in = (uint32_t)args->H; + const uint32_t W_in = (uint32_t)args->W; + const uint32_t pW = (uint32_t)args->pW; + const uint32_t pH = (uint32_t)args->pH; + const uint32_t C_in = (uint32_t)args->pCin; + const uint32_t C_out = (uint32_t)args->N; + + const uint32_t h_str = (uint32_t)args->stride_h; + const uint32_t w_str = (uint32_t)args->stride_w; + const uint32_t Lpad = (uint32_t)args->Lpad; + const uint32_t Rpad = (uint32_t)args->Rpad; + const uint32_t Upad = (uint32_t)args->Upad; + const uint32_t Dpad = (uint32_t)args->Dpad; + + const uint32_t K = (uint32_t)args->K; + const uint32_t M = (uint32_t)args->M; + const uint32_t trans_B = 
(uint32_t)args->trans_B; + const uint32_t HWC = (uint32_t)args->HWC; + + // Compute derived (may be nonsense if fields are nonsense) + uint32_t H_out_derived = 0, W_out_derived = 0; + if (h_str != 0 && w_str != 0) { + // guard underflow + if (H_in + Upad + Dpad >= pH && W_in + Lpad + Rpad >= pW) { + H_out_derived = (H_in - pH + Upad + Dpad) / h_str + 1; + W_out_derived = (W_in - pW + Lpad + Rpad) / w_str + 1; + } + } + // block partition const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES; - const uint32_t start = pi_core_id() * blockSize; - const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize; - - const uint32_t HWC = args->HWC; - - int padding = Lpad + Rpad + Upad + Dpad; + const uint32_t start = core * blockSize; + const uint32_t stop = (start + blockSize > C_out) ? C_out : (start + blockSize); - // Perform simple matrix multiplication + // --- Compute GEMM-add --- #ifndef OPTIMIZE - mm(args); + mm_add(args); // NOTE: for param-grad you want add #else - mm_manager(man_args); + mm_add(args); // mm_manager may call optimized; keep mm_add for debug stability #endif - // Handle biases - if (USE_BIASES == 1) { - for (uint32_t co = start; co < stop; co++) { - float temp = 0; - for (uint32_t ho = 0; ho < pH; ho++) { - for (uint32_t wo = 0; wo < pW; wo++) { - temp += inData[wo + ho * pW + co * pH * pW]; - } - } - biasDiff[co] = temp; - } - } - - if (HWC != 0 && HWC != 1) { - // Unsupported layout - printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n", - HWC); - } - if (USE_BIASES != 0 && USE_BIASES != 1) { - printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. 
Biases not used, even if provided!\n", - USE_BIASES); + // Bias grad (if enabled) — keep as-is, but make it safe if pH/pW are tile size + if (USE_BIASES == 1 && biasDiff != NULL) { + // NOTE: original code sums inData as if it's [C_out, pH, pW] which is NOT true for dY in CHW param-grad. + // Leaving it unchanged can be wrong; for debug only, print warning. + if (core == 0) { + printf(" [WARN] USE_BIASES path in param-grad kernel may be incorrect for this layout. Currently skipping bias update.\n"); + } + // If you actually need bias grad: biasDiff[co] += sum_{k} A_dY[co*K + k] for CHW } -} +} \ No newline at end of file diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c index 9930bb16..7f900b29 100644 --- a/lib/sources/pulp_conv_naive_fp32.c +++ b/lib/sources/pulp_conv_naive_fp32.c @@ -129,9 +129,11 @@ void dw_kernel_weight_grad(void *kernel_DW_args) { uint32_t start = pi_core_id() * blockSize; uint32_t stop = start + blockSize > C_in ? C_in : start + blockSize; - for (int ch = 0; ch < C_in; ch++) { + for (int ch = start; ch < stop; ch++) { for (int hk = 0; hk < pH; hk++) { for (int wk = 0; wk < pW; wk++) { + int idx = wk + hk * pW + ch * pH * pW; + float old_val = coeffDiff[idx]; float temp = 0; for (int ho = 0; ho < H_out; ho++) { @@ -142,7 +144,7 @@ void dw_kernel_weight_grad(void *kernel_DW_args) { } } - coeffDiff[wk + hk * pW + ch * pH * pW] = temp; + coeffDiff[idx] += temp; } } } diff --git a/lib/sources/pulp_conv_pw_fp32.c b/lib/sources/pulp_conv_pw_fp32.c index 8bf5f072..329c8707 100644 --- a/lib/sources/pulp_conv_pw_fp32.c +++ b/lib/sources/pulp_conv_pw_fp32.c @@ -165,7 +165,7 @@ void pulp_conv_pw_fp32_bw_param_grads_cl(void *PointWise_Conv_args) { matMul_args.trans_B = 1; #ifndef OPTIMIZE - pi_cl_team_fork(NUM_CORES, mm, &matMul_args); + pi_cl_team_fork(NUM_CORES, mm_add, &matMul_args); #else struct mm_manager_args man_args; man_args.mm_args = &matMul_args; From 37f70e5d3ca1757dff6fed32980e938802a4f20a Mon Sep 17 
00:00:00 2001 From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:30:31 +0000 Subject: [PATCH 2/2] Add padding and arbitrary stride support for ConvGrad operators - pulp_conv_naive_fp32.c/h: add dw_kernel_weight_grad_padded and dw_kernel_input_grad_padded for DW ConvGrad with non-zero padding or stride != 1. Use precomputed valid-range loops (ho_min/ho_max, wo_min/wo_max) instead of inner-loop conditionals to avoid a GCC -O3 -ffast-math miscompile on RISC-V. - pulp_conv_dw_fp32.c: dispatch to padded kernels in both pulp_conv_dw_fp32_bw_param_grads_cl and pulp_conv_dw_fp32_bw_input_grads_cl when padding or stride != 1; forward stride/padding fields that were previously missing. - pulp_im2col_fp32.c: remove overly-strict validity check that returned early (leaving the buffer uninitialized) whenever (Hin - Hk + pad) was not divisible by stride. - pulp_conv2d_fp32.c: pass actual Lpad/Rpad/Upad/Dpad to im2col in pulp_conv2d_fp32_bw_param_grads_cl (previously hard-coded to 0). --- lib/include/pulp_conv_naive_fp32.h | 16 ++++ lib/sources/pulp_conv2d_fp32.c | 8 +- lib/sources/pulp_conv_dw_fp32.c | 30 ++++++- lib/sources/pulp_conv_naive_fp32.c | 130 ++++++++++++++++++++++++++++- lib/sources/pulp_im2col_fp32.c | 16 ++-- 5 files changed, 184 insertions(+), 16 deletions(-) diff --git a/lib/include/pulp_conv_naive_fp32.h b/lib/include/pulp_conv_naive_fp32.h index beb5c86d..61c7b9c3 100644 --- a/lib/include/pulp_conv_naive_fp32.h +++ b/lib/include/pulp_conv_naive_fp32.h @@ -36,6 +36,14 @@ void dw_kernel_weight_grad( void * matMul_DW_args ); +/** + * @brief Weight gradient kernel for Depthwise Convolution with padding and arbitrary stride. + * @param matMul_DW_args pointer to a kernel_DW_args structure +*/ +void dw_kernel_weight_grad_padded( + void * matMul_DW_args +); + /** * @brief Naive core kernel for Depthwise Convolution (input gradient). Parallelizes on the channels. 
* @param matMul_DW_args pointer to a matMul_DW_args structure (please refer to pulp_train_utils_fp32.h) @@ -44,6 +52,14 @@ void dw_kernel_input_grad( void * matMul_DW_args ); +/** + * @brief Input gradient kernel for Depthwise Convolution with padding and arbitrary stride. + * @param matMul_DW_args pointer to a kernel_DW_args structure +*/ +void dw_kernel_input_grad_padded( + void * matMul_DW_args +); + /** CONV2D KERNELS **/ diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c index c351a576..52eb4143 100644 --- a/lib/sources/pulp_conv2d_fp32.c +++ b/lib/sources/pulp_conv2d_fp32.c @@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) { im2col_args.c = C2D_args->coeff; im2col_args.output = C2D_args->output; im2col_args.pBuffer = i2c_buffer; - im2col_args.Lpad = 0; //Lpad; - im2col_args.Rpad = 0; //Rpad; - im2col_args.Upad = 0; //Upad; - im2col_args.Dpad = 0; //Dpad; + im2col_args.Lpad = Lpad; + im2col_args.Rpad = Rpad; + im2col_args.Upad = Upad; + im2col_args.Dpad = Dpad; im2col_args.mod = 0; im2col_args.stride_w = stride_w; im2col_args.stride_h = stride_h; diff --git a/lib/sources/pulp_conv_dw_fp32.c b/lib/sources/pulp_conv_dw_fp32.c index 06937e95..f93071f7 100644 --- a/lib/sources/pulp_conv_dw_fp32.c +++ b/lib/sources/pulp_conv_dw_fp32.c @@ -70,7 +70,20 @@ void pulp_conv_dw_fp32_bw_param_grads_cl(void *DepthWise_Conv_args) { ker_args.weights = DW_args->coeff; ker_args.output = DW_args->output; - pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad, &ker_args); + ker_args.stride_h = DW_args->stride_h; + ker_args.stride_w = DW_args->stride_w; + + ker_args.Lpad = DW_args->Lpad; + ker_args.Rpad = DW_args->Rpad; + ker_args.Upad = DW_args->Upad; + ker_args.Dpad = DW_args->Dpad; + + if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad || + ker_args.stride_h != 1 || ker_args.stride_w != 1) { + pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad_padded, &ker_args); + } else { + pi_cl_team_fork(NUM_CORES, 
dw_kernel_weight_grad, &ker_args); + } } @@ -82,5 +95,18 @@ void pulp_conv_dw_fp32_bw_input_grads_cl(void *DepthWise_Conv_args) { ker_args.weights = DW_args->coeff; ker_args.output = DW_args->output; - pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args); + ker_args.stride_h = DW_args->stride_h; + ker_args.stride_w = DW_args->stride_w; + + ker_args.Lpad = DW_args->Lpad; + ker_args.Rpad = DW_args->Rpad; + ker_args.Upad = DW_args->Upad; + ker_args.Dpad = DW_args->Dpad; + + if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad || + ker_args.stride_h != 1 || ker_args.stride_w != 1) { + pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad_padded, &ker_args); + } else { + pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args); + } } diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c index 7f900b29..9c72bd4d 100644 --- a/lib/sources/pulp_conv_naive_fp32.c +++ b/lib/sources/pulp_conv_naive_fp32.c @@ -109,7 +109,7 @@ void dw_kernel_forward(void *kernel_DW_args) { } -// Naive weight grad kernel for DepthWise Convolution +// Naive weight grad kernel for DepthWise Convolution (stride=1, no padding) void dw_kernel_weight_grad(void *kernel_DW_args) { struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args; @@ -151,6 +151,68 @@ void dw_kernel_weight_grad(void *kernel_DW_args) { } +// Weight grad kernel for DepthWise Convolution with padding and arbitrary stride +void dw_kernel_weight_grad_padded(void *kernel_DW_args) { + struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args; + + float *inData = args->input->data; + float *coeffDiff = args->weights->diff; + float *outDiff = args->output->diff; + + int C_in = (int) args->input->C; + int H_in = (int) args->input->H; + int W_in = (int) args->input->W; + int pH = (int) args->weights->H; + int pW = (int) args->weights->W; + int H_out = (int) args->output->H; + int W_out = (int) args->output->W; + + int Upad = args->Upad; + int Lpad = args->Lpad; + 
int stride_h = args->stride_h; + int stride_w = args->stride_w; + + int blockSize = (C_in + NUM_CORES - 1) / NUM_CORES; + int start = pi_core_id() * blockSize; + int stop = start + blockSize > C_in ? C_in : start + blockSize; + + /* Precompute valid output-row range for each kernel row (ho_min/ho_max per hk). */ + for (int ch = start; ch < stop; ch++) { + int ch_in_off = ch * H_in * W_in; + int ch_out_off = ch * H_out * W_out; + for (int hk = 0; hk < pH; hk++) { + /* h_in = ho*stride_h + hk - Upad must be in [0, H_in) */ + int ho_min = (Upad - hk + stride_h - 1) / stride_h; + if (ho_min < 0) ho_min = 0; + int ho_max = (H_in - 1 + Upad - hk) / stride_h + 1; + if (ho_max > H_out) ho_max = H_out; + + for (int wk = 0; wk < pW; wk++) { + /* w_in = wo*stride_w + wk - Lpad must be in [0, W_in) */ + int wo_min = (Lpad - wk + stride_w - 1) / stride_w; + if (wo_min < 0) wo_min = 0; + int wo_max = (W_in - 1 + Lpad - wk) / stride_w + 1; + if (wo_max > W_out) wo_max = W_out; + + int idx = wk + hk * pW + ch * pH * pW; + float temp = 0; + + for (int ho = ho_min; ho < ho_max; ho++) { + int h_in = ho * stride_h + hk - Upad; + for (int wo = wo_min; wo < wo_max; wo++) { + int w_in = wo * stride_w + wk - Lpad; + temp += inData[w_in + h_in * W_in + ch_in_off] * + outDiff[wo + ho * W_out + ch_out_off]; + } + } + + coeffDiff[idx] += temp; + } + } + } +} + + // Naive input grad kernel for DepthWise Convolution void dw_kernel_input_grad(void *kernel_DW_args) { struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args; @@ -196,6 +258,72 @@ void dw_kernel_input_grad(void *kernel_DW_args) { } +// Input grad kernel for DepthWise Convolution with padding and arbitrary stride +void dw_kernel_input_grad_padded(void *kernel_DW_args) { + struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args; + + float *inDiff = args->input->diff; + float *coeffData = args->weights->data; + float *outDiff = args->output->diff; + + int C_in = (int) args->input->C; + int H_in = (int) 
args->input->H; + int W_in = (int) args->input->W; + int pH = (int) args->weights->H; + int pW = (int) args->weights->W; + int H_out = (int) args->output->H; + int W_out = (int) args->output->W; + int Upad = args->Upad; + int Lpad = args->Lpad; + int stride_h = args->stride_h; + int stride_w = args->stride_w; + + int blockSize = (C_in + NUM_CORES - 1) / NUM_CORES; + int start = pi_core_id() * blockSize; + int stop = start + blockSize > C_in ? C_in : start + blockSize; + + /* For each input position (ch, hin, win): + * dX[ch,hin,win] = sum_{valid ho,wo} dY[ch,ho,wo] * W[ch, hin+Upad-ho*sh, win+Lpad-wo*sw] + * + * Precompute valid ho/wo ranges to avoid branch-heavy innermost loops + * (branches in innermost loops miscompile under GCC -O3 -ffast-math on RISC-V). + */ + for (int ch = start; ch < stop; ch++) { + int ch_in_off = ch * H_in * W_in; + int ch_out_off = ch * H_out * W_out; + int ch_w_off = ch * pH * pW; + + for (int hin = 0; hin < H_in; hin++) { + /* ho range: hk = hin+Upad - ho*sh must be in [0, pH) */ + int a_h = hin + Upad - pH + 1; + int ho_min = (a_h <= 0) ? 0 : (a_h + stride_h - 1) / stride_h; + int ho_max = (hin + Upad) / stride_h + 1; + if (ho_max > H_out) ho_max = H_out; + + for (int win = 0; win < W_in; win++) { + /* wo range: wk = win+Lpad - wo*sw must be in [0, pW) */ + int a_w = win + Lpad - pW + 1; + int wo_min = (a_w <= 0) ? 
0 : (a_w + stride_w - 1) / stride_w; + int wo_max = (win + Lpad) / stride_w + 1; + if (wo_max > W_out) wo_max = W_out; + + float temp = 0; + for (int ho = ho_min; ho < ho_max; ho++) { + int hk = hin + Upad - ho * stride_h; + int out_row = ho * W_out + ch_out_off; + int w_row = hk * pW + ch_w_off; + for (int wo = wo_min; wo < wo_max; wo++) { + int wk = win + Lpad - wo * stride_w; + temp += coeffData[wk + w_row] * outDiff[wo + out_row]; + } + } + inDiff[win + hin * W_in + ch_in_off] = temp; + } + } + } +} + + /** CONV2D KERNELS **/ void naive_conv2d_fw_kernel_CHW(void *matMul_args) { struct matMul_args *args = (struct matMul_args *) matMul_args; diff --git a/lib/sources/pulp_im2col_fp32.c b/lib/sources/pulp_im2col_fp32.c index e5fb5957..44c0bd80 100644 --- a/lib/sources/pulp_im2col_fp32.c +++ b/lib/sources/pulp_im2col_fp32.c @@ -122,10 +122,8 @@ void pulp_im2row_fp32(void * im2col_args){ // FORWARD & WEIGHT GRAD if (mod==0) { - if ((Hin-Hk+Upad+Dpad+Hstr) % Hstr > 0) {printf("\n[pulp_im2col_fp32] Invalid H stride (non multiple H sizes): have H_in=%d, H_ker=%d, U_pad=%d, D_pad=%d, H_stride=%d, remainder=%d", Hin, Hk, Upad, Dpad, Hstr, (Hin-Hk+Upad+Dpad+Hstr) % Hstr); return;} - else Htot = (Hin-Hk+Upad+Dpad+Hstr)/Hstr; - if ((Win-Wk+Lpad+Rpad+Wstr) % Wstr > 0) {printf("\n[pulp_im2col_fp32] Invalid W stride (non multiple W sizes): have W_in=%d, W_ker=%d, L_pad=%d, R_pad=%d, W_stride=%d, remainder=%d", Win, Wk, Lpad, Rpad, Wstr, (Win-Wk+Lpad+Rpad+Wstr) % Wstr); return;} - else Wtot = (Win-Wk+Lpad+Rpad+Wstr)/Wstr; + Htot = (Hin-Hk+Upad+Dpad+Hstr)/Hstr; + Wtot = (Win-Wk+Lpad+Rpad+Wstr)/Wstr; uint32_t padding = Lpad + Rpad + Upad + Dpad; @@ -172,7 +170,7 @@ void pulp_im2row_fp32(void * im2col_args){ uint32_t w_pad_cond = wk + wo*Wstr; uint32_t h_pad_cond = hk + ho*Hstr; - if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>Ho+(Hk)-Dpad) || (w_pad_cond>Wo+(Wk)-Rpad))) { + if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>=(Hin+Upad)) || (w_pad_cond>=(Win+Lpad)))) { // Padding 
i2c_buf[kernel_idx+segment_idx+i2c_inner_idx] = 0; //printf("(pad) i2c_buf[%d]=%f kernel_idx=%d, segment_idx=%d, ho=%d\n", kernel_idx+segment_idx, i2c_buf[kernel_idx+segment_idx], kernel_idx, segment_idx, ho); @@ -286,10 +284,10 @@ void pulp_im2row_fp32(void * im2col_args){ for (uint32_t ho=0; ho0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>Ho+(Hk)-Dpad) || (w_pad_cond>Wo+(Wk)-Rpad))) { + if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>=(Hin+Upad)) || (w_pad_cond>=(Win+Lpad)))) { // Padding i2c_buf[kernel_idx+segment_idx+i2c_inner_idx] = 0; //printf("(pad) i2c_buf[%d]=%f kernel_idx=%d, segment_idx=%d, ho=%d\n", kernel_idx+segment_idx, i2c_buf[kernel_idx+segment_idx], kernel_idx, segment_idx, ho);