From 36ceb98144257d3fe26e86d68ef77160d9c73c90 Mon Sep 17 00:00:00 2001
From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com>
Date: Mon, 12 Jan 2026 12:15:08 +0000
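Subject: [PATCH 1/2] Changes for tiling

Accumulate gradients instead of overwriting them, so that results
computed tile by tile add up:

- pulp_conv2d_fp32.c: call mm_add instead of mm in
  pulp_conv2d_fp32_bw_input_grads_cl (non-OPTIMIZE build) and in
  im2col_conv2d_param_grad_kernel.

- pulp_conv_pw_fp32.c: call mm_add instead of mm in
  pulp_conv_pw_fp32_bw_param_grads_cl (non-OPTIMIZE build).

- pulp_conv_naive_fp32.c: dw_kernel_weight_grad adds to coeffDiff and
  loops only over the calling core's channel block (start..stop).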
---
 lib/sources/pulp_conv2d_fp32.c     | 169 ++++++++++++++++++++---------
 lib/sources/pulp_conv_naive_fp32.c |   6 +-
 lib/sources/pulp_conv_pw_fp32.c    |   2 +-
 3 files changed, 123 insertions(+), 54 deletions(-)

diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c
index 339e3808..c351a576 100644
--- a/lib/sources/pulp_conv2d_fp32.c
+++ b/lib/sources/pulp_conv2d_fp32.c
@@ -538,7 +538,7 @@ void pulp_conv2d_fp32_bw_input_grads_cl(void *Conv2D_args) {
             pi_cl_team_fork(NUM_CORES, pulp_blocktransp_fp32, &bt_args);
 
 #ifndef OPTIMIZE
-            pi_cl_team_fork(NUM_CORES, mm, &matMul_args);
+            pi_cl_team_fork(NUM_CORES, mm_add, &matMul_args);
 #else
             struct mm_manager_args man_args;
             man_args.mm_args = &matMul_args;
@@ -743,70 +743,56 @@ void im2col_conv2d_fw_kernel(void *void_args) {
 }
 
 
+/**
+ * @brief Weight-gradient kernel of the im2col Conv2D backward step, run by every cluster core.
+ *
+ * Calls mm_add() on the wrapped matMul_args, so the GEMM result is accumulated into C and
+ * partial gradients computed tile by tile add up. mm_add() is used in every build
+ * configuration (the OPTIMIZE build no longer goes through mm_manager()).
+ * If USE_BIASES == 1 and a bias buffer is provided, the bias gradient of the calling
+ * core's output channels is accumulated too: for the CHW layout (HWC == 0) it is the sum
+ * of each K-long row of A. Any other layout prints a warning and leaves bias untouched.
+ * NOTE(review): assumes A holds the output gradient as N x K row-major in CHW and that the
+ * caller zeroes C and bias before the first tile - confirm against the callers.
+ *
+ * @param void_args  pointer to a mm_manager_args structure wrapping the matMul_args
+ */
 void im2col_conv2d_param_grad_kernel(void *void_args) {
     struct mm_manager_args *man_args = (struct mm_manager_args *) void_args;
     struct matMul_args *args = man_args->mm_args;
 
-    float *__restrict__ inData = args->A;
-    float *__restrict__ coeffDiff = args->B;
-    float *__restrict__ outDiff = args->C;
+    // GEMM operand A (output-gradient tile); only read here, for the bias reduction
+    const float *__restrict__ dY = args->A;
 
     float *__restrict__ biasDiff = args->bias;
     const uint32_t USE_BIASES = args->USE_BIASES;
 
-    const uint32_t H_in = args->H;
-    const uint32_t W_in = args->W;
-    const uint32_t pW = args->pW;
-    const uint32_t pH = args->pH;
-    const uint32_t C_in = args->pCin;
     const uint32_t C_out = args->N;
-
-    uint32_t h_str = args->stride_h;
-    uint32_t w_str = args->stride_w;
-    uint32_t Lpad = args->Lpad;
-    uint32_t Rpad = args->Rpad;
-    uint32_t Upad = args->Upad;
-    uint32_t Dpad = args->Dpad;
-
-    const uint32_t H_out = (H_in - pH + Upad + Dpad) / h_str + 1;
-    const uint32_t W_out = (W_in - pW + Lpad + Rpad) / w_str + 1;
+    const uint32_t K = args->K;    // GEMM reduction length = elements per row of A
 
     const uint32_t blockSize = (C_out + NUM_CORES - 1) / NUM_CORES;
     const uint32_t start = pi_core_id() * blockSize;
     const uint32_t stop = start + blockSize > C_out ? C_out : start + blockSize;
 
     const uint32_t HWC = args->HWC;
 
-    int padding = Lpad + Rpad + Upad + Dpad;
-
-    // Perform simple matrix multiplication
-#ifndef OPTIMIZE
-    mm(args);
-#else
-    mm_manager(man_args);
-#endif
+    // Accumulate the weight gradient into C (mm_add instead of mm, so tiles add up)
+    mm_add(args);
 
-    // Handle biases
-    if (USE_BIASES == 1) {
-        for (uint32_t co = start; co < stop; co++) {
-            float temp = 0;
-            for (uint32_t ho = 0; ho < pH; ho++) {
-                for (uint32_t wo = 0; wo < pW; wo++) {
-                    temp += inData[wo + ho * pW + co * pH * pW];
-                }
-            }
-            biasDiff[co] = temp;
-        }
-    }
-
-    if (HWC != 0 && HWC != 1) {
-        // Unsupported layout
-        printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the HWC layout (1 for HWC, 0 for CHW). Actual value: %d. Biases not used, even if provided!\n",
-               HWC);
-    }
-
-    if (USE_BIASES != 0 && USE_BIASES != 1) {
-        printf("[im2col_conv2d_param_grad_kernel:] Invalid selection of the bias option (1 or 0 - use biases or not). Actual value: %d. Biases not used, even if provided!\n",
-               USE_BIASES);
+    // Bias gradient, accumulated like the weights: biasDiff[co] += sum over k of dY[co][k]
+    if (USE_BIASES == 1 && biasDiff != NULL) {
+        if (HWC == 0) {
+            for (uint32_t co = start; co < stop; co++) {
+                float acc = 0;
+                for (uint32_t k = 0; k < K; k++) {
+                    acc += dY[k + co * K];
+                }
+                biasDiff[co] += acc;
+            }
+        } else if (pi_core_id() == 0) {
+            // Rows of A are not per-channel in other layouts: report once instead of guessing
+            printf("[im2col_conv2d_param_grad_kernel:] Bias gradient is only implemented for the CHW layout (HWC=0). Actual value: %u. Bias gradient not updated!\n",
+                   (unsigned) HWC);
+        }
     }
 }
diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c
index 9930bb16..7f900b29 100644
--- a/lib/sources/pulp_conv_naive_fp32.c
+++ b/lib/sources/pulp_conv_naive_fp32.c
@@ -129,9 +129,11 @@ void dw_kernel_weight_grad(void *kernel_DW_args) {
     uint32_t start = pi_core_id() * blockSize;
     uint32_t stop = start + blockSize > C_in ? C_in : start + blockSize;
 
-    for (int ch = 0; ch < C_in; ch++) {
+    for (int ch = start; ch < stop; ch++) {
         for (int hk = 0; hk < pH; hk++) {
             for (int wk = 0; wk < pW; wk++) {
+                int idx = wk + hk * pW + ch * pH * pW;
+                // The sum is added to coeffDiff[idx] below (+=) rather than overwriting it
                 float temp = 0;
 
                 for (int ho = 0; ho < H_out; ho++) {
@@ -142,7 +144,7 @@ void dw_kernel_weight_grad(void *kernel_DW_args) {
                     }
                 }
 
-                coeffDiff[wk + hk * pW + ch * pH * pW] = temp;
+                coeffDiff[idx] += temp;
             }
         }
     }
diff --git a/lib/sources/pulp_conv_pw_fp32.c b/lib/sources/pulp_conv_pw_fp32.c
index 8bf5f072..329c8707 100644
--- a/lib/sources/pulp_conv_pw_fp32.c
+++ b/lib/sources/pulp_conv_pw_fp32.c
@@ -165,7 +165,7 @@ void pulp_conv_pw_fp32_bw_param_grads_cl(void *PointWise_Conv_args) {
         matMul_args.trans_B = 1;
 
         #ifndef OPTIMIZE
-        pi_cl_team_fork(NUM_CORES, mm, &matMul_args);
+        pi_cl_team_fork(NUM_CORES, mm_add, &matMul_args);
         #else
         struct mm_manager_args man_args;
         man_args.mm_args = &matMul_args;

From 37f70e5d3ca1757dff6fed32980e938802a4f20a Mon Sep 17 00:00:00 2001
From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:30:31 +0000
Subject: [PATCH 2/2] Add padding and arbitrary stride support for ConvGrad
 operators

- pulp_conv_naive_fp32.c/h: add dw_kernel_weight_grad_padded and
  dw_kernel_input_grad_padded for DW ConvGrad with non-zero padding
  or stride != 1. Use precomputed valid-range loops (ho_min/ho_max,
  wo_min/wo_max) instead of inner-loop conditionals to avoid a
  GCC -O3 -ffast-math miscompile on RISC-V.

- pulp_conv_dw_fp32.c: dispatch to padded kernels in both
  pulp_conv_dw_fp32_bw_param_grads_cl and
  pulp_conv_dw_fp32_bw_input_grads_cl when padding or stride != 1;
  forward stride/padding fields that were previously missing.

- pulp_im2col_fp32.c: remove overly-strict validity check that
  returned early (leaving the buffer uninitialized) whenever
  (Hin - Hk + pad) was not divisible by stride.

- pulp_conv2d_fp32.c: pass actual Lpad/Rpad/Upad/Dpad to im2col in
  pulp_conv2d_fp32_bw_param_grads_cl (previously hard-coded to 0).
---
 lib/include/pulp_conv_naive_fp32.h |  16 ++++
 lib/sources/pulp_conv2d_fp32.c     |   8 +-
 lib/sources/pulp_conv_dw_fp32.c    |  30 ++++++-
 lib/sources/pulp_conv_naive_fp32.c | 130 ++++++++++++++++++++++++++++-
 lib/sources/pulp_im2col_fp32.c     |  16 ++--
 5 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/lib/include/pulp_conv_naive_fp32.h b/lib/include/pulp_conv_naive_fp32.h
index beb5c86d..61c7b9c3 100644
--- a/lib/include/pulp_conv_naive_fp32.h
+++ b/lib/include/pulp_conv_naive_fp32.h
@@ -36,6 +36,14 @@ void dw_kernel_weight_grad(
     void * matMul_DW_args
 );
 
+/**
+ * @brief Weight gradient kernel for Depthwise Convolution with padding and arbitrary stride. Parallelizes on the channels and accumulates (+=) into the weight gradient (weights->diff).
+ * @param matMul_DW_args  pointer to a kernel_DW_args structure (input, weights and output blobs; Upad, Lpad, stride_h and stride_w are read)
+ */
+void dw_kernel_weight_grad_padded(
+    void * matMul_DW_args
+);
+
 /**
  * @brief Naive core kernel for Depthwise Convolution (input gradient). Parallelizes on the channels.
  * @param matMul_DW_args  pointer to a matMul_DW_args structure (please refer to pulp_train_utils_fp32.h)
@@ -44,6 +52,14 @@ void dw_kernel_input_grad(
     void * matMul_DW_args
 );
 
+/**
+ * @brief Input gradient kernel for Depthwise Convolution with padding and arbitrary stride. Parallelizes on the channels and overwrites the input gradient (input->diff).
+ * @param matMul_DW_args  pointer to a kernel_DW_args structure (input, weights and output blobs; Upad, Lpad, stride_h and stride_w are read)
+ */
+void dw_kernel_input_grad_padded(
+    void * matMul_DW_args
+);
+
 
 /** CONV2D KERNELS **/
 
diff --git a/lib/sources/pulp_conv2d_fp32.c b/lib/sources/pulp_conv2d_fp32.c
index c351a576..52eb4143 100644
--- a/lib/sources/pulp_conv2d_fp32.c
+++ b/lib/sources/pulp_conv2d_fp32.c
@@ -295,10 +295,10 @@ void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args) {
             im2col_args.c = C2D_args->coeff;
             im2col_args.output = C2D_args->output;
             im2col_args.pBuffer = i2c_buffer;
-            im2col_args.Lpad = 0; //Lpad;
-            im2col_args.Rpad = 0; //Rpad;
-            im2col_args.Upad = 0; //Upad;
-            im2col_args.Dpad = 0; //Dpad;
+            im2col_args.Lpad = Lpad;
+            im2col_args.Rpad = Rpad;
+            im2col_args.Upad = Upad;
+            im2col_args.Dpad = Dpad;
             im2col_args.mod = 0;
             im2col_args.stride_w = stride_w;
             im2col_args.stride_h = stride_h;
diff --git a/lib/sources/pulp_conv_dw_fp32.c b/lib/sources/pulp_conv_dw_fp32.c
index 06937e95..f93071f7 100644
--- a/lib/sources/pulp_conv_dw_fp32.c
+++ b/lib/sources/pulp_conv_dw_fp32.c
@@ -70,7 +70,20 @@ void pulp_conv_dw_fp32_bw_param_grads_cl(void *DepthWise_Conv_args) {
     ker_args.weights = DW_args->coeff;
     ker_args.output = DW_args->output;
 
-    pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad, &ker_args);
+    ker_args.stride_h = DW_args->stride_h;
+    ker_args.stride_w = DW_args->stride_w;
+
+    ker_args.Lpad = DW_args->Lpad;
+    ker_args.Rpad = DW_args->Rpad;
+    ker_args.Upad = DW_args->Upad;
+    ker_args.Dpad = DW_args->Dpad;
+
+    if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad ||
+        ker_args.stride_h != 1 || ker_args.stride_w != 1) {
+        pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad_padded, &ker_args);
+    } else {
+        pi_cl_team_fork(NUM_CORES, dw_kernel_weight_grad, &ker_args);
+    }
 }
 
 
@@ -82,5 +95,18 @@ void pulp_conv_dw_fp32_bw_input_grads_cl(void *DepthWise_Conv_args) {
     ker_args.weights = DW_args->coeff;
     ker_args.output = DW_args->output;
 
-    pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args);
+    ker_args.stride_h = DW_args->stride_h;
+    ker_args.stride_w = DW_args->stride_w;
+
+    ker_args.Lpad = DW_args->Lpad;
+    ker_args.Rpad = DW_args->Rpad;
+    ker_args.Upad = DW_args->Upad;
+    ker_args.Dpad = DW_args->Dpad;
+
+    if (ker_args.Lpad || ker_args.Rpad || ker_args.Upad || ker_args.Dpad ||
+        ker_args.stride_h != 1 || ker_args.stride_w != 1) {
+        pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad_padded, &ker_args);
+    } else {
+        pi_cl_team_fork(NUM_CORES, dw_kernel_input_grad, &ker_args);
+    }
 }
diff --git a/lib/sources/pulp_conv_naive_fp32.c b/lib/sources/pulp_conv_naive_fp32.c
index 7f900b29..9c72bd4d 100644
--- a/lib/sources/pulp_conv_naive_fp32.c
+++ b/lib/sources/pulp_conv_naive_fp32.c
@@ -109,7 +109,7 @@ void dw_kernel_forward(void *kernel_DW_args) {
 }
 
 
-// Naive weight grad kernel for DepthWise Convolution
+// Naive weight grad kernel for DepthWise Convolution (stride=1, no padding)
 void dw_kernel_weight_grad(void *kernel_DW_args) {
     struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args;
 
@@ -151,6 +151,68 @@ void dw_kernel_weight_grad(void *kernel_DW_args) {
 }
 
 
+/* Weight-gradient kernel for DepthWise Convolution with padding and arbitrary stride.
+ * Each core processes its own block of channels and accumulates (+=) into weights->diff.
+ * kernel_DW_args points to a struct kernel_DW_args; strides below 1 are treated as 1. */
+void dw_kernel_weight_grad_padded(void *kernel_DW_args) {
+    struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args;
+
+    float *inData = args->input->data;
+    float *coeffDiff = args->weights->diff;
+    float *outDiff = args->output->diff;
+
+    int C_in  = (int) args->input->C;
+    int H_in  = (int) args->input->H;
+    int W_in  = (int) args->input->W;
+    int pH    = (int) args->weights->H;
+    int pW    = (int) args->weights->W;
+    int H_out = (int) args->output->H;
+    int W_out = (int) args->output->W;
+
+    int Upad     = args->Upad;
+    int Lpad     = args->Lpad;
+    /* A stride of 0 (e.g. never set by the caller) would divide by zero below */
+    int stride_h = args->stride_h > 0 ? args->stride_h : 1;
+    int stride_w = args->stride_w > 0 ? args->stride_w : 1;
+
+    int blockSize = (C_in + NUM_CORES - 1) / NUM_CORES;
+    int start = pi_core_id() * blockSize;
+    int stop  = start + blockSize > C_in ? C_in : start + blockSize;
+
+    for (int ch = start; ch < stop; ch++) {
+        int ch_in_off  = ch * H_in  * W_in;
+        int ch_out_off = ch * H_out * W_out;
+        for (int hk = 0; hk < pH; hk++) {
+            /* h_in = ho*stride_h + hk - Upad must be in [0, H_in); h_last < 0 means no valid row */
+            int h_last = H_in - 1 + Upad - hk;
+            int ho_min = (Upad - hk + stride_h - 1) / stride_h;
+            if (ho_min < 0) ho_min = 0;
+            int ho_max = (h_last < 0) ? 0 : h_last / stride_h + 1;
+            if (ho_max > H_out) ho_max = H_out;
+            for (int wk = 0; wk < pW; wk++) {
+                /* w_in = wo*stride_w + wk - Lpad must be in [0, W_in); w_last < 0 means no valid column */
+                int w_last = W_in - 1 + Lpad - wk;
+                int wo_min = (Lpad - wk + stride_w - 1) / stride_w;
+                if (wo_min < 0) wo_min = 0;
+                int wo_max = (w_last < 0) ? 0 : w_last / stride_w + 1;
+                if (wo_max > W_out) wo_max = W_out;
+                int idx = wk + hk * pW + ch * pH * pW;
+                float temp = 0;
+                for (int ho = ho_min; ho < ho_max; ho++) {
+                    int h_in = ho * stride_h + hk - Upad;
+                    for (int wo = wo_min; wo < wo_max; wo++) {
+                        int w_in = wo * stride_w + wk - Lpad;
+                        temp += inData[w_in + h_in * W_in + ch_in_off] *
+                                outDiff[wo + ho * W_out + ch_out_off];
+                    }
+                }
+                coeffDiff[idx] += temp;
+            }
+        }
+    }
+}
+
+
 // Naive input grad kernel for DepthWise Convolution
 void dw_kernel_input_grad(void *kernel_DW_args) {
     struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args;
@@ -196,6 +258,72 @@ void dw_kernel_input_grad(void *kernel_DW_args) {
 }
 
 
+/* Input-gradient kernel for DepthWise Convolution with padding and arbitrary stride.
+ * Each core processes its own block of channels and overwrites (=) input->diff with
+ *   dX[ch,hin,win] = sum_{valid ho,wo} dY[ch,ho,wo] * W[ch, hin+Upad-ho*sh, win+Lpad-wo*sw]
+ * kernel_DW_args points to a struct kernel_DW_args; strides below 1 are treated as 1. */
+void dw_kernel_input_grad_padded(void *kernel_DW_args) {
+    struct kernel_DW_args *args = (struct kernel_DW_args *) kernel_DW_args;
+
+    float *inDiff    = args->input->diff;
+    float *coeffData = args->weights->data;
+    float *outDiff   = args->output->diff;
+
+    int C_in  = (int) args->input->C;
+    int H_in  = (int) args->input->H;
+    int W_in  = (int) args->input->W;
+    int pH    = (int) args->weights->H;
+    int pW    = (int) args->weights->W;
+    int H_out = (int) args->output->H;
+    int W_out = (int) args->output->W;
+    int Upad     = args->Upad;
+    int Lpad     = args->Lpad;
+    /* A stride of 0 (e.g. never set by the caller) would divide by zero below */
+    int stride_h = args->stride_h > 0 ? args->stride_h : 1;
+    int stride_w = args->stride_w > 0 ? args->stride_w : 1;
+
+    int blockSize = (C_in + NUM_CORES - 1) / NUM_CORES;
+    int start = pi_core_id() * blockSize;
+    int stop  = start + blockSize > C_in ? C_in : start + blockSize;
+
+    /* The valid ho/wo ranges are precomputed to avoid branch-heavy innermost loops
+     * (branches in innermost loops miscompile under GCC -O3 -ffast-math on RISC-V). */
+    for (int ch = start; ch < stop; ch++) {
+        int ch_in_off  = ch * H_in  * W_in;
+        int ch_out_off = ch * H_out * W_out;
+        int ch_w_off   = ch * pH * pW;
+
+        for (int hin = 0; hin < H_in; hin++) {
+            /* ho range: hk = hin+Upad - ho*sh must be in [0, pH) */
+            int a_h   = hin + Upad - pH + 1;
+            int ho_min = (a_h <= 0) ? 0 : (a_h + stride_h - 1) / stride_h;
+            int ho_max = (hin + Upad) / stride_h + 1;
+            if (ho_max > H_out) ho_max = H_out;
+
+            for (int win = 0; win < W_in; win++) {
+                /* wo range: wk = win+Lpad - wo*sw must be in [0, pW) */
+                int a_w   = win + Lpad - pW + 1;
+                int wo_min = (a_w <= 0) ? 0 : (a_w + stride_w - 1) / stride_w;
+                int wo_max = (win + Lpad) / stride_w + 1;
+                if (wo_max > W_out) wo_max = W_out;
+
+                float temp = 0;
+                for (int ho = ho_min; ho < ho_max; ho++) {
+                    int hk      = hin + Upad - ho * stride_h;
+                    int out_row = ho * W_out + ch_out_off;
+                    int w_row   = hk * pW    + ch_w_off;
+                    for (int wo = wo_min; wo < wo_max; wo++) {
+                        int wk = win + Lpad - wo * stride_w;
+                        temp += coeffData[wk + w_row] * outDiff[wo + out_row];
+                    }
+                }
+                inDiff[win + hin * W_in + ch_in_off] = temp;
+            }
+        }
+    }
+}
+
+
 /** CONV2D KERNELS **/
 void naive_conv2d_fw_kernel_CHW(void *matMul_args) {
     struct matMul_args *args = (struct matMul_args *) matMul_args;
diff --git a/lib/sources/pulp_im2col_fp32.c b/lib/sources/pulp_im2col_fp32.c
index e5fb5957..44c0bd80 100644
--- a/lib/sources/pulp_im2col_fp32.c
+++ b/lib/sources/pulp_im2col_fp32.c
@@ -122,10 +122,8 @@ void pulp_im2row_fp32(void * im2col_args){
       // FORWARD & WEIGHT GRAD
       if (mod==0)
       {
-        if ((Hin-Hk+Upad+Dpad+Hstr) % Hstr > 0)     {printf("\n[pulp_im2col_fp32] Invalid H stride (non multiple H sizes): have H_in=%d, H_ker=%d, U_pad=%d, D_pad=%d, H_stride=%d, remainder=%d", Hin, Hk, Upad, Dpad, Hstr, (Hin-Hk+Upad+Dpad+Hstr) % Hstr); return;}
-        else                                        Htot = (Hin-Hk+Upad+Dpad+Hstr)/Hstr;
-        if ((Win-Wk+Lpad+Rpad+Wstr) % Wstr > 0)     {printf("\n[pulp_im2col_fp32] Invalid W stride (non multiple W sizes): have W_in=%d, W_ker=%d, L_pad=%d, R_pad=%d, W_stride=%d, remainder=%d", Win, Wk, Lpad, Rpad, Wstr, (Win-Wk+Lpad+Rpad+Wstr) % Wstr); return;}
-        else                                        Wtot = (Win-Wk+Lpad+Rpad+Wstr)/Wstr;
+        Htot = (Hin-Hk+Upad+Dpad+Hstr)/Hstr;
+        Wtot = (Win-Wk+Lpad+Rpad+Wstr)/Wstr;
 
         uint32_t padding = Lpad + Rpad + Upad + Dpad;
 
@@ -172,7 +170,7 @@ void pulp_im2row_fp32(void * im2col_args){
                     uint32_t w_pad_cond = wk + wo*Wstr;
                     uint32_t h_pad_cond = hk + ho*Hstr;
 
-                    if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>Ho+(Hk)-Dpad) || (w_pad_cond>Wo+(Wk)-Rpad))) {
+                    if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>=(Hin+Upad)) || (w_pad_cond>=(Win+Lpad)))) {
                       // Padding
                       i2c_buf[kernel_idx+segment_idx+i2c_inner_idx] = 0;
                       //printf("(pad) i2c_buf[%d]=%f                        kernel_idx=%d, segment_idx=%d, ho=%d\n", kernel_idx+segment_idx, i2c_buf[kernel_idx+segment_idx], kernel_idx, segment_idx, ho);
@@ -286,10 +284,10 @@ void pulp_im2row_fp32(void * im2col_args){
           for (uint32_t ho=0; ho<Htot; ho++) {
             for (uint32_t wo=0; wo<Wtot; wo++) {
               // Initialize padding conditions and variables
-              int pad_l = Lpad - wo*Wstr;  
-              int pad_r = wo*Wstr + (Wk) - Wtot - Rpad;
+              int pad_l = Lpad - wo*Wstr;
+              int pad_r = wo*Wstr + (Wk) - (int)(Win + Lpad);
               int pad_u = Upad - ho*Hstr;
-              int pad_d = ho*Hstr + (Hk) - Htot - Dpad;
+              int pad_d = ho*Hstr + (Hk) - (int)(Hin + Upad);
               uint32_t row_size = Wk;                // Transfer lenght (length of a row)
               uint32_t col_size = Hk;
               int in_shift_idx = 0;             // Index to shift input reading
@@ -811,7 +809,7 @@ void pulp_im2col_fp32(void * im2col_args){
                     uint32_t w_pad_cond = wk + wo*Wstr;
                     uint32_t h_pad_cond = hk + ho*Hstr;
 
-                    if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>Ho+(Hk)-Dpad) || (w_pad_cond>Wo+(Wk)-Rpad))) {
+                    if ((padding>0)&&((h_pad_cond<Upad) || (w_pad_cond<Lpad) || (h_pad_cond>=(Hin+Upad)) || (w_pad_cond>=(Win+Lpad)))) {
                       // Padding
                       i2c_buf[kernel_idx+segment_idx+i2c_inner_idx] = 0;
                       //printf("(pad) i2c_buf[%d]=%f                        kernel_idx=%d, segment_idx=%d, ho=%d\n", kernel_idx+segment_idx, i2c_buf[kernel_idx+segment_idx], kernel_idx, segment_idx, ho);