From ebe1ad740590402c9109ed31321272636e594667 Mon Sep 17 00:00:00 2001
From: Michael Platzer <michael.platzer@axelera.ai>
Date: Wed, 5 Feb 2025 10:52:51 +0000
Subject: [PATCH 1/3] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Replace=20FMA's=20LZC?=
 =?UTF-8?q?=20with=20CVW's=20LZA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/fpnew_fma_multi.sv   | 50 +++++++++++++++++++++--------
 vendor/cvw/fma/fmalza.sv | 68 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 vendor/cvw/fma/fmalza.sv

diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv
index d5583d92..e84774e8 100644
--- a/src/fpnew_fma_multi.sv
+++ b/src/fpnew_fma_multi.sv
@@ -520,6 +520,23 @@ module fpnew_fma_multi #(
                       ? 1'b1
                       : (effective_subtraction ? 1'b0 : tentative_sign);
 
+  // ------
+  // Leading Zero Anticipator
+  // ------
+  logic [LZC_RESULT_WIDTH-1:0] lza_count;
+
+  fmalza #(
+    .WIDTH ( LOWER_SUM_WIDTH      ),
+    .NF    ( PRECISION_BITS-1     )
+  ) i_fmalza (
+    .A    ( addend_shifted[LOWER_SUM_WIDTH-1:0] ),
+    .Pm   ( product                             ),
+    .Cin  ( inject_carry_in                     ),
+    .sub  ( effective_subtraction               ),
+    .SCnt ( lza_count                           )
+  );
+
+
   // ---------------
   // Internal pipeline
   // ---------------
@@ -531,6 +548,7 @@ module fpnew_fma_multi #(
   logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_q;
   logic                          sticky_before_add_q;
   logic [3*PRECISION_BITS+3:0]   sum_q;
+  logic [LZC_RESULT_WIDTH-1:0]   lza_count_q;
   logic                          final_sign_q;
   fpnew_pkg::fp_format_e         dst_fmt_q2;
   fpnew_pkg::roundmode_e         rnd_mode_q;
@@ -545,6 +563,7 @@ module fpnew_fma_multi #(
   logic                  [0:NUM_MID_REGS][SHIFT_AMOUNT_WIDTH-1:0] mid_pipe_add_shamt_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_sticky_q;
   logic                  [0:NUM_MID_REGS][3*PRECISION_BITS+3:0]   mid_pipe_sum_q;
+  logic                  [0:NUM_MID_REGS][LZC_RESULT_WIDTH-1:0]   mid_pipe_lza_count_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_final_sign_q;
   fpnew_pkg::roundmode_e [0:NUM_MID_REGS]                         mid_pipe_rnd_mode_q;
   fpnew_pkg::fp_format_e [0:NUM_MID_REGS]                         mid_pipe_dst_fmt_q;
@@ -566,6 +585,7 @@ module fpnew_fma_multi #(
   assign mid_pipe_add_shamt_q[0]   = addend_shamt + addend_normalize_shamt;
   assign mid_pipe_sticky_q[0]      = sticky_before_add;
   assign mid_pipe_sum_q[0]         = sum;
+  assign mid_pipe_lza_count_q[0]   = lza_count;
   assign mid_pipe_final_sign_q[0]  = final_sign;
   assign mid_pipe_rnd_mode_q[0]    = inp_pipe_rnd_mode_q[NUM_INP_REGS];
   assign mid_pipe_dst_fmt_q[0]     = dst_fmt_q;
@@ -599,6 +619,7 @@ module fpnew_fma_multi #(
     `FFL(mid_pipe_add_shamt_q[i+1],   mid_pipe_add_shamt_q[i],   reg_ena, '0)
     `FFL(mid_pipe_sticky_q[i+1],      mid_pipe_sticky_q[i],      reg_ena, '0)
     `FFL(mid_pipe_sum_q[i+1],         mid_pipe_sum_q[i],         reg_ena, '0)
+    `FFL(mid_pipe_lza_count_q[i+1],   mid_pipe_lza_count_q[i],   reg_ena, '0)
     `FFL(mid_pipe_final_sign_q[i+1],  mid_pipe_final_sign_q[i],  reg_ena, '0)
     `FFL(mid_pipe_rnd_mode_q[i+1],    mid_pipe_rnd_mode_q[i],    reg_ena, fpnew_pkg::RNE)
     `FFL(mid_pipe_dst_fmt_q[i+1],     mid_pipe_dst_fmt_q[i],     reg_ena, fpnew_pkg::fp_format_e'(0))
@@ -617,6 +638,7 @@ module fpnew_fma_multi #(
   assign addend_shamt_q          = mid_pipe_add_shamt_q[NUM_MID_REGS];
   assign sticky_before_add_q     = mid_pipe_sticky_q[NUM_MID_REGS];
   assign sum_q                   = mid_pipe_sum_q[NUM_MID_REGS];
+  assign lza_count_q             = mid_pipe_lza_count_q[NUM_MID_REGS];
   assign final_sign_q            = mid_pipe_final_sign_q[NUM_MID_REGS];
   assign rnd_mode_q              = mid_pipe_rnd_mode_q[NUM_MID_REGS];
   assign dst_fmt_q2              = mid_pipe_dst_fmt_q[NUM_MID_REGS];
@@ -629,13 +651,16 @@ module fpnew_fma_multi #(
   // --------------
   logic        [LOWER_SUM_WIDTH-1:0]  sum_lower;              // lower 2p+3 bits of sum are searched
   logic        [LZC_RESULT_WIDTH-1:0] leading_zero_count;     // the number of leading zeroes
+  logic        [LZC_RESULT_WIDTH-1:0] norm_lza_count;         // leading zeroes from LZA without offest
+  logic        [LZC_RESULT_WIDTH-1:0] corrected_lza_count;    // leading zeroes corrected after LZA error
   logic signed [LZC_RESULT_WIDTH:0]   leading_zero_count_sgn; // signed leading-zero count
-  logic                               lzc_zeroes;             // in case only zeroes found
+  logic                               sum_lower_zero;         // in case only zeroes found
 
   logic        [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
   logic signed [EXP_WIDTH-1:0]          normalized_exponent;
 
   logic [3*PRECISION_BITS+4:0] sum_shifted;       // result after first normalization shift
+  logic [3*PRECISION_BITS+5:0] lza_pre_shift;
   logic [PRECISION_BITS:0]     final_mantissa;    // final mantissa before rounding with round bit
   logic [2*PRECISION_BITS+2:0] sum_sticky_bits;   // remaining 2p+3 sticky bits after normalization
   logic                        sticky_after_norm; // sticky bit after normalization
@@ -644,16 +669,15 @@ module fpnew_fma_multi #(
 
   assign sum_lower = sum_q[LOWER_SUM_WIDTH-1:0];
 
-  // Leading zero counter for cancellations
-  lzc #(
-    .WIDTH ( LOWER_SUM_WIDTH ),
-    .MODE  ( 1               ) // MODE = 1 counts leading zeroes
-  ) i_lzc (
-    .in_i    ( sum_lower          ),
-    .cnt_o   ( leading_zero_count ),
-    .empty_o ( lzc_zeroes         )
-  );
+  assign sum_lower_zero = sum_lower == '0;
+
+  // If the lower sum is all zeros, the LZC is also zero.
+  assign norm_lza_count = sum_lower_zero ? '0 : lza_count_q;
+
+  assign lza_pre_shift       = sum_q << (PRECISION_BITS + 2 + norm_lza_count);
+  assign corrected_lza_count = lza_pre_shift[3*PRECISION_BITS+5] ? norm_lza_count - 1: norm_lza_count;
 
+  assign leading_zero_count     = corrected_lza_count[LZC_RESULT_WIDTH-1:0];
   assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});
 
   // Normalization shift amount based on exponents and LZC (unsigned as only left shifts)
@@ -661,10 +685,10 @@ module fpnew_fma_multi #(
     // Product-anchored case or cancellations require LZC
     if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin
       // Normal result (biased exponent > 0 and not a zero)
-      if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !lzc_zeroes) begin
+      if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin
         // Undo initial product shift, remove the counted zeroes
-        norm_shamt          = PRECISION_BITS + 2 + leading_zero_count;
-        normalized_exponent = exponent_product_q - leading_zero_count_sgn + 1; // account for shift
+        norm_shamt          = PRECISION_BITS + 1 + norm_lza_count;
+        normalized_exponent = exponent_product_q - signed'({1'b0, norm_lza_count}) + 2; // account for shift
       // Subnormal result
       end else begin
         // Cap the shift distance to align mantissa with minimum exponent
diff --git a/vendor/cvw/fma/fmalza.sv b/vendor/cvw/fma/fmalza.sv
new file mode 100644
index 00000000..e76727b8
--- /dev/null
+++ b/vendor/cvw/fma/fmalza.sv
@@ -0,0 +1,68 @@
+///////////////////////////////////////////
+// fmalza.sv
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified:
+//
+// Purpose: Leading Zero Anticipator
+//
+// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.14)
+//    See also [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+//
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module fmalza #(WIDTH, NF) (                          // WIDTH: bits of A; NF: Pm is 2*NF+2 bits wide
+  input logic [WIDTH-1:0]             A,              // addend (aligned, WIDTH bits)
+  input logic [2*NF+1:0]              Pm,             // product (2*NF+2 bits)
+  input logic                         Cin,            // carry in (fed into bit 0 of Gm1, inverted into Km1)
+  input logic                         sub,            // subtraction (fed into the top bit of Pp1)
+  output logic [$clog2(WIDTH+1)-1:0]  SCnt            // normalization shift count for the positive result (lzc count of F)
+);
+
+  logic [WIDTH:0]                     F;              // most significant bit of F indicates leading digit
+  logic [WIDTH-1:0]                   B;              // zero-extended product with same size as aligned A
+  logic [WIDTH-1:0]                   P, G, K;        // propagate, generate, kill for each column
+  logic [WIDTH-1:0]                   Pp1, Gm1, Km1;  // propagate shifted right by 1, generate/kill shifted left 1
+
+  assign B = {{(NF+2){1'b0}}, Pm, 2'b0};              // {zeros, Pm, 2'b0} is 3*NF+6 bits; the assignment resizes it to WIDTH bits
+
+  assign P = A^B;                                     // propagate: exactly one of A, B is set in the column
+  assign G = A&B;                                     // generate: both A and B are set in the column
+  assign K = ~A&~B;                                   // kill: neither A nor B is set in the column
+
+  assign Pp1 = {sub, P[WIDTH-1:1]};                   // shift P right by 1 (for P_i+1), use subtract flag in most significant bit
+  assign Gm1 = {G[WIDTH-2:0], Cin};                   // shift G left by 1 (for G_i-1) and bring in Cin
+  assign Km1 = {K[WIDTH-2:0], ~Cin};                  // shift K left by 1 (for K_i-1) and bring in ~Cin (inverted carry in)
+
+  // Apply function to determine the leading pattern; F is one bit wider than A and B
+  //      - note: Schmookler01 uses the numbering system where 0 is the most significant bit
+  assign F[WIDTH]     = ~sub&P[WIDTH-1];              // extra top bit: set when not subtracting and the top column propagates
+  assign F[WIDTH-1:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
+
+  lzc #(
+    .WIDTH ( WIDTH+1 ),
+    .MODE  ( 1       ) // MODE = 1 counts leading zeroes
+  ) i_lzc (
+    .in_i    ( F          ),
+    .cnt_o   ( SCnt       ),
+    .empty_o ( /*unused*/ )
+  );
+
+endmodule

From b56c6f2f2ab8bca97090bdecfd87761a741ef47b Mon Sep 17 00:00:00 2001
From: Michael Platzer <michael.platzer@axelera.ai>
Date: Thu, 6 Feb 2025 12:50:12 +0000
Subject: [PATCH 2/3] Add CVW's LZA to Bender manifest

---
 Bender.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Bender.yml b/Bender.yml
index c9b18715..bb5677eb 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -40,6 +40,7 @@ sources:
   - src/fpnew_divsqrt_th_32.sv
   - src/fpnew_divsqrt_th_64_multi.sv
   - src/fpnew_divsqrt_multi.sv
+  - vendor/cvw/fma/fmalza.sv
   - src/fpnew_fma.sv
   - src/fpnew_fma_multi.sv
   - src/fpnew_noncomp.sv

From 69fcc9f4094b0717c9e5a8dc37aad15f62a25444 Mon Sep 17 00:00:00 2001
From: Michael Platzer <michael.platzer@axelera.ai>
Date: Thu, 6 Feb 2025 15:22:30 +0000
Subject: [PATCH 3/3] Cleanup FMA's LZA correction and normalization logic

---
 src/fpnew_fma_multi.sv | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv
index e84774e8..f0b0c4e0 100644
--- a/src/fpnew_fma_multi.sv
+++ b/src/fpnew_fma_multi.sv
@@ -651,16 +651,15 @@ module fpnew_fma_multi #(
   // --------------
   logic        [LOWER_SUM_WIDTH-1:0]  sum_lower;              // lower 2p+3 bits of sum are searched
   logic        [LZC_RESULT_WIDTH-1:0] leading_zero_count;     // the number of leading zeroes
-  logic        [LZC_RESULT_WIDTH-1:0] norm_lza_count;         // leading zeroes from LZA without offest
-  logic        [LZC_RESULT_WIDTH-1:0] corrected_lza_count;    // leading zeroes corrected after LZA error
   logic signed [LZC_RESULT_WIDTH:0]   leading_zero_count_sgn; // signed leading-zero count
   logic                               sum_lower_zero;         // in case only zeroes found
+  logic        [LOWER_SUM_WIDTH:0]    least_leading_0_onehot; // onehot encoded least leading 0
+  logic                               lza_overpredict;        // LZA over-predicted actual LZC by 1
 
   logic        [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
   logic signed [EXP_WIDTH-1:0]          normalized_exponent;
 
   logic [3*PRECISION_BITS+4:0] sum_shifted;       // result after first normalization shift
-  logic [3*PRECISION_BITS+5:0] lza_pre_shift;
   logic [PRECISION_BITS:0]     final_mantissa;    // final mantissa before rounding with round bit
   logic [2*PRECISION_BITS+2:0] sum_sticky_bits;   // remaining 2p+3 sticky bits after normalization
   logic                        sticky_after_norm; // sticky bit after normalization
@@ -671,13 +670,14 @@ module fpnew_fma_multi #(
 
   assign sum_lower_zero = sum_lower == '0;
 
-  // If the lower sum is all zeros, the LZC is also zero.
-  assign norm_lza_count = sum_lower_zero ? '0 : lza_count_q;
+  // A carry might have propagated into the least leading 0 bit (the lowest 0 bit just before the
+  // first 1 bit) predicted by the LZA.
+  // Note: This is a mux that only looks at `sum_q[LOWER_SUM_WIDTH:0]`.
+  assign least_leading_0_onehot = {1'b1, {LOWER_SUM_WIDTH{1'b0}}} >> lza_count_q;
+  assign lza_overpredict        = |(sum_q[LOWER_SUM_WIDTH:0] & least_leading_0_onehot);
 
-  assign lza_pre_shift       = sum_q << (PRECISION_BITS + 2 + norm_lza_count);
-  assign corrected_lza_count = lza_pre_shift[3*PRECISION_BITS+5] ? norm_lza_count - 1: norm_lza_count;
-
-  assign leading_zero_count     = corrected_lza_count[LZC_RESULT_WIDTH-1:0];
+  // Get actual LZC by correcting LZA in case of over-prediction
+  assign leading_zero_count     = lza_overpredict ? lza_count_q - 1 : lza_count_q;
   assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});
 
   // Normalization shift amount based on exponents and LZC (unsigned as only left shifts)
@@ -685,15 +685,20 @@ module fpnew_fma_multi #(
     // Product-anchored case or cancellations require LZC
     if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin
       // Normal result (biased exponent > 0 and not a zero)
-      if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin
+      if ((exponent_product_q - signed'({1'b0, lza_count_q}) + 1 >= 0) && !sum_lower_zero) begin
         // Undo initial product shift, remove the counted zeroes
-        norm_shamt          = PRECISION_BITS + 1 + norm_lza_count;
-        normalized_exponent = exponent_product_q - signed'({1'b0, norm_lza_count}) + 2; // account for shift
+        norm_shamt          = PRECISION_BITS + 1 + lza_count_q;
+        normalized_exponent = exponent_product_q - signed'({1'b0, lza_count_q}) + 2; // account for shift
       // Subnormal result
       end else begin
         // Cap the shift distance to align mantissa with minimum exponent
         norm_shamt          = unsigned'(signed'(PRECISION_BITS + 2 + exponent_product_q));
         normalized_exponent = 0; // subnormals encoded as 0
+        // Fix exponent in case of a normal number accidentally being classified as subnormal due
+        // to LZA over-prediction (LZA could be 1 larger than actual LZC)
+        if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin
+          normalized_exponent = 1;
+        end
       end
     // Addend-anchored case
     end else begin