From ebe1ad740590402c9109ed31321272636e594667 Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Wed, 5 Feb 2025 10:52:51 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Replace=20FMA's=20LZC?= =?UTF-8?q?=20with=20CVW's=20LZA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/fpnew_fma_multi.sv | 50 +++++++++++++++++++++-------- vendor/cvw/fma/fmalza.sv | 68 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 13 deletions(-) create mode 100644 vendor/cvw/fma/fmalza.sv diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv index d5583d92..e84774e8 100644 --- a/src/fpnew_fma_multi.sv +++ b/src/fpnew_fma_multi.sv @@ -520,6 +520,23 @@ module fpnew_fma_multi #( ? 1'b1 : (effective_subtraction ? 1'b0 : tentative_sign); + // ------ + // Leading Zero Anticipator + // ------ + logic [LZC_RESULT_WIDTH-1:0] lza_count; + + fmalza #( + .WIDTH ( LOWER_SUM_WIDTH ), + .NF ( PRECISION_BITS-1 ) + ) i_fmalza ( + .A ( addend_shifted[LOWER_SUM_WIDTH-1:0] ), + .Pm ( product ), + .Cin ( inject_carry_in ), + .sub ( effective_subtraction ), + .SCnt ( lza_count ) + ); + + // --------------- // Internal pipeline // --------------- @@ -531,6 +548,7 @@ module fpnew_fma_multi #( logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_q; logic sticky_before_add_q; logic [3*PRECISION_BITS+3:0] sum_q; + logic [LZC_RESULT_WIDTH-1:0] lza_count_q; logic final_sign_q; fpnew_pkg::fp_format_e dst_fmt_q2; fpnew_pkg::roundmode_e rnd_mode_q; @@ -545,6 +563,7 @@ module fpnew_fma_multi #( logic [0:NUM_MID_REGS][SHIFT_AMOUNT_WIDTH-1:0] mid_pipe_add_shamt_q; logic [0:NUM_MID_REGS] mid_pipe_sticky_q; logic [0:NUM_MID_REGS][3*PRECISION_BITS+3:0] mid_pipe_sum_q; + logic [0:NUM_MID_REGS][LZC_RESULT_WIDTH-1:0] mid_pipe_lza_count_q; logic [0:NUM_MID_REGS] mid_pipe_final_sign_q; fpnew_pkg::roundmode_e [0:NUM_MID_REGS] mid_pipe_rnd_mode_q; fpnew_pkg::fp_format_e [0:NUM_MID_REGS] mid_pipe_dst_fmt_q; @@ -566,6 +585,7 @@ module 
fpnew_fma_multi #( assign mid_pipe_add_shamt_q[0] = addend_shamt + addend_normalize_shamt; assign mid_pipe_sticky_q[0] = sticky_before_add; assign mid_pipe_sum_q[0] = sum; + assign mid_pipe_lza_count_q[0] = lza_count; assign mid_pipe_final_sign_q[0] = final_sign; assign mid_pipe_rnd_mode_q[0] = inp_pipe_rnd_mode_q[NUM_INP_REGS]; assign mid_pipe_dst_fmt_q[0] = dst_fmt_q; @@ -599,6 +619,7 @@ module fpnew_fma_multi #( `FFL(mid_pipe_add_shamt_q[i+1], mid_pipe_add_shamt_q[i], reg_ena, '0) `FFL(mid_pipe_sticky_q[i+1], mid_pipe_sticky_q[i], reg_ena, '0) `FFL(mid_pipe_sum_q[i+1], mid_pipe_sum_q[i], reg_ena, '0) + `FFL(mid_pipe_lza_count_q[i+1], mid_pipe_lza_count_q[i], reg_ena, '0) `FFL(mid_pipe_final_sign_q[i+1], mid_pipe_final_sign_q[i], reg_ena, '0) `FFL(mid_pipe_rnd_mode_q[i+1], mid_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE) `FFL(mid_pipe_dst_fmt_q[i+1], mid_pipe_dst_fmt_q[i], reg_ena, fpnew_pkg::fp_format_e'(0)) @@ -617,6 +638,7 @@ module fpnew_fma_multi #( assign addend_shamt_q = mid_pipe_add_shamt_q[NUM_MID_REGS]; assign sticky_before_add_q = mid_pipe_sticky_q[NUM_MID_REGS]; assign sum_q = mid_pipe_sum_q[NUM_MID_REGS]; + assign lza_count_q = mid_pipe_lza_count_q[NUM_MID_REGS]; assign final_sign_q = mid_pipe_final_sign_q[NUM_MID_REGS]; assign rnd_mode_q = mid_pipe_rnd_mode_q[NUM_MID_REGS]; assign dst_fmt_q2 = mid_pipe_dst_fmt_q[NUM_MID_REGS]; @@ -629,13 +651,16 @@ module fpnew_fma_multi #( // -------------- logic [LOWER_SUM_WIDTH-1:0] sum_lower; // lower 2p+3 bits of sum are searched logic [LZC_RESULT_WIDTH-1:0] leading_zero_count; // the number of leading zeroes + logic [LZC_RESULT_WIDTH-1:0] norm_lza_count; // leading zeroes from LZA without offest + logic [LZC_RESULT_WIDTH-1:0] corrected_lza_count; // leading zeroes corrected after LZA error logic signed [LZC_RESULT_WIDTH:0] leading_zero_count_sgn; // signed leading-zero count - logic lzc_zeroes; // in case only zeroes found + logic sum_lower_zero; // in case only zeroes found logic [SHIFT_AMOUNT_WIDTH-1:0] 
norm_shamt; // Normalization shift amount logic signed [EXP_WIDTH-1:0] normalized_exponent; logic [3*PRECISION_BITS+4:0] sum_shifted; // result after first normalization shift + logic [3*PRECISION_BITS+5:0] lza_pre_shift; logic [PRECISION_BITS:0] final_mantissa; // final mantissa before rounding with round bit logic [2*PRECISION_BITS+2:0] sum_sticky_bits; // remaining 2p+3 sticky bits after normalization logic sticky_after_norm; // sticky bit after normalization @@ -644,16 +669,15 @@ module fpnew_fma_multi #( assign sum_lower = sum_q[LOWER_SUM_WIDTH-1:0]; - // Leading zero counter for cancellations - lzc #( - .WIDTH ( LOWER_SUM_WIDTH ), - .MODE ( 1 ) // MODE = 1 counts leading zeroes - ) i_lzc ( - .in_i ( sum_lower ), - .cnt_o ( leading_zero_count ), - .empty_o ( lzc_zeroes ) - ); + assign sum_lower_zero = sum_lower == '0; + + // If the lower sum is all zeros, the LZC is also zero. + assign norm_lza_count = sum_lower_zero ? '0 : lza_count_q; + + assign lza_pre_shift = sum_q << (PRECISION_BITS + 2 + norm_lza_count); + assign corrected_lza_count = lza_pre_shift[3*PRECISION_BITS+5] ? 
norm_lza_count - 1: norm_lza_count; + assign leading_zero_count = corrected_lza_count[LZC_RESULT_WIDTH-1:0]; assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count}); // Normalization shift amount based on exponents and LZC (unsigned as only left shifts) @@ -661,10 +685,10 @@ module fpnew_fma_multi #( // Product-anchored case or cancellations require LZC if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin // Normal result (biased exponent > 0 and not a zero) - if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !lzc_zeroes) begin + if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin // Undo initial product shift, remove the counted zeroes - norm_shamt = PRECISION_BITS + 2 + leading_zero_count; - normalized_exponent = exponent_product_q - leading_zero_count_sgn + 1; // account for shift + norm_shamt = PRECISION_BITS + 1 + norm_lza_count; + normalized_exponent = exponent_product_q - signed'({1'b0, norm_lza_count}) + 2; // account for shift // Subnormal result end else begin // Cap the shift distance to align mantissa with minimum exponent diff --git a/vendor/cvw/fma/fmalza.sv b/vendor/cvw/fma/fmalza.sv new file mode 100644 index 00000000..e76727b8 --- /dev/null +++ b/vendor/cvw/fma/fmalza.sv @@ -0,0 +1,68 @@ +/////////////////////////////////////////// +// fmalza.sv +// +// Written: 6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu +// Modified: +// +// Purpose: Leading Zero Anticipator +// +// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.14) +// See also [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001] +// +// A component of the CORE-V-WALLY configurable RISC-V project. 
+// https://github.com/openhwgroup/cvw +// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +//////////////////////////////////////////////////////////////////////////////////////////////// + +module fmalza #(WIDTH, NF) ( + input logic [WIDTH-1:0] A, // addend + input logic [2*NF+1:0] Pm, // product + input logic Cin, // carry in + input logic sub, // subtraction + output logic [$clog2(WIDTH+1)-1:0] SCnt // normalization shift count for the positive result +); + + logic [WIDTH:0] F; // most significant bit of F indicates leading digit + logic [WIDTH-1:0] B; // zero-extended product with same size as aligned A + logic [WIDTH-1:0] P, G, K; // propagate, generate, kill for each column + logic [WIDTH-1:0] Pp1, Gm1, Km1; // propagate shifted right by 1, generate/kill shifted left 1 + + assign B = {{(NF+2){1'b0}}, Pm, 2'b0}; // Zero extend product + + assign P = A^B; + assign G = A&B; + assign K = ~A&~B; + + assign Pp1 = {sub, P[WIDTH-1:1]}; // shift P right by 1 (for P_i+1), use subtract flag in most significant bit + assign Gm1 = {G[WIDTH-2:0], Cin}; // shift G left by 1 (for G_i-1) and bring in Cin + assign Km1 = {K[WIDTH-2:0], ~Cin}; // shift K left by 1 (for K_i-1) and bring in ~Cin + + // Apply function to determine Leading pattern + // - note: Schmookler01 uses the numbering 
system where 0 is the most significant bit + assign F[WIDTH] = ~sub&P[WIDTH-1]; + assign F[WIDTH-1:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1)); + + lzc #( + .WIDTH ( WIDTH+1 ), + .MODE ( 1 ) // MODE = 1 counts leading zeroes + ) i_lzc ( + .in_i ( F ), + .cnt_o ( SCnt ), + .empty_o ( /*unused*/ ) + ); + +endmodule From b56c6f2f2ab8bca97090bdecfd87761a741ef47b Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Thu, 6 Feb 2025 12:50:12 +0000 Subject: [PATCH 2/3] Add CVW's LZA to Bender manifest --- Bender.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/Bender.yml b/Bender.yml index c9b18715..bb5677eb 100644 --- a/Bender.yml +++ b/Bender.yml @@ -40,6 +40,7 @@ sources: - src/fpnew_divsqrt_th_32.sv - src/fpnew_divsqrt_th_64_multi.sv - src/fpnew_divsqrt_multi.sv + - vendor/cvw/fma/fmalza.sv - src/fpnew_fma.sv - src/fpnew_fma_multi.sv - src/fpnew_noncomp.sv From 69fcc9f4094b0717c9e5a8dc37aad15f62a25444 Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Thu, 6 Feb 2025 15:22:30 +0000 Subject: [PATCH 3/3] Cleanup FMA's LZA correction and normalization logic --- src/fpnew_fma_multi.sv | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv index e84774e8..f0b0c4e0 100644 --- a/src/fpnew_fma_multi.sv +++ b/src/fpnew_fma_multi.sv @@ -651,16 +651,15 @@ module fpnew_fma_multi #( // -------------- logic [LOWER_SUM_WIDTH-1:0] sum_lower; // lower 2p+3 bits of sum are searched logic [LZC_RESULT_WIDTH-1:0] leading_zero_count; // the number of leading zeroes - logic [LZC_RESULT_WIDTH-1:0] norm_lza_count; // leading zeroes from LZA without offest - logic [LZC_RESULT_WIDTH-1:0] corrected_lza_count; // leading zeroes corrected after LZA error logic signed [LZC_RESULT_WIDTH:0] leading_zero_count_sgn; // signed leading-zero count logic sum_lower_zero; // in case only zeroes found + logic [LOWER_SUM_WIDTH:0] least_leading_0_onehot; // onehot encoded least leading 0 + logic 
lza_overpredict; // LZA over-predicted actual LZC by 1 logic [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount logic signed [EXP_WIDTH-1:0] normalized_exponent; logic [3*PRECISION_BITS+4:0] sum_shifted; // result after first normalization shift - logic [3*PRECISION_BITS+5:0] lza_pre_shift; logic [PRECISION_BITS:0] final_mantissa; // final mantissa before rounding with round bit logic [2*PRECISION_BITS+2:0] sum_sticky_bits; // remaining 2p+3 sticky bits after normalization logic sticky_after_norm; // sticky bit after normalization @@ -671,13 +670,14 @@ module fpnew_fma_multi #( assign sum_lower_zero = sum_lower == '0; - // If the lower sum is all zeros, the LZC is also zero. - assign norm_lza_count = sum_lower_zero ? '0 : lza_count_q; + // A carry might have propagated into the least leading 0 bit (the lowest 0 bit just before the + // first 1 bit) predicted by the LZA. + // Note: This is a mux that only looks at `sum_q[LOWER_SUM_WIDTH:0]`. + assign least_leading_0_onehot = {1'b1, {LOWER_SUM_WIDTH{1'b0}}} >> lza_count_q; + assign lza_overpredict = |(sum_q[LOWER_SUM_WIDTH:0] & least_leading_0_onehot); - assign lza_pre_shift = sum_q << (PRECISION_BITS + 2 + norm_lza_count); - assign corrected_lza_count = lza_pre_shift[3*PRECISION_BITS+5] ? norm_lza_count - 1: norm_lza_count; - - assign leading_zero_count = corrected_lza_count[LZC_RESULT_WIDTH-1:0]; + // Get actual LZC by correcting LZA in case of over-prediction + assign leading_zero_count = lza_overpredict ? 
lza_count_q - 1 : lza_count_q; assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count}); // Normalization shift amount based on exponents and LZC (unsigned as only left shifts) @@ -685,15 +685,20 @@ module fpnew_fma_multi #( // Product-anchored case or cancellations require LZC if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin // Normal result (biased exponent > 0 and not a zero) - if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin + if ((exponent_product_q - signed'({1'b0, lza_count_q}) + 1 >= 0) && !sum_lower_zero) begin // Undo initial product shift, remove the counted zeroes - norm_shamt = PRECISION_BITS + 1 + norm_lza_count; - normalized_exponent = exponent_product_q - signed'({1'b0, norm_lza_count}) + 2; // account for shift + norm_shamt = PRECISION_BITS + 1 + lza_count_q; + normalized_exponent = exponent_product_q - signed'({1'b0, lza_count_q}) + 2; // account for shift // Subnormal result end else begin // Cap the shift distance to align mantissa with minimum exponent norm_shamt = unsigned'(signed'(PRECISION_BITS + 2 + exponent_product_q)); normalized_exponent = 0; // subnormals encoded as 0 + // Fix exponent in case of a normal number accidentally being classified as subnormal due + // to LZA over-prediction (LZA could be 1 larger than actual LZC) + if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin + normalized_exponent = 1; + end end // Addend-anchored case end else begin