diff --git a/Bender.yml b/Bender.yml index c9b18715..bb5677eb 100644 --- a/Bender.yml +++ b/Bender.yml @@ -40,6 +40,7 @@ sources: - src/fpnew_divsqrt_th_32.sv - src/fpnew_divsqrt_th_64_multi.sv - src/fpnew_divsqrt_multi.sv + - vendor/cvw/fma/fmalza.sv - src/fpnew_fma.sv - src/fpnew_fma_multi.sv - src/fpnew_noncomp.sv diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv index d5583d92..f0b0c4e0 100644 --- a/src/fpnew_fma_multi.sv +++ b/src/fpnew_fma_multi.sv @@ -520,6 +520,23 @@ module fpnew_fma_multi #( ? 1'b1 : (effective_subtraction ? 1'b0 : tentative_sign); + // ------ + // Leading Zero Anticipator + // ------ + logic [LZC_RESULT_WIDTH-1:0] lza_count; + + fmalza #( + .WIDTH ( LOWER_SUM_WIDTH ), + .NF ( PRECISION_BITS-1 ) + ) i_fmalza ( + .A ( addend_shifted[LOWER_SUM_WIDTH-1:0] ), + .Pm ( product ), + .Cin ( inject_carry_in ), + .sub ( effective_subtraction ), + .SCnt ( lza_count ) + ); + + // --------------- // Internal pipeline // --------------- @@ -531,6 +548,7 @@ module fpnew_fma_multi #( logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_q; logic sticky_before_add_q; logic [3*PRECISION_BITS+3:0] sum_q; + logic [LZC_RESULT_WIDTH-1:0] lza_count_q; logic final_sign_q; fpnew_pkg::fp_format_e dst_fmt_q2; fpnew_pkg::roundmode_e rnd_mode_q; @@ -545,6 +563,7 @@ module fpnew_fma_multi #( logic [0:NUM_MID_REGS][SHIFT_AMOUNT_WIDTH-1:0] mid_pipe_add_shamt_q; logic [0:NUM_MID_REGS] mid_pipe_sticky_q; logic [0:NUM_MID_REGS][3*PRECISION_BITS+3:0] mid_pipe_sum_q; + logic [0:NUM_MID_REGS][LZC_RESULT_WIDTH-1:0] mid_pipe_lza_count_q; logic [0:NUM_MID_REGS] mid_pipe_final_sign_q; fpnew_pkg::roundmode_e [0:NUM_MID_REGS] mid_pipe_rnd_mode_q; fpnew_pkg::fp_format_e [0:NUM_MID_REGS] mid_pipe_dst_fmt_q; @@ -566,6 +585,7 @@ module fpnew_fma_multi #( assign mid_pipe_add_shamt_q[0] = addend_shamt + addend_normalize_shamt; assign mid_pipe_sticky_q[0] = sticky_before_add; assign mid_pipe_sum_q[0] = sum; + assign mid_pipe_lza_count_q[0] = lza_count; assign mid_pipe_final_sign_q[0] = 
final_sign; assign mid_pipe_rnd_mode_q[0] = inp_pipe_rnd_mode_q[NUM_INP_REGS]; assign mid_pipe_dst_fmt_q[0] = dst_fmt_q; @@ -599,6 +619,7 @@ module fpnew_fma_multi #( `FFL(mid_pipe_add_shamt_q[i+1], mid_pipe_add_shamt_q[i], reg_ena, '0) `FFL(mid_pipe_sticky_q[i+1], mid_pipe_sticky_q[i], reg_ena, '0) `FFL(mid_pipe_sum_q[i+1], mid_pipe_sum_q[i], reg_ena, '0) + `FFL(mid_pipe_lza_count_q[i+1], mid_pipe_lza_count_q[i], reg_ena, '0) `FFL(mid_pipe_final_sign_q[i+1], mid_pipe_final_sign_q[i], reg_ena, '0) `FFL(mid_pipe_rnd_mode_q[i+1], mid_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE) `FFL(mid_pipe_dst_fmt_q[i+1], mid_pipe_dst_fmt_q[i], reg_ena, fpnew_pkg::fp_format_e'(0)) @@ -617,6 +638,7 @@ module fpnew_fma_multi #( assign addend_shamt_q = mid_pipe_add_shamt_q[NUM_MID_REGS]; assign sticky_before_add_q = mid_pipe_sticky_q[NUM_MID_REGS]; assign sum_q = mid_pipe_sum_q[NUM_MID_REGS]; + assign lza_count_q = mid_pipe_lza_count_q[NUM_MID_REGS]; assign final_sign_q = mid_pipe_final_sign_q[NUM_MID_REGS]; assign rnd_mode_q = mid_pipe_rnd_mode_q[NUM_MID_REGS]; assign dst_fmt_q2 = mid_pipe_dst_fmt_q[NUM_MID_REGS]; @@ -630,7 +652,9 @@ module fpnew_fma_multi #( logic [LOWER_SUM_WIDTH-1:0] sum_lower; // lower 2p+3 bits of sum are searched logic [LZC_RESULT_WIDTH-1:0] leading_zero_count; // the number of leading zeroes logic signed [LZC_RESULT_WIDTH:0] leading_zero_count_sgn; // signed leading-zero count - logic lzc_zeroes; // in case only zeroes found + logic sum_lower_zero; // in case only zeroes found + logic [LOWER_SUM_WIDTH:0] least_leading_0_onehot; // onehot encoded least leading 0 + logic lza_overpredict; // LZA over-predicted actual LZC by 1 logic [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount logic signed [EXP_WIDTH-1:0] normalized_exponent; @@ -644,16 +668,16 @@ module fpnew_fma_multi #( assign sum_lower = sum_q[LOWER_SUM_WIDTH-1:0]; - // Leading zero counter for cancellations - lzc #( - .WIDTH ( LOWER_SUM_WIDTH ), - .MODE ( 1 ) // MODE = 1 counts leading 
zeroes - ) i_lzc ( - .in_i ( sum_lower ), - .cnt_o ( leading_zero_count ), - .empty_o ( lzc_zeroes ) - ); + assign sum_lower_zero = sum_lower == '0; + // A carry might have propagated into the least leading 0 bit (the lowest 0 bits just before the + // first 1 bit) predicted by the LZA. + // Note: This is a mux that only looks at `sum_q[LOWER_SUM_WIDTH:0]`. + assign least_leading_0_onehot = {1'b1, {LOWER_SUM_WIDTH{1'b0}}} >> lza_count_q; + assign lza_overpredict = |(sum_q[LOWER_SUM_WIDTH:0] & least_leading_0_onehot); + + // Get actual LZC by correcting LZA in case of over-prediction + assign leading_zero_count = lza_overpredict ? lza_count_q - 1 : lza_count_q; assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count}); // Normalization shift amount based on exponents and LZC (unsigned as only left shifts) @@ -661,15 +685,20 @@ module fpnew_fma_multi #( // Product-anchored case or cancellations require LZC if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin // Normal result (biased exponent > 0 and not a zero) - if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !lzc_zeroes) begin + if ((exponent_product_q - signed'({1'b0, lza_count_q}) + 1 >= 0) && !sum_lower_zero) begin // Undo initial product shift, remove the counted zeroes - norm_shamt = PRECISION_BITS + 2 + leading_zero_count; - normalized_exponent = exponent_product_q - leading_zero_count_sgn + 1; // account for shift + norm_shamt = PRECISION_BITS + 1 + lza_count_q; + normalized_exponent = exponent_product_q - signed'({1'b0, lza_count_q}) + 2; // account for shift // Subnormal result end else begin // Cap the shift distance to align mantissa with minimum exponent norm_shamt = unsigned'(signed'(PRECISION_BITS + 2 + exponent_product_q)); normalized_exponent = 0; // subnormals encoded as 0 + // Fix exponent in case of a normal number accidentally being classified as subnormal due + // to LZA over-prediction (LZA could be 1 larger than actual 
LZC) + if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !sum_lower_zero) begin + normalized_exponent = 1; + end end // Addend-anchored case end else begin diff --git a/vendor/cvw/fma/fmalza.sv b/vendor/cvw/fma/fmalza.sv new file mode 100644 index 00000000..e76727b8 --- /dev/null +++ b/vendor/cvw/fma/fmalza.sv @@ -0,0 +1,68 @@ +/////////////////////////////////////////// +// fmalza.sv +// +// Written: 6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu +// Modified: +// +// Purpose: Leading Zero Anticipator +// +// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.14) +// See also [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001] +// +// A component of the CORE-V-WALLY configurable RISC-V project. +// https://github.com/openhwgroup/cvw +// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module fmalza #(WIDTH, NF) ( // WIDTH: bits of the aligned addend A; NF: product Pm is 2*NF+2 bits
+  input  logic [WIDTH-1:0]           A,   // addend
+  input  logic [2*NF+1:0]            Pm,  // product
+  input  logic                       Cin, // carry in
+  input  logic                       sub, // subtraction
+  output logic [$clog2(WIDTH+1)-1:0] SCnt // normalization shift count for the positive result
+);
+
+  logic [WIDTH:0]   F;             // most significant bit of F indicates leading digit
+  logic [WIDTH-1:0] B;             // zero-extended product with same size as aligned A
+  logic [WIDTH-1:0] P, G, K;       // propagate, generate, kill for each column
+  logic [WIDTH-1:0] Pp1, Gm1, Km1; // propagate shifted right by 1, generate/kill shifted left 1
+  // Zero padding is sized from WIDTH (not a fixed NF+2) so the concatenation is exactly WIDTH bits
+  assign B = {{(WIDTH-2*NF-4){1'b0}}, Pm, 2'b0}; // zero-extended product; needs WIDTH >= 2*NF+4
+
+  assign P = A^B;   // propagate: exactly one of A, B is set in this column
+  assign G = A&B;   // generate: both A and B are set in this column
+  assign K = ~A&~B; // kill: neither A nor B is set in this column
+
+  assign Pp1 = {sub, P[WIDTH-1:1]};  // shift P right by 1 (for P_i+1), use subtract flag in most significant bit
+  assign Gm1 = {G[WIDTH-2:0], Cin};  // shift G left by 1 (for G_i-1) and bring in Cin
+  assign Km1 = {K[WIDTH-2:0], ~Cin}; // shift K left by 1 (for K_i-1) and bring in inverted Cin
+
+  // Apply function to determine Leading pattern
+  // - note: Schmookler01 uses the numbering system where 0 is the most significant bit
+  assign F[WIDTH] = ~sub&P[WIDTH-1];
+  assign F[WIDTH-1:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
+
+  lzc #(
+    .WIDTH ( WIDTH+1 ), // F is one bit wider than A and B
+    .MODE  ( 1       )  // MODE = 1 counts leading zeroes
+  ) i_lzc (
+    .in_i    ( F           ), // leading pattern vector
+    .cnt_o   ( SCnt        ), // leading-zero count of F is the anticipated shift count
+    .empty_o ( /*unused*/  )
+  );
+
+endmodule