From 796fcf45b04c10bd129005c7b46adb108e76b4d3 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 2 Dec 2025 14:45:15 +0100 Subject: [PATCH 01/17] hw: Add support for various VC implementations --- Bender.yml | 1 + hw/floo_nw_chimney.sv | 297 +++++++++++++++++++++++------- hw/floo_nw_router.sv | 26 ++- hw/floo_pkg.sv | 7 + hw/floo_rob_wrapper.sv | 3 +- hw/floo_router.sv | 37 +++- hw/floo_vc_arbiter.sv | 93 +++++++++- hw/include/floo_noc/typedef.svh | 54 ++++++ hw/synth/floo_synth_2tiles.sv | 213 +++++++++++++++++++++ hw/synth/floo_synth_params_pkg.sv | 7 +- 10 files changed, 644 insertions(+), 94 deletions(-) create mode 100644 hw/synth/floo_synth_2tiles.sv diff --git a/Bender.yml b/Bender.yml index 6562a73e..4c73307d 100644 --- a/Bender.yml +++ b/Bender.yml @@ -124,3 +124,4 @@ sources: - hw/synth/floo_synth_nw_chimney.sv - hw/synth/floo_synth_axi_router.sv - hw/synth/floo_synth_nw_router.sv + - hw/synth/floo_synth_2tiles.sv diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 47b1c40d..31ada007 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -32,6 +32,12 @@ module floo_nw_chimney #( /// Every atomic transactions needs to have a unique ID /// and one ID is reserved for non-atomic transactions parameter int unsigned MaxAtomicTxns = 1, + /// Enable support for decoupling read and write channels + parameter bit EnDecoupledRW = 1'b0, + /// Specify how many physical channels are used for the wide connection + parameter int unsigned NumWidePhysChannels = 1, + /// Specify which VC implementation to use for the wide channels + parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, /// Node ID type for routing parameter type id_t = logic, /// RoB index type for reordering. 
@@ -145,6 +151,14 @@ module floo_nw_chimney #( // For future extension, add an extra opcode in the user_struct_t typedef axi_addr_t user_mask_t ; + // Virtual channel enumeration + typedef enum logic { + READ = 1'b1, + WRITE = 1'b0 + } vc_e; + + localparam int unsigned NumVirtualChannels = EnDecoupledRW ? 2 : 1; + // Duplicate AXI port signals to degenerate ports // in case they are not used axi_narrow_req_t axi_narrow_req_in; @@ -175,9 +189,9 @@ module floo_nw_chimney #( // flit queue floo_req_chan_t floo_req_in; floo_rsp_chan_t floo_rsp_in; - floo_wide_chan_t floo_wide_in; - logic floo_req_in_valid, floo_rsp_in_valid, floo_wide_in_valid; - logic floo_req_out_ready, floo_rsp_out_ready, floo_wide_out_ready; + floo_wide_chan_t floo_wide_in_q; + logic floo_req_in_valid, floo_rsp_in_valid, floo_wide_in_valid_q; + logic floo_req_out_ready, floo_rsp_out_ready, floo_wide_out_ready_q; logic [NumNWAxiChannels-1:0] axi_valid_in, axi_ready_out; // Flit packing @@ -210,7 +224,8 @@ module floo_nw_chimney #( axi_wide_r_chan_t axi_wide_unpack_r; floo_req_generic_flit_t floo_req_unpack_generic; floo_rsp_generic_flit_t floo_rsp_unpack_generic; - floo_wide_generic_flit_t floo_wide_unpack_generic; + floo_wide_generic_flit_t floo_wide_unpack_generic_rd; + floo_wide_generic_flit_t floo_wide_unpack_generic_wr; // Meta Buffers axi_narrow_req_t axi_narrow_meta_buf_req_in; @@ -247,6 +262,55 @@ module floo_nw_chimney #( wide_meta_buf_t wide_aw_buf_hdr_in, wide_aw_buf_hdr_out; wide_meta_buf_t wide_ar_buf_hdr_in, wide_ar_buf_hdr_out; + // Virtual channel signals to decouple wide AW from wide AR + logic floo_wide_req_arb_gnt_in, floo_wide_req_arb_valid_out; + + ///////////////////////////// + // Virtual channel demux // + ///////////////////////////// + + // VCs must be demuxed *before* the spill registers to avoid + // head-of-line blocking between read and write channels. 
+ + floo_wide_chan_t floo_wide_in_wr, floo_wide_in_rd; + logic floo_wide_in_wr_valid, floo_wide_in_rd_valid; + logic floo_wide_out_wr_ready, floo_wide_out_rd_ready; + + floo_wide_chan_t floo_wide_in; + logic floo_wide_in_valid; + logic floo_wide_out_ready; + + if (EnDecoupledRW) begin : gen_vc_demux + assign floo_wide_in_wr_valid = floo_wide_i.valid[WRITE]; + assign floo_wide_in_rd_valid = floo_wide_i.valid[READ]; + assign floo_wide_o.ready[WRITE] = floo_wide_out_wr_ready; + assign floo_wide_o.ready[READ] = floo_wide_out_rd_ready; + if (NumWidePhysChannels == 1) begin : gen_single_phys_ch + // Connect the single physical channel to both read and write + // the valid and ready coming from the VCs will be used to know if the data can be used + assign floo_wide_in_wr = floo_wide_i.wide; + assign floo_wide_in_rd = floo_wide_i.wide; + + if (VcImplementation == floo_pkg::VcCreditBased) begin : gen_credit_support + // Drive credit signals for incoming requests + `FF(floo_wide_o.credit[WRITE], floo_wide_in_wr_valid & floo_wide_out_wr_ready, 1'b0); + `FF(floo_wide_o.credit[READ], floo_wide_in_rd_valid & floo_wide_out_rd_ready, 1'b0); + end else begin: gen_no_credit_support + assign floo_wide_o.credit = '0; + end + + end else if (NumWidePhysChannels == 2) begin : gen_dual_phys_ch + assign floo_wide_in_wr = floo_wide_i.wide[WRITE]; + assign floo_wide_in_rd = floo_wide_i.wide[READ]; + end else begin + $fatal(1, "NW CHIMNEY: Unsupported number of wide physical channels"); + end + end else begin : gen_no_vc_demux + assign floo_wide_in = floo_wide_i.wide; + assign floo_wide_in_valid = floo_wide_i.valid; + assign floo_wide_o.ready = floo_wide_out_ready; + end + /////////////////////// // Spill registers // /////////////////////// @@ -430,56 +494,77 @@ module floo_nw_chimney #( assign axi_wide_mask_queue = '0; end - if (ChimneyCfgN.CutRsp && ChimneyCfgW.CutRsp) begin : gen_rsp_cuts + spill_register #( .T ( floo_req_chan_t ), .Bypass ( !(ChimneyCfgN.CutRsp && ChimneyCfgW.CutRsp) 
) + ) i_narrow_data_req_arb ( + .clk_i, + .rst_ni, + .data_i ( floo_req_i.req ), + .valid_i ( floo_req_i.valid ), + .ready_o ( floo_req_o.ready ), + .data_o ( floo_req_in ), + .valid_o ( floo_req_in_valid ), + .ready_i ( floo_req_out_ready ) + ); + + spill_register #( + .T ( floo_rsp_chan_t ), + .Bypass ( !(ChimneyCfgN.CutRsp && ChimneyCfgW.CutRsp) ) + ) i_narrow_data_rsp_arb ( + .clk_i, + .rst_ni, + .data_i ( floo_rsp_i.rsp ), + .valid_i ( floo_rsp_i.valid ), + .ready_o ( floo_rsp_o.ready ), + .data_o ( floo_rsp_in ), + .valid_o ( floo_rsp_in_valid ), + .ready_i ( floo_rsp_out_ready ) + ); + + floo_wide_chan_t floo_wide_in_wr_q, floo_wide_in_rd_q; + logic floo_wide_in_wr_valid_q, floo_wide_in_rd_valid_q; + logic floo_wide_out_wr_ready_q, floo_wide_out_rd_ready_q; + + if (EnDecoupledRW) begin : gen_spill_vc spill_register #( - .T ( floo_req_chan_t ) - ) i_narrow_data_req_arb ( + .T ( floo_wide_chan_t ) + ) i_wide_wr_req_arb ( .clk_i, .rst_ni, - .data_i ( floo_req_i.req ), - .valid_i ( floo_req_i.valid ), - .ready_o ( floo_req_o.ready ), - .data_o ( floo_req_in ), - .valid_o ( floo_req_in_valid ), - .ready_i ( floo_req_out_ready ) + .data_i ( floo_wide_in_wr ), + .valid_i ( floo_wide_in_wr_valid ), + .ready_o ( floo_wide_out_wr_ready ), + .data_o ( floo_wide_in_wr_q ), + .valid_o ( floo_wide_in_wr_valid_q ), + .ready_i ( floo_wide_out_wr_ready_q ) ); - spill_register #( - .T ( floo_rsp_chan_t ) - ) i_narrow_data_rsp_arb ( + .T ( floo_wide_chan_t ) + ) i_wide_rd_req_arb ( .clk_i, .rst_ni, - .data_i ( floo_rsp_i.rsp ), - .valid_i ( floo_rsp_i.valid ), - .ready_o ( floo_rsp_o.ready ), - .data_o ( floo_rsp_in ), - .valid_o ( floo_rsp_in_valid ), - .ready_i ( floo_rsp_out_ready ) + .data_i ( floo_wide_in_rd ), + .valid_i ( floo_wide_in_rd_valid ), + .ready_o ( floo_wide_out_rd_ready ), + .data_o ( floo_wide_in_rd_q ), + .valid_o ( floo_wide_in_rd_valid_q ), + .ready_i ( floo_wide_out_rd_ready_q ) ); - + end else begin : gen_spill_wide spill_register #( - .T ( 
floo_wide_chan_t ) + .T ( floo_wide_chan_t ), + .Bypass ( !(ChimneyCfgN.CutRsp && ChimneyCfgW.CutRsp) ) ) i_wide_data_req_arb ( .clk_i, .rst_ni, - .data_i ( floo_wide_i.wide ), - .valid_i ( floo_wide_i.valid ), - .ready_o ( floo_wide_o.ready ), - .data_o ( floo_wide_in ), - .valid_o ( floo_wide_in_valid ), - .ready_i ( floo_wide_out_ready ) + .data_i ( floo_wide_in ), + .valid_i ( floo_wide_in_valid ), + .ready_o ( floo_wide_out_ready ), + .data_o ( floo_wide_in_q ), + .valid_o ( floo_wide_in_valid_q ), + .ready_i ( floo_wide_out_ready_q ) ); - - end else begin : gen_no_rsp_cuts - assign floo_req_in = floo_req_i.req; - assign floo_rsp_in = floo_rsp_i.rsp; - assign floo_wide_in = floo_wide_i.wide; - assign floo_req_in_valid = floo_req_i.valid; - assign floo_rsp_in_valid = floo_rsp_i.valid; - assign floo_wide_in_valid = floo_wide_i.valid; - assign floo_req_o.ready = floo_req_out_ready; - assign floo_rsp_o.ready = floo_rsp_out_ready; - assign floo_wide_o.ready = floo_wide_out_ready; end logic narrow_aw_out_queue_valid, narrow_aw_out_queue_ready; @@ -725,9 +810,9 @@ module floo_nw_chimney #( logic wide_r_rob_rob_req; logic wide_r_rob_last; rob_idx_t wide_r_rob_rob_idx; - assign wide_r_rob_rob_req = floo_wide_in.wide_r.hdr.rob_req; - assign wide_r_rob_rob_idx = floo_wide_in.wide_r.hdr.rob_idx; - assign wide_r_rob_last = floo_wide_in.wide_r.payload.last; + assign wide_r_rob_rob_req = floo_wide_in_rd_q.wide_r.hdr.rob_req; + assign wide_r_rob_rob_idx = floo_wide_in_rd_q.wide_r.hdr.rob_idx; + assign wide_r_rob_last = floo_wide_in_rd_q.wide_r.payload.last; floo_rob_wrapper #( .RoBType ( ChimneyCfgW.RRoBType ), @@ -1129,19 +1214,49 @@ module floo_nw_chimney #( .valid_o ( floo_rsp_o.valid ) ); - floo_wormhole_arbiter #( - .NumRoutes ( 3 ), - .flit_t ( floo_wide_generic_flit_t ) - ) i_wide_wormhole_arbiter ( - .clk_i, - .rst_ni, - .valid_i ( floo_wide_arb_req_in ), - .data_i ( floo_wide_arb_in ), - .ready_o ( floo_wide_arb_gnt_out ), - .data_o ( floo_wide_o.wide ), - .ready_i ( 
floo_wide_i.ready ), - .valid_o ( floo_wide_o.valid ) - ); + if (NumWidePhysChannels == 1) begin: gen_wide_out_wrmh + floo_wormhole_arbiter #( + .NumRoutes ( 3 ), + .flit_t ( floo_wide_generic_flit_t ) + ) i_wide_wormhole_arbiter ( + .clk_i, + .rst_ni, + .valid_i ( floo_wide_arb_req_in ), + .data_i ( floo_wide_arb_in ), + .ready_o ( floo_wide_arb_gnt_out ), + .data_o ( floo_wide_o.wide ), + .ready_i ( floo_wide_req_arb_gnt_in ), + .valid_o ( floo_wide_req_arb_valid_out ) + ); + + // Mux the valid of the read and write channels to the ACK/NACK protocol + // of the virtual channel for decoupled read and write output requests. + // AW/W -> Virtual Channel 0 + // R -> Virtual Channel 1 + // TODO(lleone): check if this really solve DEADLOCK!!!! + if (EnDecoupledRW) begin: gen_vc_rw_ack + assign floo_wide_o.valid[0] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_o.valid[1] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_req_arb_gnt_in = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? 
+ floo_wide_i.ready[0] : floo_wide_i.ready[1]; + end else begin: gen_no_vc_rw_ack + assign floo_wide_o.valid = floo_wide_req_arb_valid_out; + assign floo_wide_req_arb_gnt_in = floo_wide_i.ready; + end + end else if (NumWidePhysChannels == 2) begin: gen_wide_phys_ch + // Connect write channel + assign floo_wide_o.wide[0] = floo_wide_arb_in[WideW]; + assign floo_wide_o.valid[0] = floo_wide_arb_req_in[WideW]; + assign floo_wide_arb_gnt_out[WideW] = floo_wide_i.ready[0]; + + // Connect read channel + assign floo_wide_o.wide[1] = floo_wide_arb_in[WideR]; + assign floo_wide_o.valid[1] = floo_wide_arb_req_in[WideR]; + assign floo_wide_arb_gnt_out[WideR] = floo_wide_i.ready[1]; + + end else begin + $fatal(1, "NW CHIMNEY: Unsupported number of wide physical channels"); + end //////////////////// // FLIT UNPACKER // @@ -1163,14 +1278,13 @@ module floo_nw_chimney #( assign axi_narrow_unpack_ar = floo_req_in.narrow_ar.payload; assign axi_narrow_unpack_r = floo_rsp_in.narrow_r.payload; assign axi_narrow_unpack_b = floo_rsp_in.narrow_b.payload; - assign axi_wide_unpack_aw = floo_wide_in.wide_aw.payload; - assign axi_wide_unpack_w = floo_wide_in.wide_w.payload; + assign axi_wide_unpack_aw = floo_wide_in_wr_q.wide_aw.payload; + assign axi_wide_unpack_w = floo_wide_in_wr_q.wide_w.payload; assign axi_wide_unpack_ar = floo_req_in.wide_ar.payload; - assign axi_wide_unpack_r = floo_wide_in.wide_r.payload; + assign axi_wide_unpack_r = floo_wide_in_rd_q.wide_r.payload; assign axi_wide_unpack_b = floo_rsp_in.wide_b.payload; - assign floo_req_unpack_generic = floo_req_in.generic; - assign floo_rsp_unpack_generic = floo_rsp_in.generic; - assign floo_wide_unpack_generic = floo_wide_in.generic; + assign floo_req_unpack_generic = floo_req_in.generic; + assign floo_rsp_unpack_generic = floo_rsp_in.generic; assign axi_valid_in[NarrowAw] = floo_req_in_valid && @@ -1209,7 +1323,45 @@ module floo_nw_chimney #( assign floo_req_out_ready = axi_ready_out[floo_req_unpack_generic.hdr.axi_ch]; assign 
floo_rsp_out_ready = axi_ready_out[floo_rsp_unpack_generic.hdr.axi_ch]; - assign floo_wide_out_ready = axi_ready_out[floo_wide_unpack_generic.hdr.axi_ch]; + + // Flit unpacking on the wide interface + if (EnDecoupledRW) begin + + assign floo_wide_unpack_generic_wr = floo_wide_in_wr_q.generic; + assign floo_wide_unpack_generic_rd = floo_wide_in_rd_q.generic; + + // Directly connect read VC to AXI R channel + assign axi_valid_in[WideR] = ChimneyCfgW.EnMgrPort && floo_wide_in_rd_valid_q; + assign floo_wide_out_rd_ready_q = axi_ready_out[WideR]; + + // Demux write VC to AXI AW and W channels + stream_demux #( + .N_OUP(NumVirtualChannels) + ) i_wide_wr_flit_demux ( + .inp_valid_i(floo_wide_in_wr_valid_q), + .inp_ready_o(floo_wide_out_wr_ready_q), + .oup_sel_i (floo_wide_unpack_generic_wr.hdr.axi_ch == WideAw ? 1'b1 : 1'b0), + .oup_valid_o({axi_valid_in[WideAw], axi_valid_in[WideW]}), + .oup_ready_i({axi_ready_out[WideAw], axi_ready_out[WideW]}) + ); + + end else begin + + // Demux single physical channel to AXI AW, W and R channels + assign floo_wide_out_ready_q = axi_ready_out[floo_wide_in_q.generic.hdr.axi_ch]; + assign axi_valid_in[WideR] = ChimneyCfgW.EnMgrPort && floo_wide_in_valid_q && + (floo_wide_in_q.generic.hdr.axi_ch == WideR); + assign axi_valid_in[WideAw] = floo_wide_in_valid_q && + (floo_wide_in_q.generic.hdr.axi_ch == WideAw); + assign axi_valid_in[WideW] = floo_wide_in_valid_q && + (floo_wide_in_q.generic.hdr.axi_ch == WideW); + + // Aliases to uniformly write downstream logic handling both cases, with and without VCs + assign floo_wide_unpack_generic_wr = floo_wide_in_q.generic; + assign floo_wide_unpack_generic_rd = floo_wide_in_q.generic; + assign floo_wide_in_rd_valid_q = floo_wide_in_valid_q; + assign floo_wide_in_wr_valid_q = floo_wide_in_valid_q; + end ///////////////////////////// // AXI req/rsp generation // @@ -1277,7 +1429,7 @@ module floo_nw_chimney #( }; assign wide_aw_buf_hdr_in = '{ id: axi_wide_unpack_aw.id, - hdr: 
floo_wide_unpack_generic.hdr + hdr: floo_wide_unpack_generic_wr.hdr }; assign wide_ar_buf_hdr_in = '{ id: axi_wide_unpack_ar.id, @@ -1436,8 +1588,8 @@ module floo_nw_chimney #( (floo_rsp_unpack_generic.hdr.axi_ch == NarrowR))) `ASSERT(NoWideMgrPortBResponse, ChimneyCfgW.EnMgrPort || !(floo_rsp_in_valid && (floo_rsp_unpack_generic.hdr.axi_ch == WideB))) - `ASSERT(NoWideMgrPortRResponse, ChimneyCfgW.EnMgrPort || !(floo_wide_in_valid && - (floo_wide_unpack_generic.hdr.axi_ch == WideR))) + `ASSERT(NoWideMgrPortRResponse, ChimneyCfgW.EnMgrPort || !(floo_wide_in_rd_valid_q && + (floo_wide_unpack_generic_rd.hdr.axi_ch == WideR))) // Network Interface cannot accept any AW, AR and W requests if `En*SbrPort` is not set `ASSERT(NoNarrowSbrPortAwRequest, ChimneyCfgN.EnSbrPort || !(floo_req_in_valid && (floo_req_unpack_generic.hdr.axi_ch == NarrowAw))) @@ -1452,4 +1604,15 @@ module floo_nw_chimney #( `ASSERT(NoWideSbrPortWRequest, ChimneyCfgW.EnSbrPort || !(floo_wide_in_valid && (floo_wide_unpack_generic.hdr.axi_ch == WideW))) + // When virtual channels for decoupled read and write is enabled, + // req_i and req_o must have same amount of VCs, equal to NumVirtualChannels + `ASSERT_INIT(VCMismatchInputReady, !EnDecoupledRW | ($bits(floo_wide_i.ready) == NumVirtualChannels), + $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchOutputReady, !EnDecoupledRW | ($bits(floo_wide_o.ready) == NumVirtualChannels), + $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchInputValid, !EnDecoupledRW | ($bits(floo_wide_i.valid) == NumVirtualChannels), + $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchOutputValid, !EnDecoupledRW | ($bits(floo_wide_o.valid) == NumVirtualChannels), + $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + endmodule diff --git 
a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 8e9b144e..76fa8323 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -29,6 +29,11 @@ module floo_nw_router #( /// Disable illegal connections in router /// (only applies for `RouteAlgo == XYRouting`) parameter bit XYRouteOpt = 1'b1, + /// Enable decoupling between Read and Write WIDE channels using virtual channels + /// assumed that write transactions are alwasy on VC0. + parameter int unsigned NumWideVirtChannels = 32'd1, + parameter int unsigned NumWidePhysChannels = 32'd1, + parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, /// Enable multicast feature parameter bit EnMultiCast = 1'b0, /// Node ID type @@ -87,13 +92,15 @@ module floo_nw_router #( floo_rsp_chan_t [NumInputs-1:0] rsp_out; floo_req_chan_t [NumOutputs-1:0] req_out; floo_rsp_chan_t [NumOutputs-1:0] rsp_in; - floo_wide_chan_t [NumRoutes-1:0] wide_in, wide_out; + floo_wide_chan_t [NumRoutes-1:0][NumWidePhysChannels-1:0] wide_in; + floo_wide_chan_t [NumRoutes-1:0][NumWidePhysChannels-1:0] wide_out; logic [NumInputs-1:0] req_valid_in, req_ready_out; logic [NumInputs-1:0] rsp_valid_out, rsp_ready_in; logic [NumOutputs-1:0] req_valid_out, req_ready_in; logic [NumOutputs-1:0] rsp_valid_in, rsp_ready_out; - logic [NumRoutes-1:0] wide_valid_in, wide_valid_out; - logic [NumRoutes-1:0] wide_ready_in, wide_ready_out; + logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_valid_in, wide_valid_out; + logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_ready_in, wide_ready_out; + logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_credit_in, wide_credit_out; for (genvar i = 0; i < NumInputs; i++) begin : gen_chimney_req assign req_valid_in[i] = floo_req_i[i].valid; @@ -120,6 +127,8 @@ module floo_nw_router #( assign floo_wide_o[i].valid = wide_valid_out[i]; assign wide_ready_in[i] = floo_wide_i[i].ready; assign floo_wide_o[i].wide = wide_out[i]; + assign floo_wide_o[i].credit = wide_credit_out[i]; + assign wide_credit_in[i] = 
floo_wide_i[i].credit; end floo_router #( @@ -147,6 +156,7 @@ module floo_nw_router #( .valid_i ( req_valid_in ), .ready_o ( req_ready_out ), .data_i ( req_in ), + .credit_i ( '0 ), .valid_o ( req_valid_out ), .ready_i ( req_ready_in ), .data_o ( req_out ) @@ -198,6 +208,7 @@ module floo_nw_router #( .valid_i ( rsp_valid_in ), .ready_o ( rsp_ready_out ), .data_i ( rsp_in ), + .credit_i ( '0 ), .valid_o ( rsp_valid_out ), .ready_i ( rsp_ready_in ), .data_o ( rsp_out ) @@ -206,14 +217,15 @@ module floo_nw_router #( floo_router #( .NumRoutes ( NumRoutes ), - .NumPhysChannels ( 1 ), - .NumVirtChannels ( 1 ), + .NumPhysChannels ( NumWidePhysChannels ), + .NumVirtChannels ( NumWideVirtChannels ), .InFifoDepth ( InFifoDepth ), .OutFifoDepth ( OutFifoDepth ), .RouteAlgo ( RouteAlgo ), .XYRouteOpt ( XYRouteOpt ), .NumAddrRules ( NumAddrRules ), .NoLoopback ( 1'b1 ), + .VcImplementation ( VcImplementation ), .EnMultiCast ( EnMultiCast ), .EnReduction ( 1'b0 ), .id_t ( id_t ), @@ -228,9 +240,11 @@ module floo_nw_router #( .valid_i ( wide_valid_in ), .ready_o ( wide_ready_out ), .data_i ( wide_in ), + .credit_i ( wide_credit_in ), .valid_o ( wide_valid_out ), .ready_i ( wide_ready_in ), - .data_o ( wide_out ) + .data_o ( wide_out ), + .credit_o ( wide_credit_out ) ); endmodule diff --git a/hw/floo_pkg.sv b/hw/floo_pkg.sv index 17e2b94d..625c3cc4 100644 --- a/hw/floo_pkg.sv +++ b/hw/floo_pkg.sv @@ -83,6 +83,13 @@ package floo_pkg; NumAxiChannels = 3'd5 } axi_ch_e; + /// Virtual channel implementation types + typedef enum logic[1:0] { + VcNaive = 2'd0, + VcCreditBased = 2'd1, + VcPreemptValid = 2'd2 + } vc_impl_e; + /// The types of collective communication typedef enum logic [1:0] { /// Normal communication diff --git a/hw/floo_rob_wrapper.sv b/hw/floo_rob_wrapper.sv index 9e435e0f..35185f3b 100644 --- a/hw/floo_rob_wrapper.sv +++ b/hw/floo_rob_wrapper.sv @@ -168,7 +168,8 @@ module floo_rob_wrapper .inject_axi_id_i ( '0 ), .inject_i ( 1'b0 ), .pop_axi_id_i ( rsp_i.id ), - 
.pop_i ( pop && rsp_ready_i ) // Only pop on handshake + .pop_i ( pop && rsp_ready_i ), // Only pop on handshake + .any_outstanding_trx_o ( ) ); end else begin : gen_error diff --git a/hw/floo_router.sv b/hw/floo_router.sv index cea85e11..70af90ed 100644 --- a/hw/floo_router.sv +++ b/hw/floo_router.sv @@ -38,6 +38,8 @@ module floo_router parameter bit XYRouteOpt = 1'b1, /// Disables loopback connections parameter bit NoLoopback = 1'b1, + /// Select VC implementation + parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, /// Enable Multicast feature parameter bit EnMultiCast = 1'b0, /// Enable reduction feature @@ -60,10 +62,12 @@ module floo_router input logic [NumInput-1:0][NumVirtChannels-1:0] valid_i, output logic [NumInput-1:0][NumVirtChannels-1:0] ready_o, input flit_t [NumInput-1:0][NumPhysChannels-1:0] data_i, + output logic [NumInput-1:0][NumVirtChannels-1:0] credit_o, /// Output channels output logic [NumOutput-1:0][NumVirtChannels-1:0] valid_o, input logic [NumOutput-1:0][NumVirtChannels-1:0] ready_i, - output flit_t [NumOutput-1:0][NumPhysChannels-1:0] data_o + output flit_t [NumOutput-1:0][NumPhysChannels-1:0] data_o, + input logic [NumOutput-1:0][NumVirtChannels-1:0] credit_i ); // TODO MICHAERO: assert NumPhysChannels <= NumVirtChannels @@ -73,6 +77,9 @@ module floo_router logic [NumInput-1:0][NumVirtChannels-1:0][NumOutput-1:0] route_mask; + // Credit generation for virtual channel support + logic [NumInput-1:0][NumVirtChannels-1:0] credit_gnt_q, credit_gnt_d; + // Router input part for (genvar in = 0; in < NumInput; in++) begin : gen_input for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt_input @@ -128,6 +135,14 @@ module floo_router .route_sel_id_o ( ) ); + // Credit count generation. 
Assign 1 upon any handshake + if (VcImplementation == floo_pkg::VcCreditBased) begin: gen_credit_support + assign credit_o[in][v] = credit_gnt_q[in][v]; + assign credit_gnt_d[in][v] = in_valid[in][v] & in_ready[in][v]; + `FF(credit_gnt_q[in][v], credit_gnt_d[in][v], 1'b0); + end else begin: gen_no_credit + assign credit_o[in][v] = 1'b1; + end end end @@ -266,20 +281,22 @@ module floo_router // Arbitrate virtual channels onto the physical channel floo_vc_arbiter #( - .NumVirtChannels ( NumVirtChannels ), - .flit_t ( flit_t ), - .NumPhysChannels ( NumPhysChannels ) + .NumVirtChannels ( NumVirtChannels ), + .flit_t ( flit_t ), + .NumPhysChannels ( NumPhysChannels ), + .VcImplementation ( VcImplementation ) ) i_vc_arbiter ( .clk_i, .rst_ni, - .valid_i ( out_buffered_valid[out] ), - .ready_o ( out_buffered_ready[out] ), - .data_i ( out_buffered_data [out] ), + .valid_i ( out_buffered_valid[out] ), + .ready_o ( out_buffered_ready[out] ), + .data_i ( out_buffered_data [out] ), - .ready_i ( ready_i [out] ), - .valid_o ( valid_o [out] ), - .data_o ( data_o [out] ) + .ready_i ( ready_i [out] ), + .valid_o ( valid_o [out] ), + .data_o ( data_o [out] ), + .credit_i ( credit_i[out] ) ); end diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index 8b5de592..b2f9e1d5 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -4,12 +4,17 @@ // // Tim Fischer +`include "common_cells/assertions.svh" +`include "common_cells/registers.svh" + /// A virtual channel arbiter module floo_vc_arbiter import floo_pkg::*; #( - parameter int unsigned NumVirtChannels = 1, - parameter type flit_t = logic, - parameter int unsigned NumPhysChannels = 1 + parameter int unsigned NumVirtChannels = 1, + parameter type flit_t = logic, + parameter int unsigned NumPhysChannels = 1, + parameter vc_impl_e VcImplementation = VcNaive, + parameter int unsigned NumCredits = 3 ) ( input logic clk_i, input logic rst_ni, @@ -20,7 +25,8 @@ module floo_vc_arbiter import floo_pkg::*; /// Ports towards 
the physical channels input logic [NumVirtChannels-1:0] ready_i, output logic [NumVirtChannels-1:0] valid_o, - output flit_t [NumPhysChannels-1:0] data_o + output flit_t [NumPhysChannels-1:0] data_o, + input logic [NumVirtChannels-1:0] credit_i ); if (NumVirtChannels == NumPhysChannels) begin : gen_virt_eq_phys @@ -35,10 +41,53 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys logic [NumVirtChannels-1:0] vc_arb_req_in; logic vc_arb_req_out, vc_arb_gnt_in; - // A Virtual channel is only considered for arbitration if the virtual - // channel holds valid data `valid_i` and the next router is ready to - // receive data on this virtual channel `ready_i`. - assign vc_arb_req_in = valid_i & ready_i; + // Signals to support credit based arbitration + logic [NumVirtChannels-1:0] credit_handshake, credit_left; + + // Mask used to switch VC requests + logic [NumVirtChannels-1:0] mask_q, mask_d; + + //////////////////////////////// + // Lock and mask update logic // + //////////////////////////////// + + always_comb begin + mask_d = mask_q; + // If we have a valid request but no grant, and the + // other VC has a grant, switch VC in the next cycle + if (vc_arb_req_out) begin + if (!vc_arb_gnt_in) begin + if (ready_i[vc_arb_idx ? 0 : 1] && valid_i[vc_arb_idx ? 0 : 1]) mask_d = ~(1'b1 << vc_arb_idx); + end else begin + mask_d = '1; + end + end + end + + `FF(mask_q, mask_d, '1, clk_i, rst_ni) + + ////////////////////////// + // VC arbitration logic // + ////////////////////////// + + if (VcImplementation == VcPreemptValid) begin : gen_preempt_valid + // Initially, any valid channel can request access to the physical channel. + // However, to guarantee deadlock freedom, we must be able to preempt the + // virtual channel holding the physical link and put the other channel on + // the link. To do so, we mask the VC holding the link, when required. 
+ assign vc_arb_req_in = valid_i & mask_q; + end else if (VcImplementation == VcCreditBased) begin : gen_credit_based + // In case of credit based approach, the valid is set only if there are credits left + assign vc_arb_req_in = valid_i & credit_left; + end else begin : gen_standard + // A Virtual channel is only considered for arbitration if the virtual + // channel holds valid data `valid_i` and the next router is ready to + // receive data on this virtual channel `ready_i`. + assign vc_arb_req_in = valid_i & ready_i; + end + + // A credit is taken only after handshake has occured + assign credit_handshake = valid_o & ready_o; // The arbitration tree only accepts a single grant signal. Therefore, // The grant of the channel that has won the arbitration is forwarded @@ -68,6 +117,25 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys .data_o ( data_o ), .idx_o ( vc_arb_idx ) ); + + if (VcImplementation == VcCreditBased) begin: gen_credit + for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_vc_credits + credit_counter #( + .NumCredits(NumCredits) + ) i_vc_credit_counter ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .credit_o ( /* unused */ ), + .credit_give_i ( credit_i[v] ), + .credit_take_i ( credit_handshake[v] ), + .credit_init_i ( 1'b0 ), + .credit_left_o ( credit_left[v] ), + .credit_crit_o ( /* unused */ ), + .credit_full_o ( /* unused */ ) + ); + end + end + end else begin : gen_odd_phys $fatal(1, "unimplemented!"); @@ -75,5 +143,14 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys end + //////////////// + // Assertions // + //////////////// + + // Only one VC can access the physical link at a time + `ASSERT(OneHotOutputValid, $onehot0(valid_o)) + + // Currently only supports two virtual channels + `ASSERT_INIT(SupportedNumVirtChannels, NumVirtChannels <= 2) endmodule diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh index 478315d9..c7c64513 100644 --- a/hw/include/floo_noc/typedef.svh +++ 
b/hw/include/floo_noc/typedef.svh @@ -288,6 +288,32 @@ floo_``chan_name``_chan_t ``chan_name``; \ } floo_``name``_t; + //////////////////////////////////////////////////////////////////////////////////////////////////// +// Defines the all the link types with a ready-valid handshaking interface +// It support virtual channeling by extending the handshakes +// +// Arguments: +// - name: Name of the link type +// - chan_name: Name of the channel type to transport +// - vc_num: Number of virtual channels +// - phy_num: Number of physical channels +// +// Assumption: vc_num >= phy_num +// +// Usage Example: +// localparam floo_pkg::axi_cfg_t AxiCfg = '{...}; +// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) +// `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) +// `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, req, rsp, my_axi_in, AxiCfg, hdr_t) +// FLOO_TYPEDEF_VIRT_CHAN_LINK_T(req, my_axi, 1, 1) +// +`define FLOO_TYPEDEF_VIRT_CHAN_LINK_T(name, chan_name, vc_num, phy_num) \ + typedef struct packed { \ + logic [vc_num-1:0] valid; \ + logic [vc_num-1:0] ready; \ + logic [vc_num-1:0] credit; \ + floo_``chan_name``_chan_t [phy_num-1:0] ``chan_name``; \ + } floo_``name``_t; //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with credit-based flow control interface @@ -357,6 +383,34 @@ `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan) \ `FLOO_TYPEDEF_LINK_T(wide, wide_chan) + //////////////////////////////////////////////////////////////////////////////////////////////////// +// Defines the all the link types with ready-valid handshaking interface +// for a narrow-wide AXI interface configuration which implements a simple +// virtual channeling. 
+// +// Arguments: +// - req: Name of the `req` link type +// - rsp: Name of the `rsp` link type +// - wide: Name of the `wide` link type +// - req_chan: Name of the `req` channel type to transport +// - rsp_chan: Name of the `rsp` channel type to transport +// - wide_chan: Name of the `wide` channel type to transport +// - req_virt_chan: Number of virtual channels for the narrow link +// - wide_virt_chan: Number of virtual channels for the wide link +// +// Usage Example: +// localparam floo_pkg::axi_cfg_t AxiCfgN = '{...}; +// localparam floo_pkg::axi_cfg_t AxiCfgW = '{...}; +// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) +// `FLOO_TYPEDEF_AXI_FROM_CFG(my_narrow_axi, AxiCfgN) +// `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW) +// `FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) +// `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, my_req, my_rsp, my_wide, 1, 2, 2) +`define FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, req_virt_chan, wide_virt_chan, wide_phys_chan) \ + `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(req, req_chan, req_virt_chan, req_virt_chan) \ + `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(rsp, rsp_chan, 1, 1) \ + `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(wide, wide_chan, wide_virt_chan, wide_phys_chan) //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with credit-based flow control interface // for a single AXI interface configuration diff --git a/hw/synth/floo_synth_2tiles.sv b/hw/synth/floo_synth_2tiles.sv new file mode 100644 index 00000000..13c5a49b --- /dev/null +++ b/hw/synth/floo_synth_2tiles.sv @@ -0,0 +1,213 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Lorenzo Leone +// +// This wrapper wants to simulate a floorplan with 2 tiles. 
It basically connects the two routers +// west <-> east ports together. For this reason, at the interface there will be twice the number of +// ports compared to a single tile. +// +module floo_synth_nw_2tiles + import floo_pkg::*; + import floo_synth_params_pkg::*; + import floo_synth_nw_pkg::*; + import floo_synth_collective_pkg::*; +#( + parameter int unsigned NumPorts = int'(floo_pkg::NumDirections), + parameter int unsigned NumWideVirtChannel = 1, + parameter int unsigned NumWidePhysChannel = 1, + parameter int unsigned VcImpl = 32'd0 +) ( + input logic clk_i, + input logic rst_ni, + input logic test_enable_i, + + input id_t id_1_i, + input logic id_route_map_1_i, + + input id_t id_0_i, + input logic id_route_map_0_i, + + input floo_req_t [NumPorts-2:0] floo_req_1_i, + input floo_rsp_t [NumPorts-2:0] floo_rsp_1_i, + output floo_req_t [NumPorts-2:0] floo_req_1_o, + output floo_rsp_t [NumPorts-2:0] floo_rsp_1_o, + input floo_wide_t [NumPorts-2:0] floo_wide_1_i, + output floo_wide_t [NumPorts-2:0] floo_wide_1_o, + input floo_req_t [NumPorts-2:0] floo_req_0_i, + input floo_rsp_t [NumPorts-2:0] floo_rsp_0_i, + output floo_req_t [NumPorts-2:0] floo_req_0_o, + output floo_rsp_t [NumPorts-2:0] floo_rsp_0_o, + input floo_wide_t [NumPorts-2:0] floo_wide_0_i, + output floo_wide_t [NumPorts-2:0] floo_wide_0_o +); + +// Intermediate signals to connect the two virtual tiles +floo_req_t [NumPorts-1:0] floo_req_1_in; +floo_rsp_t [NumPorts-1:0] floo_rsp_1_in; +floo_req_t [NumPorts-1:0] floo_req_1_out; +floo_rsp_t [NumPorts-1:0] floo_rsp_1_out; +floo_wide_t [NumPorts-1:0] floo_wide_1_in; +floo_wide_t [NumPorts-1:0] floo_wide_1_out; + +floo_req_t [NumPorts-1:0] floo_req_0_in; +floo_rsp_t [NumPorts-1:0] floo_rsp_0_in; +floo_req_t [NumPorts-1:0] floo_req_0_out; +floo_rsp_t [NumPorts-1:0] floo_rsp_0_out; +floo_wide_t [NumPorts-1:0] floo_wide_0_in; +floo_wide_t [NumPorts-1:0] floo_wide_0_out; + +localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::vc_impl_e'(VcImpl); + +// 
Tile 1 +for (genvar p = 0; p < NumPorts; p++) begin + if (p != West) begin + assign floo_req_1_in[p] = floo_req_1_i[tile1_idx_map(p)]; + assign floo_rsp_1_in[p] = floo_rsp_1_i[tile1_idx_map(p)]; + assign floo_rsp_1_o[tile1_idx_map(p)] = floo_rsp_1_out[p]; + assign floo_req_1_o[tile1_idx_map(p)] = floo_req_1_out[p]; + + assign floo_wide_1_in[p] = floo_wide_1_i[tile1_idx_map(p)]; + assign floo_wide_1_o[tile1_idx_map(p)] = floo_wide_1_out[p]; + end +end + +// Tile 0 +for (genvar p = 0; p < NumPorts; p++) begin + if (p != East) begin + assign floo_req_0_in[p] = floo_req_0_i[tile0_idx_map(p)]; + assign floo_rsp_0_in[p] = floo_rsp_0_i[tile0_idx_map(p)]; + assign floo_rsp_0_o[tile0_idx_map(p)] = floo_rsp_0_out[p]; + assign floo_req_0_o[tile0_idx_map(p)] = floo_req_0_out[p]; + + assign floo_wide_0_in[p] = floo_wide_0_i[tile0_idx_map(p)]; + assign floo_wide_0_o[tile0_idx_map(p)] = floo_wide_0_out[p]; + end +end + +assign floo_req_0_in[East] = floo_req_1_out[West]; +assign floo_rsp_1_in[West] = floo_rsp_0_out[East]; +assign floo_req_1_in[West] = floo_req_0_out[East]; +assign floo_rsp_0_in[East] = floo_rsp_1_out[West]; + +assign floo_wide_0_in[East] = floo_wide_1_out[West]; +assign floo_wide_1_in[West] = floo_wide_0_out[East]; + +floo_nw_router #( + .AxiCfgN ( AxiCfgN ), + .AxiCfgW ( AxiCfgW ), + .RouteAlgo ( RouteCfg.RouteAlgo ), + .NumRoutes ( NumPorts ), + .NumAddrRules ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .XYRouteOpt ( 1'b0 ), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), + .VcImplementation (VcImplementation), + .id_t ( id_t ), + .hdr_t ( hdr_t ), + .floo_req_t ( floo_req_t ), + .floo_rsp_t ( floo_rsp_t ), + .floo_wide_t ( floo_wide_t ) +) i_floo_nw_router1 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_enable_i ( test_enable_i ), + .id_i ( id_1_i ), + .id_route_map_i ( id_route_map_1_i ), + .floo_req_i ( floo_req_1_in ), + .floo_rsp_i ( floo_rsp_1_in ), + .floo_req_o ( floo_req_1_out 
), + .floo_rsp_o ( floo_rsp_1_out ), + .floo_wide_i ( floo_wide_1_in ), + .floo_wide_o ( floo_wide_1_out ), + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () +); + +floo_nw_router #( + .AxiCfgN ( AxiCfgN ), + .AxiCfgW ( AxiCfgW ), + .RouteAlgo ( RouteCfg.RouteAlgo ), + .NumRoutes ( NumPorts ), + .NumAddrRules ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .XYRouteOpt ( 1'b0 ), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), + .VcImplementation (VcImplementation), + .id_t ( id_t ), + .hdr_t ( hdr_t ), + .floo_req_t ( floo_req_t ), + .floo_rsp_t ( floo_rsp_t ), + .floo_wide_t ( floo_wide_t ) +) i_floo_nw_router0 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_enable_i ( test_enable_i ), + .id_i ( id_0_i ), + .id_route_map_i ( id_route_map_0_i ), + .floo_req_i ( floo_req_0_in ), + .floo_rsp_i ( floo_rsp_0_in ), + .floo_req_o ( floo_req_0_out ), + .floo_rsp_o ( floo_rsp_0_out ), + .floo_wide_i ( floo_wide_0_in ), + .floo_wide_o ( floo_wide_0_out ), + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + 
.offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () +); + +function automatic int tile0_idx_map(route_direction_e dir); + case (dir) + North: return 0; + // East: return 1; + South: return 1; + West: return 2; + Eject: return 3; + endcase +endfunction + +function automatic int tile1_idx_map(route_direction_e dir); + case (dir) + North: return 0; + East: return 1; + South: return 2; + // West: return 2; + Eject: return 3; + endcase +endfunction + +endmodule diff --git a/hw/synth/floo_synth_params_pkg.sv b/hw/synth/floo_synth_params_pkg.sv index 05cda8f0..db5ffd31 100644 --- a/hw/synth/floo_synth_params_pkg.sv +++ b/hw/synth/floo_synth_params_pkg.sv @@ -8,6 +8,7 @@ `include "floo_noc/typedef.svh" package floo_synth_params_pkg; + import floo_pkg::*; // Router parameters localparam int unsigned InFifoDepth = 2; @@ -16,7 +17,7 @@ package floo_synth_params_pkg; // Default route config for testing localparam floo_pkg::route_cfg_t RouteCfg = '{ RouteAlgo: floo_pkg::XYRouting, - UseIdTable: 0, + UseIdTable: 1, XYAddrOffsetX: 16, XYAddrOffsetY: 20, default: '0 // Potentially enable Multicast features @@ -96,7 +97,9 @@ package floo_synth_nw_pkg; `FLOO_TYPEDEF_AXI_FROM_CFG(axi_wide, AxiCfgW) `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) - `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) + // `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) + // Enable the following VC LINK when you want to experiment the use of virtual channels in collective + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2, 1) endpackage From fbc948d2125f3d7b0778940eeb41933f492a85e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 2 Dec 2025 18:22:44 +0100 Subject: [PATCH 02/17] fix VCS simulation --- hw/floo_nw_chimney.sv | 12 +++--------- hw/floo_rob_wrapper.sv | 3 +-- 
hw/floo_vc_arbiter.sv | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 31ada007..c89f94f7 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -1301,12 +1301,6 @@ module floo_nw_chimney #( (floo_rsp_unpack_generic.hdr.axi_ch == NarrowR); assign axi_valid_in[WideB] = ChimneyCfgW.EnMgrPort && floo_rsp_in_valid && (floo_rsp_unpack_generic.hdr.axi_ch == WideB); - assign axi_valid_in[WideAw] = floo_wide_in_valid && - (floo_wide_unpack_generic.hdr.axi_ch == WideAw); - assign axi_valid_in[WideW] = floo_wide_in_valid && - (floo_wide_unpack_generic.hdr.axi_ch == WideW); - assign axi_valid_in[WideR] = ChimneyCfgW.EnMgrPort && floo_wide_in_valid && - (floo_wide_unpack_generic.hdr.axi_ch == WideR); assign axi_ready_out[NarrowAw] = axi_narrow_meta_buf_rsp_out.aw_ready; assign axi_ready_out[NarrowW] = axi_narrow_meta_buf_rsp_out.w_ready; @@ -1325,7 +1319,7 @@ module floo_nw_chimney #( assign floo_rsp_out_ready = axi_ready_out[floo_rsp_unpack_generic.hdr.axi_ch]; // Flit unpacking on the wide interface - if (EnDecoupledRW) begin + if (EnDecoupledRW) begin: gen_mux_decouple_rdwr assign floo_wide_unpack_generic_wr = floo_wide_in_wr_q.generic; assign floo_wide_unpack_generic_rd = floo_wide_in_rd_q.generic; @@ -1345,7 +1339,7 @@ module floo_nw_chimney #( .oup_ready_i({axi_ready_out[WideAw], axi_ready_out[WideW]}) ); - end else begin + end else begin:gen_nomux_decouple_rdwr // Demux single physical channel to AXI AW, W and R channels assign floo_wide_out_ready_q = axi_ready_out[floo_wide_in_q.generic.hdr.axi_ch]; @@ -1602,7 +1596,7 @@ module floo_nw_chimney #( `ASSERT(NoWideSbrPortArRequest, ChimneyCfgW.EnSbrPort || !(floo_req_in_valid && (floo_req_unpack_generic.hdr.axi_ch == WideAr))) `ASSERT(NoWideSbrPortWRequest, ChimneyCfgW.EnSbrPort || !(floo_wide_in_valid && - (floo_wide_unpack_generic.hdr.axi_ch == WideW))) + (floo_wide_unpack_generic_wr.hdr.axi_ch == WideW))) // When virtual channels for 
decoupled read and write is enabled, // req_i and req_o must have same amount of VCs, equal to NumVirtualChannels diff --git a/hw/floo_rob_wrapper.sv b/hw/floo_rob_wrapper.sv index 35185f3b..9e435e0f 100644 --- a/hw/floo_rob_wrapper.sv +++ b/hw/floo_rob_wrapper.sv @@ -168,8 +168,7 @@ module floo_rob_wrapper .inject_axi_id_i ( '0 ), .inject_i ( 1'b0 ), .pop_axi_id_i ( rsp_i.id ), - .pop_i ( pop && rsp_ready_i ), // Only pop on handshake - .any_outstanding_trx_o ( ) + .pop_i ( pop && rsp_ready_i ) // Only pop on handshake ); end else begin : gen_error diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index b2f9e1d5..644bbc2e 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -151,6 +151,6 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys `ASSERT(OneHotOutputValid, $onehot0(valid_o)) // Currently only supports two virtual channels - `ASSERT_INIT(SupportedNumVirtChannels, NumVirtChannels <= 2) + `ASSERT_INIT(SupportedNumVirtChannels, !VcImplementation || (NumVirtChannels <= 2)) endmodule From be1020e36b79e77c0f01da9c221bffba899753c3 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 11:46:11 +0100 Subject: [PATCH 03/17] hw: Tie credit to zero at req/rsp interface --- hw/floo_nw_router.sv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 76fa8323..dc40b6c9 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -105,6 +105,7 @@ module floo_nw_router #( for (genvar i = 0; i < NumInputs; i++) begin : gen_chimney_req assign req_valid_in[i] = floo_req_i[i].valid; assign floo_req_o[i].ready = req_ready_out[i]; + assign floo_req_o[i].credit = '0; // Narrow links never rely on credit based flow assign req_in[i] = floo_req_i[i].req; assign floo_rsp_o[i].valid = rsp_valid_out[i]; assign rsp_ready_in[i] = floo_rsp_i[i].ready; @@ -118,6 +119,7 @@ module floo_nw_router #( assign rsp_valid_in[i] = floo_rsp_i[i].valid; assign floo_rsp_o[i].ready = rsp_ready_out[i]; assign 
rsp_in[i] = floo_rsp_i[i].rsp; + assign floo_rsp_o[i].credit = '0; // Narrow links never rely on credit based flow end for (genvar i = 0; i < NumRoutes; i++) begin : gen_chimney_wide From d60b67c0e9392ef80e3740a02c71c2ad80f760dc Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 14:51:04 +0100 Subject: [PATCH 04/17] hw: Fix drive of unpacked flit with decoupling --- hw/floo_nw_chimney.sv | 28 +++++++++++++++++----------- hw/floo_nw_router.sv | 6 ++++-- hw/include/floo_noc/typedef.svh | 19 ++++++++++--------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index c89f94f7..621ed1e6 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -810,9 +810,12 @@ module floo_nw_chimney #( logic wide_r_rob_rob_req; logic wide_r_rob_last; rob_idx_t wide_r_rob_rob_idx; - assign wide_r_rob_rob_req = floo_wide_in_rd_q.wide_r.hdr.rob_req; - assign wide_r_rob_rob_idx = floo_wide_in_rd_q.wide_r.hdr.rob_idx; - assign wide_r_rob_last = floo_wide_in_rd_q.wide_r.payload.last; + assign wide_r_rob_rob_req = (!EnDecoupledRW) ? floo_wide_in_q.wide_r.hdr.rob_req : + floo_wide_in_rd_q.wide_r.hdr.rob_req; + assign wide_r_rob_rob_idx = (!EnDecoupledRW) ? floo_wide_in_q.wide_r.hdr.rob_idx : + floo_wide_in_rd_q.wide_r.hdr.rob_idx; + assign wide_r_rob_last = (!EnDecoupledRW) ? floo_wide_in_q.wide_r.payload.last : + floo_wide_in_rd_q.wide_r.payload.last; floo_rob_wrapper #( .RoBType ( ChimneyCfgW.RRoBType ), @@ -1229,16 +1232,16 @@ module floo_nw_chimney #( .valid_o ( floo_wide_req_arb_valid_out ) ); - // Mux the valid of the read and write channels to the ACK/NACK protocol - // of the virtual channel for decoupled read and write output requests. + // Mux the ready of the read and write channels to the ACK/NACK protocol + // Demux the valid signals based on the channel type // AW/W -> Virtual Channel 0 // R -> Virtual Channel 1 // TODO(lleone): check if this really solve DEADLOCK!!!! 
if (EnDecoupledRW) begin: gen_vc_rw_ack - assign floo_wide_o.valid[0] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? floo_wide_req_arb_valid_out : 1'b0; - assign floo_wide_o.valid[1] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_o.valid[WRITE] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_o.valid[READ] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? floo_wide_req_arb_valid_out : 1'b0; assign floo_wide_req_arb_gnt_in = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? - floo_wide_i.ready[0] : floo_wide_i.ready[1]; + floo_wide_i.ready[WRITE] : floo_wide_i.ready[READ]; end else begin: gen_no_vc_rw_ack assign floo_wide_o.valid = floo_wide_req_arb_valid_out; assign floo_wide_req_arb_gnt_in = floo_wide_i.ready; @@ -1278,10 +1281,13 @@ module floo_nw_chimney #( assign axi_narrow_unpack_ar = floo_req_in.narrow_ar.payload; assign axi_narrow_unpack_r = floo_rsp_in.narrow_r.payload; assign axi_narrow_unpack_b = floo_rsp_in.narrow_b.payload; - assign axi_wide_unpack_aw = floo_wide_in_wr_q.wide_aw.payload; - assign axi_wide_unpack_w = floo_wide_in_wr_q.wide_w.payload; + assign axi_wide_unpack_aw = (!EnDecoupledRW) ? floo_wide_in_q.wide_aw.payload : + floo_wide_in_wr_q.wide_aw.payload; + assign axi_wide_unpack_w = (!EnDecoupledRW) ? floo_wide_in_q.wide_w.payload : + floo_wide_in_wr_q.wide_w.payload; assign axi_wide_unpack_ar = floo_req_in.wide_ar.payload; - assign axi_wide_unpack_r = floo_wide_in_rd_q.wide_r.payload; + assign axi_wide_unpack_r = (!EnDecoupledRW) ? 
floo_wide_in_q.wide_r.payload : + floo_wide_in_rd_q.wide_r.payload; assign axi_wide_unpack_b = floo_rsp_in.wide_b.payload; assign floo_req_unpack_generic = floo_req_in.generic; assign floo_rsp_unpack_generic = floo_rsp_in.generic; diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index dc40b6c9..b1789e4a 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -161,7 +161,8 @@ module floo_nw_router #( .credit_i ( '0 ), .valid_o ( req_valid_out ), .ready_i ( req_ready_in ), - .data_o ( req_out ) + .data_o ( req_out ), + .credit_o ( /*unused */ ) ); // We construct the masks for the narrow and wide B responses here. @@ -213,7 +214,8 @@ module floo_nw_router #( .credit_i ( '0 ), .valid_o ( rsp_valid_out ), .ready_i ( rsp_ready_in ), - .data_o ( rsp_out ) + .data_o ( rsp_out ), + .credit_o ( /*unused */ ) ); diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh index c7c64513..5bcaa747 100644 --- a/hw/include/floo_noc/typedef.svh +++ b/hw/include/floo_noc/typedef.svh @@ -281,11 +281,12 @@ // `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) // `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, req, rsp, my_axi_in, AxiCfg, hdr_t) // FLOO_TYPEDEF_LINK_T(req, my_axi) -`define FLOO_TYPEDEF_LINK_T(name, chan_name) \ - typedef struct packed { \ - logic valid; \ - logic ready; \ - floo_``chan_name``_chan_t ``chan_name``; \ +`define FLOO_TYPEDEF_LINK_T(name, chan_name, vc_num = 1, phy_num = 1) \ + typedef struct packed { \ + logic [vc_num-1:0] valid; \ + logic [vc_num-1:0] ready; \ + logic [vc_num-1:0] credit; \ + floo_``chan_name``_chan_t [phy_num-1:0] ``chan_name``; \ } floo_``name``_t; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -378,10 +379,10 @@ // `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW) // `FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) // `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, my_req, my_rsp, my_wide) -`define 
FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan) \ - `FLOO_TYPEDEF_LINK_T(req, req_chan) \ - `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan) \ - `FLOO_TYPEDEF_LINK_T(wide, wide_chan) +`define FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, req_vc_num = 1, rsp_vc_num = 1, wide_vc_num = 1, wide_phy_num = 1) \ + `FLOO_TYPEDEF_LINK_T(req, req_chan, req_vc_num, 1) \ + `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan, rsp_vc_num, 1) \ + `FLOO_TYPEDEF_LINK_T(wide, wide_chan, wide_vc_num, wide_phy_num) //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with ready-valid handshaking interface From 69678842b7df6146cb7fffb4cd628c49854a29a9 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 15:41:48 +0100 Subject: [PATCH 05/17] revert typedef to keep separate VC and not --- hw/floo_nw_router.sv | 23 +++++++++++++++++++---- hw/include/floo_noc/typedef.svh | 19 +++++++++---------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index b1789e4a..929f93cf 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -105,7 +105,6 @@ module floo_nw_router #( for (genvar i = 0; i < NumInputs; i++) begin : gen_chimney_req assign req_valid_in[i] = floo_req_i[i].valid; assign floo_req_o[i].ready = req_ready_out[i]; - assign floo_req_o[i].credit = '0; // Narrow links never rely on credit based flow assign req_in[i] = floo_req_i[i].req; assign floo_rsp_o[i].valid = rsp_valid_out[i]; assign rsp_ready_in[i] = floo_rsp_i[i].ready; @@ -119,7 +118,6 @@ module floo_nw_router #( assign rsp_valid_in[i] = floo_rsp_i[i].valid; assign floo_rsp_o[i].ready = rsp_ready_out[i]; assign rsp_in[i] = floo_rsp_i[i].rsp; - assign floo_rsp_o[i].credit = '0; // Narrow links never rely on credit based flow end for (genvar i = 0; i < NumRoutes; i++) begin : gen_chimney_wide @@ -129,8 +127,25 @@ module floo_nw_router #( assign 
floo_wide_o[i].valid = wide_valid_out[i];
     assign wide_ready_in[i] = floo_wide_i[i].ready;
     assign floo_wide_o[i].wide = wide_out[i];
-    assign floo_wide_o[i].credit = wide_credit_out[i];
-    assign wide_credit_in[i] = floo_wide_i[i].credit;
+  end
+
+  // Generation of credit based connections only when necessary
+  if (VcImplementation == floo_pkg::VcCreditBased) begin: gen_credit_connections
+    // Narrow req links never rely on credit based flow
+    for (genvar i = 0; i < NumInputs; i++) begin: gen_credit_tied_zero_req
+      assign floo_req_o[i].credit = '0;
+    end
+    // Narrow rsp links never rely on credit based flow
+    for (genvar i = 0; i < NumOutputs; i++) begin: gen_credit_tied_zero_rsp
+      assign floo_rsp_o[i].credit = '0;
+    end
+    // Wide links credit connections
+    for (genvar i = 0; i < NumRoutes; i++) begin: gen_credit_wide
+      assign floo_wide_o[i].credit = wide_credit_out[i];
+      assign wide_credit_in[i] = floo_wide_i[i].credit;
+    end
+  end else begin: gen_no_credit_connections
+    assign wide_credit_in = '0;
   end
 
   floo_router #(
diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh
index 5bcaa747..c7c64513 100644
--- a/hw/include/floo_noc/typedef.svh
+++ b/hw/include/floo_noc/typedef.svh
@@ -281,12 +281,11 @@
 // `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg)
 // `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, req, rsp, my_axi_in, AxiCfg, hdr_t)
 // FLOO_TYPEDEF_LINK_T(req, my_axi)
-`define FLOO_TYPEDEF_LINK_T(name, chan_name, vc_num = 1, phy_num = 1) \
- typedef struct packed { \
- logic [vc_num-1:0] valid; \
- logic [vc_num-1:0] ready; \
- logic [vc_num-1:0] credit; \
- floo_``chan_name``_chan_t [phy_num-1:0] ``chan_name``; \
+`define FLOO_TYPEDEF_LINK_T(name, chan_name) \
+ typedef struct packed { \
+ logic valid; \
+ logic ready; \
+ floo_``chan_name``_chan_t ``chan_name``; \
 } floo_``name``_t;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -379,10 +378,10 @@
 // `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW)
 // 
`FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) // `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, my_req, my_rsp, my_wide) -`define FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, req_vc_num = 1, rsp_vc_num = 1, wide_vc_num = 1, wide_phy_num = 1) \ - `FLOO_TYPEDEF_LINK_T(req, req_chan, req_vc_num, 1) \ - `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan, rsp_vc_num, 1) \ - `FLOO_TYPEDEF_LINK_T(wide, wide_chan, wide_vc_num, wide_phy_num) +`define FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan) \ + `FLOO_TYPEDEF_LINK_T(req, req_chan) \ + `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan) \ + `FLOO_TYPEDEF_LINK_T(wide, wide_chan) //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with ready-valid handshaking interface From 58a096555d0d924cd28609f6d5062c9d410521cc Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 16:04:49 +0100 Subject: [PATCH 06/17] hw: Deprecate complex VC --- .gitlab-ci.yml | 2 - Bender.yml | 40 ++++++++++--------- hw/{ => deprecated}/floo_nw_vc_chimney.sv | 0 hw/{ => deprecated}/floo_nw_vc_router.sv | 0 hw/{ => deprecated}/floo_vc_router.sv | 0 .../vc_router_util/floo_credit_counter.sv | 0 .../vc_router_util/floo_input_fifo.sv | 0 .../vc_router_util/floo_input_port.sv | 0 .../vc_router_util/floo_look_ahead_routing.sv | 0 .../vc_router_util/floo_mux.sv | 0 .../vc_router_util/floo_rr_arbiter.sv | 0 .../vc_router_util/floo_sa_global.sv | 0 .../vc_router_util/floo_sa_local.sv | 0 .../vc_router_util/floo_vc_assignment.sv | 0 .../vc_router_util/floo_vc_router_switch.sv | 0 .../vc_router_util/floo_vc_selection.sv | 0 16 files changed, 22 insertions(+), 20 deletions(-) rename hw/{ => deprecated}/floo_nw_vc_chimney.sv (100%) rename hw/{ => deprecated}/floo_nw_vc_router.sv (100%) rename hw/{ => deprecated}/floo_vc_router.sv (100%) rename hw/{ => 
deprecated}/vc_router_util/floo_credit_counter.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_input_fifo.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_input_port.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_look_ahead_routing.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_mux.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_rr_arbiter.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_sa_global.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_sa_local.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_vc_assignment.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_vc_router_switch.sv (100%) rename hw/{ => deprecated}/vc_router_util/floo_vc_selection.sv (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d8a9c577..7fe57dd4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,7 +47,6 @@ compile-vcs: matrix: - TB_DUT: - tb_floo_router - - tb_floo_vc_router - tb_floo_axi_chimney - tb_floo_nw_chimney - tb_floo_rob @@ -85,7 +84,6 @@ run-sim: - SIMULATOR: [vsim, vcs] TB_DUT: - tb_floo_router - - tb_floo_vc_router - tb_floo_axi_chimney - tb_floo_nw_chimney - tb_floo_rob diff --git a/Bender.yml b/Bender.yml index 4c73307d..4398ff93 100644 --- a/Bender.yml +++ b/Bender.yml @@ -52,23 +52,6 @@ sources: - hw/floo_axi_router.sv - hw/floo_nw_router.sv - # VC Router (Level 1) - - hw/vc_router_util/floo_credit_counter.sv - - hw/vc_router_util/floo_input_fifo.sv - - hw/vc_router_util/floo_input_port.sv - - hw/vc_router_util/floo_look_ahead_routing.sv - - hw/vc_router_util/floo_mux.sv - - hw/vc_router_util/floo_rr_arbiter.sv - - hw/vc_router_util/floo_sa_global.sv - - hw/vc_router_util/floo_sa_local.sv - - hw/vc_router_util/floo_vc_assignment.sv - - hw/vc_router_util/floo_vc_router_switch.sv - - hw/vc_router_util/floo_vc_selection.sv - # Level 2 - - hw/floo_vc_router.sv - - hw/floo_nw_vc_chimney.sv - - hw/floo_nw_vc_router.sv - - target: floo_test include_dirs: - hw/test/include @@ -90,7 +73,6 @@ 
sources: - hw/tb/tb_floo_nw_chimney.sv - hw/tb/tb_floo_router.sv - hw/tb/tb_floo_rob.sv - - hw/tb/tb_floo_vc_router.sv - hw/tb/tb_floo_rob_multicast.sv - target: all(any(floo_test, floo_synth), axi_mesh) @@ -115,6 +97,7 @@ sources: files: - hw/tb/tb_floo_nw_mesh.sv + - target: all(floo_synth, synthesis) files: # Level 0 @@ -125,3 +108,24 @@ sources: - hw/synth/floo_synth_axi_router.sv - hw/synth/floo_synth_nw_router.sv - hw/synth/floo_synth_nw_2tiles.sv + + - target: floo_deprecated_hw + files: + # Level 1 + - hw/deprecated/vc_router_util/floo_credit_counter.sv + - hw/deprecated/vc_router_util/floo_input_fifo.sv + - hw/deprecated/vc_router_util/floo_input_port.sv + - hw/deprecated/vc_router_util/floo_look_ahead_routing.sv + - hw/deprecated/vc_router_util/floo_mux.sv + - hw/deprecated/vc_router_util/floo_rr_arbiter.sv + - hw/deprecated/vc_router_util/floo_sa_global.sv + - hw/deprecated/vc_router_util/floo_sa_local.sv + - hw/deprecated/vc_router_util/floo_vc_assignment.sv + - hw/deprecated/vc_router_util/floo_vc_router_switch.sv + - hw/deprecated/vc_router_util/floo_vc_selection.sv + + # Level 1 + - hw/floo_vc_router.sv + - hw/floo_nw_vc_chimney.sv + - hw/floo_nw_vc_router.sv + - hw/tb/tb_floo_vc_router.sv diff --git a/hw/floo_nw_vc_chimney.sv b/hw/deprecated/floo_nw_vc_chimney.sv similarity index 100% rename from hw/floo_nw_vc_chimney.sv rename to hw/deprecated/floo_nw_vc_chimney.sv diff --git a/hw/floo_nw_vc_router.sv b/hw/deprecated/floo_nw_vc_router.sv similarity index 100% rename from hw/floo_nw_vc_router.sv rename to hw/deprecated/floo_nw_vc_router.sv diff --git a/hw/floo_vc_router.sv b/hw/deprecated/floo_vc_router.sv similarity index 100% rename from hw/floo_vc_router.sv rename to hw/deprecated/floo_vc_router.sv diff --git a/hw/vc_router_util/floo_credit_counter.sv b/hw/deprecated/vc_router_util/floo_credit_counter.sv similarity index 100% rename from hw/vc_router_util/floo_credit_counter.sv rename to hw/deprecated/vc_router_util/floo_credit_counter.sv diff 
--git a/hw/vc_router_util/floo_input_fifo.sv b/hw/deprecated/vc_router_util/floo_input_fifo.sv similarity index 100% rename from hw/vc_router_util/floo_input_fifo.sv rename to hw/deprecated/vc_router_util/floo_input_fifo.sv diff --git a/hw/vc_router_util/floo_input_port.sv b/hw/deprecated/vc_router_util/floo_input_port.sv similarity index 100% rename from hw/vc_router_util/floo_input_port.sv rename to hw/deprecated/vc_router_util/floo_input_port.sv diff --git a/hw/vc_router_util/floo_look_ahead_routing.sv b/hw/deprecated/vc_router_util/floo_look_ahead_routing.sv similarity index 100% rename from hw/vc_router_util/floo_look_ahead_routing.sv rename to hw/deprecated/vc_router_util/floo_look_ahead_routing.sv diff --git a/hw/vc_router_util/floo_mux.sv b/hw/deprecated/vc_router_util/floo_mux.sv similarity index 100% rename from hw/vc_router_util/floo_mux.sv rename to hw/deprecated/vc_router_util/floo_mux.sv diff --git a/hw/vc_router_util/floo_rr_arbiter.sv b/hw/deprecated/vc_router_util/floo_rr_arbiter.sv similarity index 100% rename from hw/vc_router_util/floo_rr_arbiter.sv rename to hw/deprecated/vc_router_util/floo_rr_arbiter.sv diff --git a/hw/vc_router_util/floo_sa_global.sv b/hw/deprecated/vc_router_util/floo_sa_global.sv similarity index 100% rename from hw/vc_router_util/floo_sa_global.sv rename to hw/deprecated/vc_router_util/floo_sa_global.sv diff --git a/hw/vc_router_util/floo_sa_local.sv b/hw/deprecated/vc_router_util/floo_sa_local.sv similarity index 100% rename from hw/vc_router_util/floo_sa_local.sv rename to hw/deprecated/vc_router_util/floo_sa_local.sv diff --git a/hw/vc_router_util/floo_vc_assignment.sv b/hw/deprecated/vc_router_util/floo_vc_assignment.sv similarity index 100% rename from hw/vc_router_util/floo_vc_assignment.sv rename to hw/deprecated/vc_router_util/floo_vc_assignment.sv diff --git a/hw/vc_router_util/floo_vc_router_switch.sv b/hw/deprecated/vc_router_util/floo_vc_router_switch.sv similarity index 100% rename from 
hw/vc_router_util/floo_vc_router_switch.sv rename to hw/deprecated/vc_router_util/floo_vc_router_switch.sv diff --git a/hw/vc_router_util/floo_vc_selection.sv b/hw/deprecated/vc_router_util/floo_vc_selection.sv similarity index 100% rename from hw/vc_router_util/floo_vc_selection.sv rename to hw/deprecated/vc_router_util/floo_vc_selection.sv From 609aaafa35404ea1657d77c9c8fd14c49fc3666f Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 16:58:20 +0100 Subject: [PATCH 07/17] synth: Delete 2tile wrapper from Bender deps --- Bender.yml | 1 - hw/synth/floo_synth_params_pkg.sv | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Bender.yml b/Bender.yml index 4398ff93..e1429cf3 100644 --- a/Bender.yml +++ b/Bender.yml @@ -107,7 +107,6 @@ sources: - hw/synth/floo_synth_nw_chimney.sv - hw/synth/floo_synth_axi_router.sv - hw/synth/floo_synth_nw_router.sv - - hw/synth/floo_synth_nw_2tiles.sv - target: floo_deprecated_hw files: diff --git a/hw/synth/floo_synth_params_pkg.sv b/hw/synth/floo_synth_params_pkg.sv index db5ffd31..d0197055 100644 --- a/hw/synth/floo_synth_params_pkg.sv +++ b/hw/synth/floo_synth_params_pkg.sv @@ -17,7 +17,7 @@ package floo_synth_params_pkg; // Default route config for testing localparam floo_pkg::route_cfg_t RouteCfg = '{ RouteAlgo: floo_pkg::XYRouting, - UseIdTable: 1, + UseIdTable: 0, XYAddrOffsetX: 16, XYAddrOffsetY: 20, default: '0 // Potentially enable Multicast features From 9819e2685a25d588723e4c8fe65cffd586979f2c Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 3 Dec 2025 17:49:02 +0100 Subject: [PATCH 08/17] lint: Fix linting rules --- hw/floo_nw_chimney.sv | 31 ++++++++++++++++++------------- hw/floo_vc_arbiter.sv | 4 +++- hw/synth/floo_synth_2tiles.sv | 14 +++++++------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 621ed1e6..189c80f4 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -302,7 +302,7 @@ module 
floo_nw_chimney #( end else if (NumWidePhysChannels == 2) begin : gen_dual_phys_ch assign floo_wide_in_wr = floo_wide_i.wide[WRITE]; assign floo_wide_in_rd = floo_wide_i.wide[READ]; - end else begin + end else begin: gen_illegal_cfg $fatal(1, "NW CHIMNEY: Unsupported number of wide physical channels"); end end else begin : gen_no_vc_demux @@ -1238,8 +1238,10 @@ module floo_nw_chimney #( // R -> Virtual Channel 1 // TODO(lleone): check if this really solve DEADLOCK!!!! if (EnDecoupledRW) begin: gen_vc_rw_ack - assign floo_wide_o.valid[WRITE] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? floo_wide_req_arb_valid_out : 1'b0; - assign floo_wide_o.valid[READ] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_o.valid[WRITE] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? + floo_wide_req_arb_valid_out : 1'b0; + assign floo_wide_o.valid[READ] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? + floo_wide_req_arb_valid_out : 1'b0; assign floo_wide_req_arb_gnt_in = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? 
floo_wide_i.ready[WRITE] : floo_wide_i.ready[READ]; end else begin: gen_no_vc_rw_ack @@ -1256,8 +1258,7 @@ module floo_nw_chimney #( assign floo_wide_o.wide[1] = floo_wide_arb_in[WideR]; assign floo_wide_o.valid[1] = floo_wide_arb_req_in[WideR]; assign floo_wide_arb_gnt_out[WideR] = floo_wide_i.ready[1]; - - end else begin + end else begin: gen_illegal_cfg $fatal(1, "NW CHIMNEY: Unsupported number of wide physical channels"); end @@ -1606,13 +1607,17 @@ module floo_nw_chimney #( // When virtual channels for decoupled read and write is enabled, // req_i and req_o must have same amount of VCs, equal to NumVirtualChannels - `ASSERT_INIT(VCMismatchInputReady, !EnDecoupledRW | ($bits(floo_wide_i.ready) == NumVirtualChannels), - $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); - `ASSERT_INIT(VCMismatchOutputReady, !EnDecoupledRW | ($bits(floo_wide_o.ready) == NumVirtualChannels), - $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); - `ASSERT_INIT(VCMismatchInputValid, !EnDecoupledRW | ($bits(floo_wide_i.valid) == NumVirtualChannels), - $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); - `ASSERT_INIT(VCMismatchOutputValid, !EnDecoupledRW | ($bits(floo_wide_o.valid) == NumVirtualChannels), - $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchInputReady, + !EnDecoupledRW | ($bits(floo_wide_i.ready) == NumVirtualChannels), + $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchOutputReady, + !EnDecoupledRW | ($bits(floo_wide_o.ready) == NumVirtualChannels), + $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); + `ASSERT_INIT(VCMismatchInputValid, + !EnDecoupledRW | ($bits(floo_wide_i.valid) == NumVirtualChannels), + $sformatf("Input request must have %0d VCs when EnDecoupledRW==1", 
NumVirtualChannels)); + `ASSERT_INIT(VCMismatchOutputValid, + !EnDecoupledRW | ($bits(floo_wide_o.valid) == NumVirtualChannels), + $sformatf("Output request must have %0d VCs when EnDecoupledRW==1", NumVirtualChannels)); endmodule diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index 644bbc2e..4b79c47a 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -57,7 +57,9 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys // other VC has a grant, switch VC in the next cycle if (vc_arb_req_out) begin if (!vc_arb_gnt_in) begin - if (ready_i[vc_arb_idx ? 0 : 1] && valid_i[vc_arb_idx ? 0 : 1]) mask_d = ~(1'b1 << vc_arb_idx); + if (ready_i[vc_arb_idx ? 0 : 1] && valid_i[vc_arb_idx ? 0 : 1]) begin: gen_valid_mask + mask_d = ~(1'b1 << vc_arb_idx); + end end else begin mask_d = '1; end diff --git a/hw/synth/floo_synth_2tiles.sv b/hw/synth/floo_synth_2tiles.sv index 13c5a49b..556dd396 100644 --- a/hw/synth/floo_synth_2tiles.sv +++ b/hw/synth/floo_synth_2tiles.sv @@ -8,7 +8,7 @@ // west <-> east ports together. For this reason, at the interface there will be twice the number of // ports compared to a single tile. 
// -module floo_synth_nw_2tiles +module floo_synth_2tiles import floo_pkg::*; import floo_synth_params_pkg::*; import floo_synth_nw_pkg::*; @@ -61,8 +61,8 @@ floo_wide_t [NumPorts-1:0] floo_wide_0_out; localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::vc_impl_e'(VcImpl); // Tile 1 -for (genvar p = 0; p < NumPorts; p++) begin - if (p != West) begin +for (genvar p = 0; p < NumPorts; p++) begin: gen_tile_1_connections + if (p != West) begin: gen_skip_west_tile_1 assign floo_req_1_in[p] = floo_req_1_i[tile1_idx_map(p)]; assign floo_rsp_1_in[p] = floo_rsp_1_i[tile1_idx_map(p)]; assign floo_rsp_1_o[tile1_idx_map(p)] = floo_rsp_1_out[p]; @@ -74,8 +74,8 @@ for (genvar p = 0; p < NumPorts; p++) begin end // Tile 0 -for (genvar p = 0; p < NumPorts; p++) begin - if (p != East) begin +for (genvar p = 0; p < NumPorts; p++) begin: gen_tile_0_connections + if (p != East) begin: gen_skip_east_tile_0 assign floo_req_0_in[p] = floo_req_0_i[tile0_idx_map(p)]; assign floo_rsp_0_in[p] = floo_rsp_0_i[tile0_idx_map(p)]; assign floo_rsp_0_o[tile0_idx_map(p)] = floo_rsp_0_out[p]; @@ -191,7 +191,7 @@ floo_nw_router #( ); function automatic int tile0_idx_map(route_direction_e dir); - case (dir) + unique case (dir) North: return 0; // East: return 1; South: return 1; @@ -201,7 +201,7 @@ function automatic int tile0_idx_map(route_direction_e dir); endfunction function automatic int tile1_idx_map(route_direction_e dir); - case (dir) + unique case (dir) North: return 0; East: return 1; South: return 2; From f0c5bc83815b529c91c5a1057a29883d9359b689 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 12 Dec 2025 17:03:04 +0100 Subject: [PATCH 09/17] initial PR clean --- hw/floo_nw_chimney.sv | 11 ++++++++--- hw/floo_nw_router.sv | 6 +++--- hw/floo_router.sv | 6 +++--- hw/floo_vc_arbiter.sv | 10 +++++----- hw/synth/floo_synth_params_pkg.sv | 4 ++-- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 189c80f4..b232e6c7 
100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -33,11 +33,16 @@ module floo_nw_chimney #( /// and one ID is reserved for non-atomic transactions parameter int unsigned MaxAtomicTxns = 1, /// Enable support for decoupling read and write channels + /// When enabled, the wide read and write transactions + /// use separate channels (virtual or physical) parameter bit EnDecoupledRW = 1'b0, - /// Specify how many physical channel are used for teh wide connection + /// Specify how many physical channels are used for the wide connection + /// This parameter is used together with `EnDecoupledRW`. When read + /// and write channels are decoupled, the two streams can either share + /// a single physical channel or use two separate physical channels. parameter int unsigned NumWidePhysChannels = 1, /// Specify which VC implementation to use for the wide channels - parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, + parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Node ID type for routing parameter type id_t = logic, /// RoB index type for reordering. 
parameter int unsigned NumWideVirtChannels = 32'd1, parameter int unsigned NumWidePhysChannels = 32'd1, - parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, + parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Enable multicast feature parameter bit EnMultiCast = 1'b0, /// Node ID type @@ -130,7 +130,7 @@ module floo_nw_router #( end // Generation of credit based conenctions only when necessary - if (VcImplementation == floo_pkg::VcCreditBased) begin: gen_credit_connections + if (VcImpl == floo_pkg::VcCreditBased) begin: gen_credit_connections // Narrow lreq inks never rely on credit based flow for (genvar i = 0; i < NumInputs; i++) begin: gen_credit_tied_zero_req assign floo_req_o[i].credit = '0; @@ -244,7 +244,7 @@ module floo_nw_router #( .XYRouteOpt ( XYRouteOpt ), .NumAddrRules ( NumAddrRules ), .NoLoopback ( 1'b1 ), - .VcImplementation ( VcImplementation ), + .VcImpl ( VcImpl ), .EnMultiCast ( EnMultiCast ), .EnReduction ( 1'b0 ), .id_t ( id_t ), diff --git a/hw/floo_router.sv b/hw/floo_router.sv index 70af90ed..09bfaf0f 100644 --- a/hw/floo_router.sv +++ b/hw/floo_router.sv @@ -39,7 +39,7 @@ module floo_router /// Disables loopback connections parameter bit NoLoopback = 1'b1, /// Select VC implementation - parameter floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcNaive, + parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Enable Multicast feature parameter bit EnMultiCast = 1'b0, /// Enable reduction feature @@ -136,7 +136,7 @@ module floo_router ); // Credit count generation. 
Assign 1 upon any handshake - if (VcImplementation == floo_pkg::VcCreditBased) begin: gen_credit_support + if (VcImpl == floo_pkg::VcCreditBased) begin: gen_credit_support assign credit_o[in][v] = credit_gnt_q[in][v]; assign credit_gnt_d[in][v] = in_valid[in][v] & in_ready[in][v]; `FF(credit_gnt_q[in][v], credit_gnt_d[in][v], 1'b0); @@ -284,7 +284,7 @@ module floo_router .NumVirtChannels ( NumVirtChannels ), .flit_t ( flit_t ), .NumPhysChannels ( NumPhysChannels ), - .VcImplementation ( VcImplementation ) + .VcImpl ( VcImpl ) ) i_vc_arbiter ( .clk_i, .rst_ni, diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index 4b79c47a..a9f52841 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -13,7 +13,7 @@ module floo_vc_arbiter import floo_pkg::*; parameter int unsigned NumVirtChannels = 1, parameter type flit_t = logic, parameter int unsigned NumPhysChannels = 1, - parameter vc_impl_e VcImplementation = VcNaive, + parameter vc_impl_e VcImpl = VcNaive, parameter int unsigned NumCredits = 3 ) ( input logic clk_i, @@ -72,13 +72,13 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys // VC arbitration logic // ////////////////////////// - if (VcImplementation == VcPreemptValid) begin : gen_preempt_valid + if (VcImpl == VcPreemptValid) begin : gen_preempt_valid // Initially, any valid channel can request access to the physical channel. // However, to guarantee deadlock freedom, we must be able to preempt the // virtual channel holding the physical link and put the other channel on // the link. To do so, we mask the VC holding the link, when required. 
assign vc_arb_req_in = valid_i & mask_q; - end else if (VcImplementation == VcCreditBased) begin : gen_credit_based + end else if (VcImpl == VcCreditBased) begin : gen_credit_based // In case of credit based approach, the valid is set only if there are credits left assign vc_arb_req_in = valid_i & credit_left; end else begin : gen_standard @@ -120,7 +120,7 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys .idx_o ( vc_arb_idx ) ); - if (VcImplementation == VcCreditBased) begin: gen_credit + if (VcImpl == VcCreditBased) begin: gen_credit for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_vc_credits credit_counter #( .NumCredits(NumCredits) @@ -153,6 +153,6 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys `ASSERT(OneHotOutputValid, $onehot0(valid_o)) // Currently only supports two virtual channels - `ASSERT_INIT(SupportedNumVirtChannels, !VcImplementation || (NumVirtChannels <= 2)) + `ASSERT_INIT(SupportedNumVirtChannels, !VcImpl || (NumVirtChannels <= 2)) endmodule diff --git a/hw/synth/floo_synth_params_pkg.sv b/hw/synth/floo_synth_params_pkg.sv index d0197055..59bb6c82 100644 --- a/hw/synth/floo_synth_params_pkg.sv +++ b/hw/synth/floo_synth_params_pkg.sv @@ -97,9 +97,9 @@ package floo_synth_nw_pkg; `FLOO_TYPEDEF_AXI_FROM_CFG(axi_wide, AxiCfgW) `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) - // `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) + `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) // Enable the following VC LINK when you want to experiment the use of virtual channels in collective - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2, 1) + // `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2, 1) endpackage From 746a0cfed9c609d0c336142c91e50e266c35dcb9 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 12 Dec 2025 18:29:04 +0100 Subject: [PATCH 10/17] apply changes from review --- hw/floo_nw_chimney.sv | 
30 ++++++++++++------------------ hw/floo_nw_router.sv | 2 +- hw/floo_pkg.sv | 17 ++++++++++++++++- hw/floo_router.sv | 2 +- hw/floo_vc_arbiter.sv | 6 +++--- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index b232e6c7..8d093934 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -156,12 +156,6 @@ module floo_nw_chimney #( // For future extension, add an extra opcode in the user_struct_t typedef axi_addr_t user_mask_t ; - // Virtual channel enumeration - typedef enum logic { - READ = 1'b1, - WRITE = 1'b0 - } vc_e; - localparam int unsigned NumVirtualChannels = EnDecoupledRW ? 2 : 1; // Duplicate AXI port signals to degenerate ports @@ -286,27 +280,27 @@ module floo_nw_chimney #( logic floo_wide_out_ready; if (EnDecoupledRW) begin : gen_vc_demux - assign floo_wide_in_wr_valid = floo_wide_i.valid[WRITE]; - assign floo_wide_in_rd_valid = floo_wide_i.valid[READ]; - assign floo_wide_o.ready[WRITE] = floo_wide_out_wr_ready; - assign floo_wide_o.ready[READ] = floo_wide_out_rd_ready; + assign floo_wide_in_wr_valid = floo_wide_i.valid[Write]; + assign floo_wide_in_rd_valid = floo_wide_i.valid[Read]; + assign floo_wide_o.ready[Write] = floo_wide_out_wr_ready; + assign floo_wide_o.ready[Read] = floo_wide_out_rd_ready; if (NumWidePhysChannels == 1) begin : gen_single_phys_ch // Connect the single physical channel to both read and write // the valid and ready coming from teh VCs will be used to know if the data can be used assign floo_wide_in_wr = floo_wide_i.wide; assign floo_wide_in_rd = floo_wide_i.wide; - if (VcImpl == floo_pkg::VcCreditBased) begin : gen_credit_support + if (VcImpl == floo_pkg::VcCredit) begin : gen_credit_support // Drive credit signals for incoming requests - `FF(floo_wide_o.credit[WRITE], floo_wide_in_wr_valid & floo_wide_out_wr_ready, 1'b0); - `FF(floo_wide_o.credit[READ], floo_wide_in_rd_valid & floo_wide_out_rd_ready, 1'b0); + `FF(floo_wide_o.credit[Write], 
floo_wide_in_wr_valid & floo_wide_out_wr_ready, 1'b0); + `FF(floo_wide_o.credit[Read], floo_wide_in_rd_valid & floo_wide_out_rd_ready, 1'b0); end else begin: gen_no_credit_support assign floo_wide_o.credit = '0; end end else if (NumWidePhysChannels == 2) begin : gen_dual_phys_ch - assign floo_wide_in_wr = floo_wide_i.wide[WRITE]; - assign floo_wide_in_rd = floo_wide_i.wide[READ]; + assign floo_wide_in_wr = floo_wide_i.wide[Write]; + assign floo_wide_in_rd = floo_wide_i.wide[Read]; end else begin: gen_illegal_cfg $fatal(1, "NW CHIMNEY: Unsupported number of wide physical channels"); end @@ -1243,12 +1237,12 @@ module floo_nw_chimney #( // R -> Virtual Channel 1 // TODO(lleone): check if this really solve DEADLOCK!!!! if (EnDecoupledRW) begin: gen_vc_rw_ack - assign floo_wide_o.valid[WRITE] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? + assign floo_wide_o.valid[Write] = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? floo_wide_req_arb_valid_out : 1'b0; - assign floo_wide_o.valid[READ] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? + assign floo_wide_o.valid[Read] = (floo_wide_o.wide[0].generic.hdr.axi_ch == WideR) ? floo_wide_req_arb_valid_out : 1'b0; assign floo_wide_req_arb_gnt_in = (floo_wide_o.wide[0].generic.hdr.axi_ch != WideR) ? 
- floo_wide_i.ready[WRITE] : floo_wide_i.ready[READ]; + floo_wide_i.ready[Write] : floo_wide_i.ready[Read]; end else begin: gen_no_vc_rw_ack assign floo_wide_o.valid = floo_wide_req_arb_valid_out; assign floo_wide_req_arb_gnt_in = floo_wide_i.ready; diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index a55920b7..9bc10aaa 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -130,7 +130,7 @@ module floo_nw_router #( end // Generation of credit based conenctions only when necessary - if (VcImpl == floo_pkg::VcCreditBased) begin: gen_credit_connections + if (VcImpl == floo_pkg::VcCredit) begin: gen_credit_connections // Narrow lreq inks never rely on credit based flow for (genvar i = 0; i < NumInputs; i++) begin: gen_credit_tied_zero_req assign floo_req_o[i].credit = '0; diff --git a/hw/floo_pkg.sv b/hw/floo_pkg.sv index 625c3cc4..2844f666 100644 --- a/hw/floo_pkg.sv +++ b/hw/floo_pkg.sv @@ -85,11 +85,26 @@ package floo_pkg; /// Virtual channel implementation types typedef enum logic[1:0] { + /// The naive implementation places the valid and the data on the physical link + /// if the downstream subordinate is ready. However, this will create an in2out + /// path that spans over the entire cluster tile, possibly limiting clock frequency. VcNaive = 2'd0, - VcCreditBased = 2'd1, + /// The credit based approach allows to cut the in2out path. + /// However, to support maximum transmission bandwidth, the subordinate input FIFO + /// must be able to store at least 3 flits, increasing significantly the router area. VcCredit = 2'd1, + /// The preemptive valid approach allows to cut the in2out path. + /// This does not require the subordinate input FIFO to be larger than 2 flits, + /// while still supporting maximum transmission bandwidth. 
VcPreemptValid = 2'd2 } vc_impl_e; + // Virtual channel index association for read and write channels + typedef enum logic { + Read = 1'b1, + Write = 1'b0 + } vc_e; + /// The types of collective communication typedef enum logic [1:0] { /// Normal communication diff --git a/hw/floo_router.sv b/hw/floo_router.sv index 09bfaf0f..e4e11dca 100644 --- a/hw/floo_router.sv +++ b/hw/floo_router.sv @@ -136,7 +136,7 @@ module floo_router ); // Credit count generation. Assign 1 upon any handshake - if (VcImpl == floo_pkg::VcCreditBased) begin: gen_credit_support + if (VcImpl == floo_pkg::VcCredit) begin: gen_credit_support assign credit_o[in][v] = credit_gnt_q[in][v]; assign credit_gnt_d[in][v] = in_valid[in][v] & in_ready[in][v]; `FF(credit_gnt_q[in][v], credit_gnt_d[in][v], 1'b0); diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index a9f52841..48e7d0b7 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -78,7 +78,7 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys // virtual channel holding the physical link and put the other channel on // the link. To do so, we mask the VC holding the link, when required. 
assign vc_arb_req_in = valid_i & mask_q; - end else if (VcImpl == VcCreditBased) begin : gen_credit_based + end else if (VcImpl == VcCredit) begin : gen_credit_based // In case of credit based approach, the valid is set only if there are credits left assign vc_arb_req_in = valid_i & credit_left; end else begin : gen_standard @@ -120,7 +120,7 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys .idx_o ( vc_arb_idx ) ); - if (VcImpl == VcCreditBased) begin: gen_credit + if (VcImpl == VcCredit) begin: gen_credit for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_vc_credits credit_counter #( .NumCredits(NumCredits) @@ -153,6 +153,6 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys `ASSERT(OneHotOutputValid, $onehot0(valid_o)) // Currently only supports two virtual channels - `ASSERT_INIT(SupportedNumVirtChannels, !VcImpl || (NumVirtChannels <= 2)) + `ASSERT_INIT(SupportedNumVirtChannels, (VcImpl != floo_pkg::VcNaive) || (NumVirtChannels <= 2)) endmodule From d4e10b53d84a05149161b5945e5f2d7af92fd4b9 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 16 Dec 2025 08:32:25 +0100 Subject: [PATCH 11/17] depr: Move deprectaed typedef --- Bender.yml | 2 + hw/include/floo_noc/typedef.svh | 71 --------------------------------- 2 files changed, 2 insertions(+), 71 deletions(-) diff --git a/Bender.yml b/Bender.yml index e1429cf3..23fcd4c1 100644 --- a/Bender.yml +++ b/Bender.yml @@ -109,6 +109,8 @@ sources: - hw/synth/floo_synth_nw_router.sv - target: floo_deprecated_hw + include_dirs: + - hw/deprecated/include files: # Level 1 - hw/deprecated/vc_router_util/floo_credit_counter.sv diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh index c7c64513..c1cf0b3c 100644 --- a/hw/include/floo_noc/typedef.svh +++ b/hw/include/floo_noc/typedef.svh @@ -315,29 +315,6 @@ floo_``chan_name``_chan_t [phy_num-1:0] ``chan_name``; \ } floo_``name``_t; 
-//////////////////////////////////////////////////////////////////////////////////////////////////// -// Defines the all the link types with credit-based flow control interface -// for use with virtual channels -// -// Arguments: -// - name: Name of the link type -// - chan_name: Name of the channel type to transport -// - vc_id_t: Type of the virtual channel ID -// -// Usage Example: -// localparam floo_pkg::axi_cfg_t AxiCfg = '{...}; -// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) -// `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) -// `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, req, rsp, my_axi_in, AxiCfg, hdr_t) -// FLOO_TYPEDEF_LINK_T(vc_req, my_axi) -`define FLOO_TYPEDEF_VC_LINK_T(name, chan_name, vc_id_t) \ - typedef struct packed { \ - logic valid; \ - logic credit_v; \ - vc_id_t credit_id; \ - floo_``chan_name``_chan_t ``chan_name``; \ - } floo_``name``_t; - //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with ready-valid handshaking interface // for a single AXI interface configuration @@ -411,52 +388,4 @@ `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(rsp, rsp_chan, 1, 1) \ `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(wide, wide_chan, wide_virt_chan, wide_phys_chan) -//////////////////////////////////////////////////////////////////////////////////////////////////// -// Defines the all the link types with credit-based flow control interface -// for a single AXI interface configuration -// -// Arguments: -// - req: Name of the `req` link type -// - rsp: Name of the `rsp` link type -// - req_chan: Name of the `req` channel type to transport -// - rsp_chan: Name of the `rsp` channel type to transport -// - vc_id_t: Type of the virtual channel ID -// -// Usage Example: -// localparam floo_pkg::axi_cfg_t AxiCfg = '{...}; -// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) 
-// `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) -// `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, my_req, my_rsp, my_axi_in, AxiCfg, hdr_t) -// `FLOO_TYPEDEF_VC_AXI_LINK_ALL(vc_req, vc_rsp, my_req, my_rsp) -`define FLOO_TYPEDEF_VC_AXI_LINK_ALL(req, rsp, req_chan, rsp_chan, vc_id_t) \ - `FLOO_TYPEDEF_VC_LINK_T(req, req_chan, vc_id_t) \ - `FLOO_TYPEDEF_VC_LINK_T(rsp, rsp_chan, vc_id_t) \ - - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// Defines the all the link types with credit-based flow control interface -// for a narrow-wide AXI interface configuration -// -// Arguments: -// - req: Name of the `req` link type -// - rsp: Name of the `rsp` link type -// - wide: Name of the `wide` link type -// - req_chan: Name of the `req` channel type to transport -// - rsp_chan: Name of the `rsp` channel type to transport -// - wide_chan: Name of the `wide` channel type to transport -// - vc_id_t: Type of the virtual channel ID -// -// Usage Example: -// localparam floo_pkg::axi_cfg_t AxiCfgN = '{...}; -// localparam floo_pkg::axi_cfg_t AxiCfgW = '{...}; -// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) 
-// `FLOO_TYPEDEF_AXI_FROM_CFG(my_narrow_axi, AxiCfgN) -// `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW) -// `FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) -// `FLOO_TYPEDEF_NW_LINK_ALL(vc_req, vc_rsp, vc_wide, my_req, my_rsp, my_wide) -`define FLOO_TYPEDEF_VC_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, vc_id_t) \ - `FLOO_TYPEDEF_VC_LINK_T(req, req_chan, vc_id_t) \ - `FLOO_TYPEDEF_VC_LINK_T(rsp, rsp_chan, vc_id_t) \ - `FLOO_TYPEDEF_VC_LINK_T(wide, wide_chan, vc_id_t) - `endif // FLOO_NOC_TYPEDEF_SVH_ From e8dba35256feda2b1f3317a1dbbd6352e0002e5e Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 16 Dec 2025 09:11:36 +0100 Subject: [PATCH 12/17] pr: Clean pr to be merged --- hw/deprecated/include/typedef.svh | 85 +++++++++++++++++++++++++++++++ hw/floo_nw_router.sv | 22 +++----- hw/floo_vc_arbiter.sv | 29 ++++++----- hw/include/floo_noc/typedef.svh | 5 +- 4 files changed, 110 insertions(+), 31 deletions(-) create mode 100644 hw/deprecated/include/typedef.svh diff --git a/hw/deprecated/include/typedef.svh b/hw/deprecated/include/typedef.svh new file mode 100644 index 00000000..3c7f5f6f --- /dev/null +++ b/hw/deprecated/include/typedef.svh @@ -0,0 +1,85 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// Authors: +// - Tim Fischer +// - Michael Rogenmoser + +// Macros to define the FlooNoC data types + +`ifndef FLOO_NOC_TYPEDEF_DEPRECATED_SVH_ +`define FLOO_NOC_TYPEDEF_DEPRECATED_SVH_ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Defines the all the link types with credit-based flow control interface +// for use with virtual channels +// +// Arguments: +// - name: Name of the link type +// - chan_name: Name of the channel type to transport +// - vc_id_t: Type of the virtual channel ID +// +// Usage Example: +// localparam floo_pkg::axi_cfg_t AxiCfg = '{...}; +// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) +// `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) +// `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, req, rsp, my_axi_in, AxiCfg, hdr_t) +// FLOO_TYPEDEF_LINK_T(vc_req, my_axi) +`define FLOO_TYPEDEF_VC_LINK_T(name, chan_name, vc_id_t) \ + typedef struct packed { \ + logic valid; \ + logic credit_v; \ + vc_id_t credit_id; \ + floo_``chan_name``_chan_t ``chan_name``; \ + } floo_``name``_t; + + //////////////////////////////////////////////////////////////////////////////////////////////////// +// Defines the all the link types with credit-based flow control interface +// for a single AXI interface configuration +// +// Arguments: +// - req: Name of the `req` link type +// - rsp: Name of the `rsp` link type +// - req_chan: Name of the `req` channel type to transport +// - rsp_chan: Name of the `rsp` channel type to transport +// - vc_id_t: Type of the virtual channel ID +// +// Usage Example: +// localparam floo_pkg::axi_cfg_t AxiCfg = '{...}; +// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) 
+// `FLOO_TYPEDEF_AXI_FROM_CFG(my_axi, AxiCfg) +// `FLOO_TYPEDEF_AXI_CHAN_ALL(my_axi, my_req, my_rsp, my_axi_in, AxiCfg, hdr_t) +// `FLOO_TYPEDEF_VC_AXI_LINK_ALL(vc_req, vc_rsp, my_req, my_rsp) +`define FLOO_TYPEDEF_VC_AXI_LINK_ALL(req, rsp, req_chan, rsp_chan, vc_id_t) \ + `FLOO_TYPEDEF_VC_LINK_T(req, req_chan, vc_id_t) \ + `FLOO_TYPEDEF_VC_LINK_T(rsp, rsp_chan, vc_id_t) \ + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Defines the all the link types with credit-based flow control interface +// for a narrow-wide AXI interface configuration +// +// Arguments: +// - req: Name of the `req` link type +// - rsp: Name of the `rsp` link type +// - wide: Name of the `wide` link type +// - req_chan: Name of the `req` channel type to transport +// - rsp_chan: Name of the `rsp` channel type to transport +// - wide_chan: Name of the `wide` channel type to transport +// - vc_id_t: Type of the virtual channel ID +// +// Usage Example: +// localparam floo_pkg::axi_cfg_t AxiCfgN = '{...}; +// localparam floo_pkg::axi_cfg_t AxiCfgW = '{...}; +// `FLOO_TYPEDEF_HDR_T(hdr_t, ...) 
+// `FLOO_TYPEDEF_AXI_FROM_CFG(my_narrow_axi, AxiCfgN) +// `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW) +// `FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) +// `FLOO_TYPEDEF_NW_LINK_ALL(vc_req, vc_rsp, vc_wide, my_req, my_rsp, my_wide) +`define FLOO_TYPEDEF_VC_NW_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, vc_id_t) \ + `FLOO_TYPEDEF_VC_LINK_T(req, req_chan, vc_id_t) \ + `FLOO_TYPEDEF_VC_LINK_T(rsp, rsp_chan, vc_id_t) \ + `FLOO_TYPEDEF_VC_LINK_T(wide, wide_chan, vc_id_t) + +`endif // FLOO_NOC_TYPEDEF_DEPRECATED_SVH_ diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 9bc10aaa..14336f88 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -94,10 +94,10 @@ module floo_nw_router #( floo_rsp_chan_t [NumOutputs-1:0] rsp_in; floo_wide_chan_t [NumRoutes-1:0][NumWidePhysChannels-1:0] wide_in; floo_wide_chan_t [NumRoutes-1:0][NumWidePhysChannels-1:0] wide_out; - logic [NumInputs-1:0] req_valid_in, req_ready_out; + logic [NumInputs-1:0] req_valid_in, req_ready_out, req_credit_out; logic [NumInputs-1:0] rsp_valid_out, rsp_ready_in; - logic [NumOutputs-1:0] req_valid_out, req_ready_in; - logic [NumOutputs-1:0] rsp_valid_in, rsp_ready_out; + logic [NumOutputs-1:0] req_valid_out, req_ready_in, req_credit_in; + logic [NumOutputs-1:0] rsp_valid_in, rsp_ready_out, rsp_credit_out; logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_valid_in, wide_valid_out; logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_ready_in, wide_ready_out; logic [NumRoutes-1:0][NumWideVirtChannels-1:0] wide_credit_in, wide_credit_out; @@ -105,6 +105,7 @@ module floo_nw_router #( for (genvar i = 0; i < NumInputs; i++) begin : gen_chimney_req assign req_valid_in[i] = floo_req_i[i].valid; assign floo_req_o[i].ready = req_ready_out[i]; + assign floo_req_o[i].credit = req_credit_out[i]; assign req_in[i] = floo_req_i[i].req; assign floo_rsp_o[i].valid = rsp_valid_out[i]; assign rsp_ready_in[i] = 
floo_rsp_i[i].ready; @@ -117,6 +118,7 @@ assign floo_req_o[i].req = req_out[i]; assign rsp_valid_in[i] = floo_rsp_i[i].valid; assign floo_rsp_o[i].ready = rsp_ready_out[i]; + assign floo_rsp_o[i].credit = rsp_credit_out[i]; assign rsp_in[i] = floo_rsp_i[i].rsp; end @@ -131,21 +133,11 @@ module floo_nw_router #( // Generation of credit based conenctions only when necessary if (VcImpl == floo_pkg::VcCredit) begin: gen_credit_connections - // Narrow lreq inks never rely on credit based flow - for (genvar i = 0; i < NumInputs; i++) begin: gen_credit_tied_zero_req - assign floo_req_o[i].credit = '0; - end - // Narrow rsp links never rely on credit based flow - for (genvar i = 0; i < NumOutputs; i++) begin: gen_credit_tied_zero_rsp - assign floo_rsp_o[i].credit = '0; - end // Wide links credit connections for (genvar i = 0; i < NumRoutes; i++) begin: gen_credit_wide assign floo_wide_o[i].credit = wide_credit_out[i]; assign wide_credit_in[i] = floo_wide_i[i].credit; end - end else begin: gen_no_credit_connections - assign wide_credit_in = '0; end floo_router #( @@ -177,7 +169,7 @@ .valid_o ( req_valid_out ), .ready_i ( req_ready_in ), .data_o ( req_out ), - .credit_o ( /*unused */ ) + .credit_o ( req_credit_out ) ); // We construct the masks for the narrow and wide B responses here. 
@@ -230,7 +222,7 @@ module floo_nw_router #( .valid_o ( rsp_valid_out ), .ready_i ( rsp_ready_in ), .data_o ( rsp_out ), - .credit_o ( /*unused */ ) + .credit_o ( rsp_credit_out) /* unused */ ); diff --git a/hw/floo_vc_arbiter.sv b/hw/floo_vc_arbiter.sv index 48e7d0b7..a78096a8 100644 --- a/hw/floo_vc_arbiter.sv +++ b/hw/floo_vc_arbiter.sv @@ -51,22 +51,25 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys // Lock and mask update logic // //////////////////////////////// - always_comb begin - mask_d = mask_q; - // If we have a valid request but no grant, and the - // other VC has a grant, switch VC in the next cycle - if (vc_arb_req_out) begin - if (!vc_arb_gnt_in) begin - if (ready_i[vc_arb_idx ? 0 : 1] && valid_i[vc_arb_idx ? 0 : 1]) begin: gen_valid_mask - mask_d = ~(1'b1 << vc_arb_idx); + if (VcImpl == VcPreemptValid) begin: gen_preempt_valid_mask + always_comb begin + mask_d = mask_q; + // If we have a valid request but no grant, and the + // other VC has a grant, switch VC in the next cycle. This logic works with only two VCs + // A more sophisticated arbitration mechanism would be needed for more VCs. 
+ if (vc_arb_req_out) begin + if (!vc_arb_gnt_in) begin + if (ready_i[~vc_arb_idx] && valid_i[~vc_arb_idx]) begin: gen_valid_mask + mask_d = ~(1'b1 << vc_arb_idx); + end + end else begin + mask_d = '1; end - end else begin - mask_d = '1; end end - end - `FF(mask_q, mask_d, '1, clk_i, rst_ni) + `FF(mask_q, mask_d, '1, clk_i, rst_ni) + end ////////////////////////// // VC arbitration logic // @@ -153,6 +156,6 @@ end else if (NumPhysChannels == 1) begin : gen_single_phys `ASSERT(OneHotOutputValid, $onehot0(valid_o)) // Currently only supports two virtual channels - `ASSERT_INIT(SupportedNumVirtChannels, (VcImpl != floo_pkg::VcNaive) || (NumVirtChannels <= 2)) + `ASSERT_INIT(SupportedNumVirtChannels, (VcImpl == floo_pkg::VcNaive) || (NumVirtChannels <= 2)) endmodule diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh index c1cf0b3c..2cea3a31 100644 --- a/hw/include/floo_noc/typedef.svh +++ b/hw/include/floo_noc/typedef.svh @@ -372,7 +372,6 @@ // - req_chan: Name of the `req` channel type to transport // - rsp_chan: Name of the `rsp` channel type to transport // - wide_chan: Name of the `wide` channel type to transport -// - req_virt_chan: Number of virtual channel for the narrow link // - wide_virt_chan: Number of virtual channel for the wide link // // Usage Example: @@ -383,8 +382,8 @@ // `FLOO_TYPEDEF_AXI_FROM_CFG(my_wide_axi, AxiCfgW) // `FLOO_TYPEDEF_NW_CHAN_ALL(axi, my_req, my_rsp, my_wide, my_axi_narrow_in, my_axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) // `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, my_req, my_rsp, my_wide, 1, 2) -`define FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, req_virt_chan, wide_virt_chan, wide_phys_chan) \ - `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(req, req_chan, req_virt_chan, req_virt_chan) \ +`define FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req_chan, rsp_chan, wide_chan, wide_virt_chan, wide_phys_chan) \ + `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(req, req_chan, 1, 1) \ 
`FLOO_TYPEDEF_VIRT_CHAN_LINK_T(rsp, rsp_chan, 1, 1) \ `FLOO_TYPEDEF_VIRT_CHAN_LINK_T(wide, wide_chan, wide_virt_chan, wide_phys_chan) From 15e48ed48e2a8dbeb740bd5d0857134a62f71d79 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 16 Dec 2025 11:18:12 +0100 Subject: [PATCH 13/17] hw: Tie-off credit in chimney --- hw/floo_nw_chimney.sv | 4 ++++ hw/floo_nw_router.sv | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 8d093934..b7e32d5b 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -1215,6 +1215,10 @@ module floo_nw_chimney #( .ready_i ( floo_rsp_i.ready ), .valid_o ( floo_rsp_o.valid ) ); + if (VcImpl == floo_pkg::VcCredit) begin : gen_credit_tie // Credit is never used for narrow req/rsp + assign floo_req_o.credit = '0; + assign floo_rsp_o.credit = '0; + end if (NumWidePhysChannels == 1) begin: gen_wide_out_wrmh floo_wormhole_arbiter #( diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 14336f88..5d34e09e 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -105,7 +105,6 @@ module floo_nw_router #( for (genvar i = 0; i < NumInputs; i++) begin : gen_chimney_req assign req_valid_in[i] = floo_req_i[i].valid; assign floo_req_o[i].ready = req_ready_out[i]; - assign floo_req_o[i].credit = req_credit_out[i]; assign req_in[i] = floo_req_i[i].req; assign floo_rsp_o[i].valid = rsp_valid_out[i]; assign rsp_ready_in[i] = floo_rsp_i[i].ready; @@ -118,7 +117,6 @@ module floo_nw_router #( assign floo_req_o[i].req = req_out[i]; assign rsp_valid_in[i] = floo_rsp_i[i].valid; assign floo_rsp_o[i].ready = rsp_ready_out[i]; - assign floo_rsp_o[i].credit = rsp_credit_out[i]; assign rsp_in[i] = floo_rsp_i[i].rsp; end @@ -133,6 +131,13 @@ module floo_nw_router #( // Generation of credit based conenctions only when necessary if (VcImpl == floo_pkg::VcCredit) begin: gen_credit_connections + // Narrow links credit connections + for (genvar i = 0; i < NumInputs; i++) 
begin: gen_credit_req + assign floo_req_o[i].credit = req_credit_out[i]; + end + for (genvar i = 0; i < NumOutputs; i++) begin: gen_credit_rsp + assign floo_rsp_o[i].credit = rsp_credit_out[i]; + end // Wide links credit connections for (genvar i = 0; i < NumRoutes; i++) begin: gen_credit_wide assign floo_wide_o[i].credit = wide_credit_out[i]; From 0a89b830fd7d024330f9d143ef8fe4d33bea8153 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 16 Dec 2025 16:33:06 +0100 Subject: [PATCH 14/17] hw: Improve VC parametrization --- .github/workflows/lint.yml | 2 +- hw/floo_nw_chimney.sv | 19 ++++++++----------- hw/floo_nw_router.sv | 10 ++++++---- hw/floo_pkg.sv | 14 +++++++++++++- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ca2b76bf..94f9c33f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check Bender up-to-date - uses: pulp-platform/pulp-actions/bender-up-to-date@v2 + uses: pulp-platform/pulp-actions/bender-up-to-date@v2.4.4 ################ # Verible Lint # diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index b7e32d5b..06d476b6 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -32,15 +32,8 @@ module floo_nw_chimney #( /// Every atomic transactions needs to have a unique ID /// and one ID is reserved for non-atomic transactions parameter int unsigned MaxAtomicTxns = 1, - /// Enable support for decoupling read and write channels - /// when enabled, the wide read and write transactions - /// use separate channels (virtual or physical) - parameter bit EnDecoupledRW = 1'b0, - /// Specify how many physical channel are used for the wide connection - /// This parameter is used together with `EnDecoupledRW`. When read - /// and write channekls are decoupled, the two streams can either share - /// a single physical channel or use two separate physical channels. 
- parameter int unsigned NumWidePhysChannels = 1, + /// Enable or disable decoupling of read and write transfers on the wide link + parameter floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::None, /// Specify which VC implementation to use for the wide channels parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Node ID type for routing @@ -156,7 +149,10 @@ module floo_nw_chimney #( // For future extension, add an extra opcode in the user_struct_t typedef axi_addr_t user_mask_t ; - localparam int unsigned NumVirtualChannels = EnDecoupledRW ? 2 : 1; + // Derive paramters for decoupling read and write + localparam bit EnDecoupledRW = (WideRwDecouple != floo_pkg::None); + localparam int unsigned NumVirtualChannels = (WideRwDecouple == floo_pkg::None) ? 1 : 2; + localparam int unsigned NumWidePhysChannels = (WideRwDecouple == floo_pkg::Phys) ? 2 : 1; // Duplicate AXI port signals to degenerate ports // in case they are not used @@ -1215,7 +1211,8 @@ module floo_nw_chimney #( .ready_i ( floo_rsp_i.ready ), .valid_o ( floo_rsp_o.valid ) ); - if (VcImpl == floo_pkg::VcCredit) begin : gen_credit_tie // Credit is never used for narrow req/rsp + // Credit is never used for narrow req/rsp + if (VcImpl == floo_pkg::VcCredit) begin : gen_credit_tie assign floo_req_o.credit = '0; assign floo_rsp_o.credit = '0; end diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 5d34e09e..232aea06 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -29,10 +29,9 @@ module floo_nw_router #( /// Disable illegal connections in router /// (only applies for `RouteAlgo == XYRouting`) parameter bit XYRouteOpt = 1'b1, - /// Enable decoupling between Read and Write WIDE channels using virtual channels - /// assumed that write transactions are alwasy on VC0. 
- parameter int unsigned NumWideVirtChannels = 32'd1, - parameter int unsigned NumWidePhysChannels = 32'd1, + /// Enable decoupling between Read and Write WIDE channels using virtual or + /// physical channels: assumed that write transactions are alwasy on VC0. + parameter floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::None, parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Enable multicast feature parameter bit EnMultiCast = 1'b0, @@ -71,6 +70,9 @@ module floo_nw_router #( output floo_wide_t [NumRoutes-1:0] floo_wide_o ); + localparam int unsigned NumWidePhysChannels = (WideRwDecouple == floo_pkg::Phys) ? 2 : 1; + localparam int unsigned NumWideVirtChannels = (WideRwDecouple == floo_pkg::None) ? 1 : 2; + typedef logic [AxiCfgN.AddrWidth-1:0] axi_addr_t; typedef logic [AxiCfgN.InIdWidth-1:0] axi_narrow_in_id_t; typedef logic [AxiCfgN.UserWidth-1:0] axi_narrow_user_t; diff --git a/hw/floo_pkg.sv b/hw/floo_pkg.sv index 2844f666..cbb048a2 100644 --- a/hw/floo_pkg.sv +++ b/hw/floo_pkg.sv @@ -99,12 +99,24 @@ package floo_pkg; VcPreemptValid = 2'd2 } vc_impl_e; - // Virtual channel index association for read and write channels + /// Virtual channel index association for read and write channels typedef enum logic { Read = 1'b1, Write = 1'b0 } vc_e; + /// Implementation of the read/write wide scheme + typedef enum logic[1:0] { + /// Share same wide link for read and write channels + /// this create a coupling between read and write transfers + None = 2'd0, + /// Decouple read and write transfers, using vc_e implementation + /// Write transactions are always assigned to VC0, while Read to VC1 + Vc = 2'd1, + /// Decouple read and write transfers, using separate wide links + Phys = 2'd2 + } wide_rw_decouple_e; + /// The types of collective communication typedef enum logic [1:0] { /// Normal communication From 1ba464c7ba8cdacf1034fe437c31b2b1430b945e Mon Sep 17 00:00:00 2001 From: Raphael Date: Wed, 16 Jul 2025 17:51:10 +0200 Subject: [PATCH 15/17] hw: 
Introduce support for reduction operations When an endpoint initiates a wide multicast DMA transfer from another endpoint to itself (and possibly other endpoints), the following deadlock occurs. The DMA issues an AR, causing a read burst to come back from the router to the initiator endpoint. When the first read beats arrive, the DMA issues a write burst. This write burst loops back to the initiator endpoint, and may take control of the physical link. As writes lock the link for the entire burst, but the burst cannot complete as it needs the stalling read beats to feed the write burst (a DMA requirement), there is a deadlock. Note, this can happen also without multicast, so long as the system uses loopback. hw: Add spill registers for VCs in chimney hw: Fix reduction support with VC for read and write (working) hw: Merging collective configuration parameters hw: Initial setup for PnR experiments hw: Add internal offload cuts hw: Fix parametrization multicast hw: Fix parametrization adding micro collective ops pnr: Adapt synth wrapper to script fixes to script synthesis runs hw: Create two physical channels on wide_in interface of eject port hw: Support outstanding barriers with overlapping inputs hw: Fix `floo_reduction_sync` hw: Add comment to add assertion in `floo_route_xymask` synt: Fix synth wrapper and include chimney hw: Clean reduction_sync hw: Re-implement reduction simple controller (reduction_unit) synth: Adapt supprot for chimney with VC --- Bender.lock | 17 +- Bender.yml | 9 +- Makefile | 2 +- hw/floo_alu.sv | 394 ++++++++++ hw/floo_id_translation.sv | 5 +- hw/floo_meta_buffer.sv | 7 +- hw/floo_nw_chimney.sv | 5 +- hw/floo_nw_router.sv | 300 +++++--- hw/floo_offload_reduction.sv | 545 ++++++++++++++ hw/floo_offload_reduction_buffer.sv | 128 ++++ hw/floo_offload_reduction_controller.sv | 924 ++++++++++++++++++++++++ hw/floo_offload_reduction_stalling.sv | 101 +++ hw/floo_offload_reduction_taggen.sv | 248 +++++++ hw/floo_output_arbiter.sv | 170 +++-- 
hw/floo_pkg.sv | 371 +++++++++- hw/floo_reduction_arbiter.sv | 208 ++++-- hw/floo_reduction_sync.sv | 67 +- hw/floo_reduction_unit.sv | 331 +++++++++ hw/floo_route_comp.sv | 6 +- hw/floo_route_select.sv | 74 +- hw/floo_route_xymask.sv | 223 ++++-- hw/floo_router.sv | 362 ++++++++-- hw/include/floo_noc/typedef.svh | 11 +- hw/synth/floo_synth_nw_chimney.sv | 55 +- hw/synth/floo_synth_nw_router.sv | 117 ++- hw/synth/floo_synth_params_pkg.sv | 305 ++++++++ hw/synth/snitch_cluster_pkg.sv | 734 +++++++++++++++++++ hw/test/floo_reduction_offloads.sv | 338 +++++++++ hw/test/floo_test_pkg.sv | 5 +- 29 files changed, 5598 insertions(+), 464 deletions(-) create mode 100644 hw/floo_alu.sv create mode 100644 hw/floo_offload_reduction.sv create mode 100644 hw/floo_offload_reduction_buffer.sv create mode 100644 hw/floo_offload_reduction_controller.sv create mode 100644 hw/floo_offload_reduction_stalling.sv create mode 100644 hw/floo_offload_reduction_taggen.sv create mode 100644 hw/floo_reduction_unit.sv create mode 100644 hw/synth/snitch_cluster_pkg.sv create mode 100644 hw/test/floo_reduction_offloads.sv diff --git a/Bender.lock b/Bender.lock index 7102334a..585ee8a4 100644 --- a/Bender.lock +++ b/Bender.lock @@ -10,7 +10,7 @@ packages: revision: 78831b6feba265d5ee2683bbf42b4150f8a35c43 version: 0.39.8 source: - Git: https://github.com/pulp-platform/axi.git + Git: https://github.com/Lura518/axi.git dependencies: - common_cells - common_verification @@ -51,6 +51,21 @@ packages: source: Path: pd dependencies: [] + fpnew: + revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 + version: null + source: + Git: https://github.com/pulp-platform/cvfpu.git + dependencies: + - common_cells + - fpu_div_sqrt_mvp + fpu_div_sqrt_mvp: + revision: 86e1f558b3c95e91577c41b2fc452c86b04e85ac + version: 1.0.4 + source: + Git: https://github.com/pulp-platform/fpu_div_sqrt_mvp.git + dependencies: + - common_cells idma: revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e version: 0.6.5 diff --git 
a/Bender.yml b/Bender.yml index 23fcd4c1..e8cabc75 100644 --- a/Bender.yml +++ b/Bender.yml @@ -16,6 +16,7 @@ dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.39.7 } axi_riscv_atomics: { git: "https://github.com/pulp-platform/axi_riscv_atomics.git", version: 0.8.2 } floo_noc_pd: {path: ./pd} + FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } export_include_dirs: - hw/include @@ -36,16 +37,18 @@ sources: - hw/floo_rob_wrapper.sv - hw/floo_reduction_sync.sv - hw/floo_route_xymask.sv + - hw/floo_alu.sv # Level 2 - hw/floo_route_select.sv - hw/floo_route_comp.sv - hw/floo_meta_buffer.sv - hw/floo_reduction_arbiter.sv + - hw/floo_reduction_unit.sv # Level 3 - hw/floo_output_arbiter.sv # Level 4 - hw/floo_nw_join.sv - - hw/floo_axi_chimney.sv + #- hw/floo_axi_chimney.sv - hw/floo_nw_chimney.sv - hw/floo_router.sv # Level 5 (Wrappers) @@ -67,13 +70,15 @@ sources: - hw/test/axi_reorder_remap_compare.sv - hw/test/axi_bw_monitor.sv - hw/test/floo_hbm_model.sv +# - hw/test/floo_reduction_offloads.sv - hw/test/axi_aw_w_sync.sv # Level 2 - - hw/tb/tb_floo_axi_chimney.sv + #- hw/tb/tb_floo_axi_chimney.sv - hw/tb/tb_floo_nw_chimney.sv - hw/tb/tb_floo_router.sv - hw/tb/tb_floo_rob.sv - hw/tb/tb_floo_rob_multicast.sv + #- hw/tb/tb_floo_rob_multicast.sv - target: all(any(floo_test, floo_synth), axi_mesh) files: diff --git a/Makefile b/Makefile index 8d63485d..f96c6f20 100644 --- a/Makefile +++ b/Makefile @@ -187,7 +187,7 @@ clean-spyglass: ################### PD_REMOTE ?= git@iis-git.ee.ethz.ch:axi-noc/floo_noc_pd.git -PD_BRANCH ?= master +PD_BRANCH ?= feature/reduction PD_DIR = $(FLOO_ROOT)/pd .PHONY: init-pd diff --git a/hw/floo_alu.sv b/hw/floo_alu.sv new file mode 100644 index 00000000..e5396b76 --- /dev/null +++ b/hw/floo_alu.sv @@ -0,0 +1,394 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Raphael Roth + +// This 32 bit alu is designed to be used as an easy offload unit for the offload reduction. +// It should resemble the FPU from openhw group and could potentially be extended. + +`include "common_cells/assertions.svh" + +package alu_pkg; + // STRONGLY Inspired by the fpnew from openhw group! + + // --------- + // INT TYPES + // --------- + // | Enumerator | Width | + // |:----------:|-------:| + // | INT8 | 8 bit | + // | UINT8 | 8 bit | + // | INT16 | 16 bit | + // | UINT16 | 16 bit | + // | INT32 | 32 bit | + // | UINT32 | 32 bit | + // | INT64 | 64 bit | + // | UINT64 | 64 bit | + // *NOTE:* Add new formats only at the end of the enumeration for backwards compatibilty! + localparam int unsigned NUM_INT_FORMATS = 8; + localparam int unsigned INT_FORMAT_BITS = $clog2(NUM_INT_FORMATS); + + // Int formats (Uint required for differentation between signed / unsigned min max) + typedef enum logic [INT_FORMAT_BITS-1:0] { + INT8, + UINT8, + INT16, + UINT16, + INT32, + UINT32, + INT64, + UINT64 + // add new formats here + } alu_int_format_e; + + // Returns the width of an INT format by index + function automatic int unsigned int_width(alu_int_format_e ifmt); + unique case (ifmt) + INT8: return 8; + UINT8: return 8; + INT16: return 16; + UINT16: return 16; + INT32: return 32; + UINT32: return 32; + INT64: return 64; + UINT64: return 64; + default: begin + // pragma translate_off + $fatal(1, "Invalid INT format supplied"); + // pragma translate_on + // just return any integer to avoid any latches + // hopefully this error is caught by simulation + return INT8; + end + endcase + endfunction + + // -------------- + // ALU OPERATIONS + // -------------- + localparam int unsigned NUM_INT_OPERATION = 4; + localparam int unsigned INT_OPERATION_BITS = $clog2(NUM_INT_OPERATION); + + // Int Operation + typedef enum logic [INT_OPERATION_BITS-1:0] { + ADD, + MUL, + MIN, + MAX + } alu_operation_e; + + // -------------- + // 
STATUS + // -------------- + typedef struct packed { + logic is_zero; + } alu_status_t; + +endpackage + +// Wrapper incl. decoder for the ALU +module floo_reduction_alu import floo_pkg::*; #() ( + input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// IF towards external FPU + input logic[63:0] alu_req_op1_i, + input logic[63:0] alu_req_op2_i, + input collect_op_e alu_req_type_i, + input logic alu_req_valid_i, + output logic alu_req_ready_o, + /// IF from external ALU + output logic[63:0] alu_resp_data_o, + output logic alu_resp_valid_o, + input logic alu_resp_ready_i +); + + /* All local parameter */ + + /* All Typedef Vars */ + + // Typedef for the input of the ALU + typedef struct packed { + logic [1:0][63:0] operands; + alu_pkg::alu_operation_e op; + alu_pkg::alu_int_format_e fmt; + logic vectorial_op; + } alu_in_t; + + // Typedef for the output of the ALU + typedef struct packed { + logic [63:0] result; + } alu_out_t; + + /* Variable declaration */ + alu_in_t alu_in; + alu_out_t alu_out; + + /* Module Declaration */ + + // Parse the ALU request + always_comb begin + // Init default values + alu_in = '0; + + // Set default Values + alu_in.vectorial_op = 1'b0; + alu_in.operands[0] = alu_req_op1_i; + alu_in.operands[1] = alu_req_op2_i; + + // Define the operation we want to execute on the FPU + unique casez (alu_req_type_i) + (floo_pkg::A_Add) : begin + alu_in.op = alu_pkg::ADD; + alu_in.fmt = alu_pkg::INT32; + end + (floo_pkg::A_Mul) : begin + alu_in.op = alu_pkg::MUL; + alu_in.fmt = alu_pkg::INT32; + end + (floo_pkg::A_Min_S) : begin + alu_in.op = alu_pkg::MIN; + alu_in.fmt = alu_pkg::INT32; + end + (floo_pkg::A_Min_U) : begin + alu_in.op = alu_pkg::MIN; + alu_in.fmt = alu_pkg::UINT32; + end + (floo_pkg::A_Max_S) : begin + alu_in.op = alu_pkg::MAX; + alu_in.fmt = alu_pkg::INT32; + end + (floo_pkg::A_Max_U) : begin + alu_in.op = alu_pkg::MAX; + alu_in.fmt = alu_pkg::UINT32; + end + default : begin + alu_in.op = alu_pkg::ADD; + alu_in.fmt = 
alu_pkg::INT32; + end + endcase + end + + // Instanciate the ALU + floo_alu_top #( + .tag_t (logic), + .CutOutput (1'b1), + .CutInput (1'b0) + ) i_alu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .operands_i (alu_in.operands), + .op_i (alu_in.op), + .fmt_i (alu_in.fmt), + .vector_mode_i (alu_in.vectorial_op), + .tag_i (1'b0), + .in_valid_i (alu_req_valid_i), + .in_ready_o (alu_req_ready_o), + .result_o (alu_out.result), + .status_o (), + .tag_o (), + .out_valid_o (alu_resp_valid_o), + .out_ready_i (alu_resp_ready_i) + ); + + // Assign the output signal of the ALU + assign alu_resp_data_o = alu_out.result; + +endmodule + +// ALU Module which should similar to the fpnew module from the openhw group +module floo_alu_top #( + parameter type tag_t = logic, + parameter bit CutOutput = 1'b1, + parameter bit CutInput = 1'b1, + // Do not change + localparam int unsigned WIDTH = 64, + localparam int unsigned NUM_OPERANDS = 2 +) ( + input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// Input Signal + input logic [NUM_OPERANDS-1:0][WIDTH-1:0] operands_i, + input alu_pkg::alu_operation_e op_i, + input alu_pkg::alu_int_format_e fmt_i, + input logic vector_mode_i, + input tag_t tag_i, + input logic in_valid_i, + output logic in_ready_o, + /// Output Signal + output logic [WIDTH-1:0] result_o, + output alu_pkg::alu_status_t status_o, + output tag_t tag_o, + output logic out_valid_o, + input logic out_ready_i +); + +/* All local parameter */ + +/* All Typedef Vars */ + +// Typedefs for the cut to avoid a cut for everything +typedef struct packed { + logic [NUM_OPERANDS-1:0][WIDTH-1:0] operands; + alu_pkg::alu_operation_e op; + alu_pkg::alu_int_format_e fmt; + logic vector_mode; + tag_t tag; +} cut_input_t; + +typedef struct packed { + logic [WIDTH-1:0] result; + alu_pkg::alu_status_t status; + tag_t tag; +} cut_output_t; + +/* Variable declaration */ + +// Vars after the input cut +logic [NUM_OPERANDS-1:0][WIDTH-1:0] operands_q; 
+alu_pkg::alu_operation_e op_q; +alu_pkg::alu_int_format_e fmt_q; +logic vector_mode_q; +tag_t tag_q; +logic in_valid_q; +logic in_ready_q; + +// Vars with the result infront of the output cut +logic [WIDTH-1:0] result_d; +alu_pkg::alu_status_t status_d; +tag_t tag_d; +logic out_valid_d; +logic out_ready_d; + +// trunc'ed signal to support only 32 Bit signal +logic [NUM_OPERANDS-1:0][31:0] operands_32; +logic [31:0] res_32; +logic [31:0] adder_res_32; +logic [31:0] mul_res_32; +logic [31:0] min_res_32; +logic [31:0] max_res_32; + +/* Module Declaration */ + +// Input Cut to split the ALU from the rest of the system +if (CutInput == 1'b1) begin + spill_register_flushable #( + .T (cut_input_t), + .Bypass (1'b0) + ) i_output_cut ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (in_valid_i), + .flush_i (flush_i), + .ready_o (in_ready_o), + .data_i ({operands_i, op_i, fmt_i, vector_mode_i, tag_i}), + .valid_o (in_valid_q), + .ready_i (in_ready_q), + .data_o ({operands_q, op_q, fmt_q, vector_mode_q, tag_q}) + ); +end else begin + assign operands_q = operands_i; + assign op_q = op_i; + assign fmt_q = fmt_i; + assign vector_mode_q = vector_mode_i; + assign tag_q = tag_i; + assign in_valid_q = in_valid_i; + assign in_ready_o = in_ready_q; +end + +// Implement ALU here +// Parse both operands to 32 Bit +for (genvar i = 0; i < NUM_OPERANDS;i++) begin + assign operands_32[i] = operands_q[i][31:0]; +end + +// Adder Path +assign adder_res_32 = operands_32[1] + operands_32[0]; + +// Multiplier Path +always_comb begin + mul_res_32 = '0; + for (int i = 0; i < 32; i++) begin + mul_res_32 = (|((operands_32[0] >> i) & 1)) ? mul_res_32 + (operands_32[1] << i) : mul_res_32; + end +end + +// Min / Max Path +always_comb begin : gen_minmax + logic sign; + + max_res_32 = '0; + min_res_32 = '0; + sign = 1'b0; + + // Determint if we require sign > When we extend the signal by 1 bit then we can use the signed hw + // for both the signed and unsigned case. 
+ if(fmt_q == alu_pkg::INT32) begin + sign = 1'b1; + end + + // Calc the min / max signal in the same case + if($signed({sign & operands_32[0][31], operands_32[0]}) > $signed({sign & operands_32[1][31], operands_32[1]})) begin + max_res_32 = operands_32[0]; + min_res_32 = operands_32[1]; + end else begin + max_res_32 = operands_32[1]; + min_res_32 = operands_32[0]; + end +end + +// Mux the result together +always_comb begin : result_mux + res_32 = '0; + unique case (op_i) + alu_pkg::ADD: res_32 = adder_res_32; + alu_pkg::MUL: res_32 = mul_res_32; + alu_pkg::MIN: res_32 = min_res_32; + alu_pkg::MAX: res_32 = max_res_32; + default: res_32 = '0; + endcase +end + +// Sign extend the 32 Bit result +assign result_d = {{32{res_32[31]}},res_32}; + +// Bypass tag & handshake +assign tag_d = tag_q; +assign out_valid_d = in_valid_q; +assign in_ready_q = out_ready_d; +assign status_d.is_zero = ~ (|res_32); // Or Connect all signal and invert to determin if we have a 0 signal + +// introduce cut at output of ALU +if (CutOutput == 1'b1) begin + spill_register_flushable #( + .T (cut_output_t), + .Bypass (1'b0) + ) i_output_cut ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (out_valid_d), + .flush_i (flush_i), + .ready_o (out_ready_d), + .data_i ({result_d, status_d, tag_d}), + .valid_o (out_valid_o), + .ready_i (out_ready_i), + .data_o ({result_o, status_o, tag_o}) + ); +end else begin + assign result_o = result_d; + assign status_o = status_d; + assign tag_o = tag_d; + assign out_valid_o = out_valid_d; + assign out_ready_d = out_ready_i; +end + +/* Assertions for the module */ + +// Currently we only support 32Bit operations! 
Could be extended in the future +`ASSERT(Invalid_Input, !((fmt_i != alu_pkg::INT32) && (fmt_i != alu_pkg::UINT32))) +`ASSERT(Invalid_Vector_Ops, !(vector_mode_i != 1'b0)) + +endmodule diff --git a/hw/floo_id_translation.sv b/hw/floo_id_translation.sv index 1743520a..629144ef 100644 --- a/hw/floo_id_translation.sv +++ b/hw/floo_id_translation.sv @@ -35,6 +35,7 @@ module floo_id_translation #( output mask_sel_t mask_addr_y_o ); + localparam bit EnCollective = floo_pkg::is_en_collective(RouteCfg.CollectiveCfg.OpCfg); if (RouteCfg.UseIdTable) begin : gen_addr_decoder logic dec_error; @@ -58,9 +59,9 @@ module floo_id_translation #( ); `ASSERT(DecodeError, !(dec_error && valid_i), clk_i, !rst_ni, - $sformatf("Error decoding address 0x%0x.", addr_i)); + $sformatf("Error decoding address 0x%0x.", $sampled(addr_i))); - if (RouteCfg.EnMultiCast) begin: gen_mcast_id_mask + if (EnCollective) begin: gen_mcast_id_mask assign mask_addr_x_o = idx_out.mask_x; assign mask_addr_y_o = idx_out.mask_y; assign id_o = idx_out.id; diff --git a/hw/floo_meta_buffer.sv b/hw/floo_meta_buffer.sv index 64ba8e2c..5fd8d63a 100644 --- a/hw/floo_meta_buffer.sv +++ b/hw/floo_meta_buffer.sv @@ -68,6 +68,9 @@ module floo_meta_buffer #( typedef logic [OutIdWidth-1:0] id_out_t; typedef logic [IdMinWidth-1:0] id_min_t; + // Collective operations parameters + localparam bit EnCollective = floo_pkg::is_en_collective(RouteCfg.CollectiveCfg.OpCfg); + logic ar_no_atop_buf_full, aw_no_atop_buf_full; logic ar_no_atop_push, aw_no_atop_push; logic ar_no_atop_pop, aw_no_atop_pop; @@ -218,7 +221,7 @@ module floo_meta_buffer #( // NoC addr/mask to AXI addr/mask conversion localparam int unsigned AddrWidth = $bits(addr_t); - if (RouteCfg.EnMultiCast && RouteCfg.UseIdTable && + if (EnCollective && RouteCfg.UseIdTable && (RouteCfg.RouteAlgo == floo_pkg::XYRouting)) begin : gen_mcast_table_conversion id_t out, in_mask, in_id; @@ -227,7 +230,7 @@ module floo_meta_buffer #( addr_t in_addr; id_t base_id; - assign in_mask = 
aw_buf_i.hdr.mask; + assign in_mask = aw_buf_i.hdr.collective_mask; assign in_id = aw_buf_i.hdr.dst_id; assign base_id = '{x: x_mask_sel.base_id, y: y_mask_sel.base_id, port_id: '0}; assign out = ((~in_mask & in_id) | (in_mask & id_i)) & ~base_id; diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 06d476b6..70e03a05 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -84,6 +84,7 @@ module floo_nw_chimney #( parameter type floo_rsp_t = logic, /// Floo `wide` link type parameter type floo_wide_t = logic, + parameter type floo_wide_in_t = logic, /// SRAM configuration type `tc_sram_impl` in RoB /// Only used if technology-dependent SRAM is used parameter type sram_cfg_t = logic, @@ -117,7 +118,7 @@ module floo_nw_chimney #( /// Input links from NoC input floo_req_t floo_req_i, input floo_rsp_t floo_rsp_i, - input floo_wide_t floo_wide_i + input floo_wide_in_t floo_wide_i ); import floo_pkg::*; @@ -153,6 +154,8 @@ module floo_nw_chimney #( localparam bit EnDecoupledRW = (WideRwDecouple != floo_pkg::None); localparam int unsigned NumVirtualChannels = (WideRwDecouple == floo_pkg::None) ? 1 : 2; localparam int unsigned NumWidePhysChannels = (WideRwDecouple == floo_pkg::Phys) ? 
2 : 1; + // Collective communication configuration + localparam floo_pkg::collect_op_fe_cfg_t CollectOpCfg = RouteCfg.CollectiveCfg.OpCfg; // Duplicate AXI port signals to degenerate ports // in case they are not used diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index 232aea06..b7dfe620 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -11,46 +11,61 @@ /// Wrapper of a multi-link router for narrow and wide links module floo_nw_router #( /// Config of the narrow AXI interfaces (see floo_pkg::axi_cfg_t for details) - parameter floo_pkg::axi_cfg_t AxiCfgN = '0, + parameter floo_pkg::axi_cfg_t AxiCfgN = '0, /// Config of the wide AXI interfaces (see floo_pkg::axi_cfg_t for details) - parameter floo_pkg::axi_cfg_t AxiCfgW = '0, + parameter floo_pkg::axi_cfg_t AxiCfgW = '0, /// Routing algorithm - parameter floo_pkg::route_algo_e RouteAlgo = floo_pkg::XYRouting, + parameter floo_pkg::route_algo_e RouteAlgo = floo_pkg::XYRouting, /// Number of input/output ports - parameter int unsigned NumRoutes = 0, + parameter int unsigned NumRoutes = 0, /// Number of input ports - parameter int unsigned NumInputs = NumRoutes, + parameter int unsigned NumInputs = NumRoutes, /// Number of output ports - parameter int unsigned NumOutputs = NumRoutes, + parameter int unsigned NumOutputs = NumRoutes, /// Input buffer depth - parameter int unsigned InFifoDepth = 0, + parameter int unsigned InFifoDepth = 0, /// Output buffer depth - parameter int unsigned OutFifoDepth = 0, + parameter int unsigned OutFifoDepth = 0, /// Disable illegal connections in router /// (only applies for `RouteAlgo == XYRouting`) parameter bit XYRouteOpt = 1'b1, - /// Enable decoupling between Read and Write WIDE channels using virtual or + /// Disables loopback connections + parameter bit NoLoopback = 1'b1, + /// Enable decoupling between Read and Write WIDE channels using virtual or /// physical channels: assumed that write transactions are alwasy on VC0. 
parameter floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::None, parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, - /// Enable multicast feature - parameter bit EnMultiCast = 1'b0, /// Node ID type - parameter type id_t = logic, + parameter type id_t = logic, /// Header type - parameter type hdr_t = logic, + parameter type hdr_t = logic, /// Number of rules in the route table /// (only used for `RouteAlgo == IdTable`) - parameter int unsigned NumAddrRules = 0, + parameter int unsigned NumAddrRules = 0, /// Address rule type /// (only used for `RouteAlgo == IdTable`) - parameter type addr_rule_t = logic, + parameter type addr_rule_t = logic, /// Floo `req` link type - parameter type floo_req_t = logic, + parameter type floo_req_t = logic, /// Floo `rsp` link type - parameter type floo_rsp_t = logic, + parameter type floo_rsp_t = logic, /// Floo `wide` link type - parameter type floo_wide_t = logic + parameter type floo_wide_t = logic, + parameter type floo_wide_out_t = logic, + /// Possible operation for offloading (must match type in header) + parameter type RdWideOperation_t = logic, + parameter type RdNarrowOperation_t = logic, + /// Data type of the offload reduction + parameter type RdWideData_t = logic, + parameter type RdNarrowData_t = logic, + /// Parameter to define which type of collective operation support + parameter floo_pkg::collect_op_fe_cfg_t CollectiveOpCfg = floo_pkg::CollectiveOpDefaultCfg, + /// Parameter for the wide reduction configuration + parameter floo_pkg::reduction_cfg_t RdWideCfg = floo_pkg::ReductionDefaultCfg, + /// Parameter for the narrow reduction configuration + parameter floo_pkg::reduction_cfg_t RdNarrowCfg = floo_pkg::ReductionDefaultCfg, + /// Paramter for the response router + parameter floo_pkg::reduction_cfg_t RdRespCfg = floo_pkg::ReductionDefaultCfg ) ( input logic clk_i, input logic rst_ni, @@ -62,12 +77,32 @@ module floo_nw_router #( /// (only used for `RouteAlgo == IdTable`) input addr_rule_t 
[NumAddrRules-1:0] id_route_map_i, /// Input and output links - input floo_req_t [NumInputs-1:0] floo_req_i, - input floo_rsp_t [NumOutputs-1:0] floo_rsp_i, - output floo_req_t [NumOutputs-1:0] floo_req_o, - output floo_rsp_t [NumInputs-1:0] floo_rsp_o, - input floo_wide_t [NumRoutes-1:0] floo_wide_i, - output floo_wide_t [NumRoutes-1:0] floo_wide_o + input floo_req_t [NumInputs-1:0] floo_req_i, + input floo_rsp_t [NumOutputs-1:0] floo_rsp_i, + output floo_req_t [NumOutputs-1:0] floo_req_o, + output floo_rsp_t [NumInputs-1:0] floo_rsp_o, + input floo_wide_t [NumRoutes-1:0] floo_wide_i, + output floo_wide_out_t [NumRoutes-1:0] floo_wide_o, + /// Wide IF towards the offload logic + output RdWideOperation_t offload_wide_req_op_o, + output RdWideData_t offload_wide_req_operand1_o, + output RdWideData_t offload_wide_req_operand2_o, + output logic offload_wide_req_valid_o, + input logic offload_wide_req_ready_i, + /// Wide IF from external FPU + input RdWideData_t offload_wide_resp_result_i, + input logic offload_wide_resp_valid_i, + output logic offload_wide_resp_ready_o, + /// Narrow IF towards the offload logic + output RdNarrowOperation_t offload_narrow_req_op_o, + output RdNarrowData_t offload_narrow_req_operand1_o, + output RdNarrowData_t offload_narrow_req_operand2_o, + output logic offload_narrow_req_valid_o, + input logic offload_narrow_req_ready_i, + /// Narrow IF from external FPU + input RdNarrowData_t offload_narrow_resp_result_i, + input logic offload_narrow_resp_valid_i, + output logic offload_narrow_resp_ready_o ); localparam int unsigned NumWidePhysChannels = (WideRwDecouple == floo_pkg::Phys) ? 
2 : 1; @@ -90,6 +125,63 @@ module floo_nw_router #( axi_wide_in_id_t, axi_wide_data_t, axi_wide_strb_t, axi_wide_user_t) `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow, axi_wide, AxiCfgN, AxiCfgW, hdr_t) + // Local parameters to properly configure collective operation support + // hiding the complexity to the user + localparam int unsigned WideVirtChannel = (EnDecoupledRW) ? 2 : 1; + + localparam floo_pkg::collect_op_be_cfg_t CollectiveReqCfg = '{ + EnMulticast : CollectiveOpCfg.EnNarrowMulticast, + EnLSBAnd : CollectiveOpCfg.EnLSBAnd, + EnF_Add : 1'b0, + EnF_Mul : 1'b0, + EnF_Min : 1'b0, + EnF_Max : 1'b0, + EnA_Add : CollectiveOpCfg.EnA_Add, + EnA_Mul : CollectiveOpCfg.EnA_Mul, + EnA_Min_S : CollectiveOpCfg.EnA_Min_S, + EnA_Min_U : CollectiveOpCfg.EnA_Min_U, + EnA_Max_S : CollectiveOpCfg.EnA_Max_S, + EnA_Max_U : CollectiveOpCfg.EnA_Max_U, + EnSelectAW : CollectiveOpCfg.EnLSBAnd, + EnCollectB : 1'b0 + }; + + localparam floo_pkg::collect_op_be_cfg_t CollectiveRspCfg = '{ + EnMulticast : floo_pkg::is_en_narrow_reduction(CollectiveOpCfg) | + floo_pkg::is_en_wide_reduction(CollectiveOpCfg), + EnLSBAnd : 1'b0, + EnF_Add : 1'b0, + EnF_Mul : 1'b0, + EnF_Min : 1'b0, + EnF_Max : 1'b0, + EnA_Add : 1'b0, + EnA_Mul : 1'b0, + EnA_Min_S : 1'b0, + EnA_Min_U : 1'b0, + EnA_Max_S : 1'b0, + EnA_Max_U : 1'b0, + EnSelectAW : 1'b0, + EnCollectB : CollectiveOpCfg.EnNarrowMulticast | + CollectiveOpCfg.EnWideMulticast + }; + + localparam floo_pkg::collect_op_be_cfg_t CollectiveWideCfg = '{ + EnMulticast : CollectiveOpCfg.EnWideMulticast, + EnLSBAnd : 1'b0, + EnF_Add : CollectiveOpCfg.EnF_Add, + EnF_Mul : CollectiveOpCfg.EnF_Mul, + EnF_Min : CollectiveOpCfg.EnF_Min, + EnF_Max : CollectiveOpCfg.EnF_Max, + EnA_Add : 1'b0, + EnA_Mul : 1'b0, + EnA_Min_S : 1'b0, + EnA_Min_U : 1'b0, + EnA_Max_S : 1'b0, + EnA_Max_U : 1'b0, + EnSelectAW : 1'b0, + EnCollectB : 1'b0 + }; + floo_req_chan_t [NumInputs-1:0] req_in; floo_rsp_chan_t [NumInputs-1:0] rsp_out; floo_req_chan_t [NumOutputs-1:0] 
req_out; @@ -148,26 +240,31 @@ module floo_nw_router #( end floo_router #( - .NumInput ( NumInputs ), - .NumOutput ( NumOutputs ), - .NumPhysChannels ( 1 ), - .NumVirtChannels ( 1 ), - .InFifoDepth ( InFifoDepth ), - .OutFifoDepth ( OutFifoDepth ), - .RouteAlgo ( RouteAlgo ), - .XYRouteOpt ( XYRouteOpt ), - .NumAddrRules ( NumAddrRules ), - .NoLoopback ( 1'b1 ), - .EnMultiCast ( EnMultiCast ), - .EnReduction ( 1'b0 ), - .id_t ( id_t ), - .addr_rule_t ( addr_rule_t ), - .flit_t ( floo_req_generic_flit_t ) + .NumInput ( NumInputs ), + .NumOutput ( NumOutputs ), + .NumPhysChannels ( 1 ), + .NumVirtChannels ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .RouteAlgo ( RouteAlgo ), + .XYRouteOpt ( XYRouteOpt ), + .NumAddrRules ( NumAddrRules ), + .NoLoopback ( NoLoopback ), + .id_t ( id_t ), + .addr_rule_t ( addr_rule_t ), + .flit_t ( floo_req_generic_flit_t ), + .hdr_t ( hdr_t ), + .RdOperation_t ( RdNarrowOperation_t ), + .RdData_t ( RdNarrowData_t ), + .CollectiveCfg ( CollectiveReqCfg ), + .RedCfg ( RdNarrowCfg ), + .AxiCfgOffload ( AxiCfgN ), + .AxiCfgParallel ( AxiCfgN ) ) i_req_floo_router ( .clk_i, .rst_ni, .test_enable_i, - .xy_id_i ( id_i ), + .xy_id_i ( id_i ), .id_route_map_i, .valid_i ( req_valid_in ), .ready_o ( req_ready_out ), @@ -176,51 +273,41 @@ module floo_nw_router #( .valid_o ( req_valid_out ), .ready_i ( req_ready_in ), .data_o ( req_out ), - .credit_o ( req_credit_out) /* unused */ + .credit_o ( req_credit_out), /* unused */ + .offload_req_op_o ( offload_narrow_req_op_o ), + .offload_req_operand1_o ( offload_narrow_req_operand1_o ), + .offload_req_operand2_o ( offload_narrow_req_operand2_o ), + .offload_req_valid_o ( offload_narrow_req_valid_o ), + .offload_req_ready_i ( offload_narrow_req_ready_i ), + .offload_resp_result_i ( offload_narrow_resp_result_i ), + .offload_resp_valid_i ( offload_narrow_resp_valid_i ), + .offload_resp_ready_o ( offload_narrow_resp_ready_o ) ); - // We construct the masks for the narrow and 
wide B responses here. - // Every bit of the payload is set to 0, except for the bits that - // correspond to the resp field. - localparam axi_narrow_b_chan_t NarrowBMask = '{resp: 2'b11, default: '0}; - localparam floo_axi_narrow_b_flit_t NarrowBFlitMask = '{ - payload: NarrowBMask, - hdr: '0, - rsvd: '0 - }; - localparam axi_narrow_b_chan_t WideBMask = '{resp: 2'b11, default: '0}; - localparam floo_axi_wide_b_flit_t WideBFlitMask = '{ - payload: WideBMask, - hdr: '0, - rsvd: '0 - }; - - // Enable reduction for the B response. - // Disable multicast for the B response. floo_router #( - .NumInput ( NumInputs ), - .NumOutput ( NumOutputs ), - .NumPhysChannels ( 1 ), - .NumVirtChannels ( 1 ), - .InFifoDepth ( InFifoDepth ), - .OutFifoDepth ( OutFifoDepth ), - .RouteAlgo ( RouteAlgo ), - .XYRouteOpt ( XYRouteOpt ), - .NumAddrRules ( NumAddrRules ), - .NoLoopback ( 1'b1 ), - .EnMultiCast ( 1'b0 ), - .EnReduction ( EnMultiCast ), - .id_t ( id_t ), - .addr_rule_t ( addr_rule_t ), - .flit_t ( floo_rsp_generic_flit_t ), - .payload_t ( floo_rsp_payload_t ), - .NarrowRspMask ( floo_rsp_generic_flit_t'(NarrowBFlitMask.payload) ), - .WideRspMask ( floo_rsp_generic_flit_t'(WideBFlitMask.payload) ) + .NumInput ( NumInputs ), + .NumOutput ( NumOutputs ), + .NumPhysChannels ( 1 ), + .NumVirtChannels ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .RouteAlgo ( RouteAlgo ), + .XYRouteOpt ( XYRouteOpt ), + .NumAddrRules ( NumAddrRules ), + .NoLoopback ( NoLoopback ), + .id_t ( id_t ), + .addr_rule_t ( addr_rule_t ), + .flit_t ( floo_rsp_generic_flit_t ), + .hdr_t ( hdr_t ), + .CollectiveCfg ( CollectiveRspCfg ), + .RedCfg ( RdRespCfg ), + .AxiCfgOffload ( '0 ), + .AxiCfgParallel ( AxiCfgN ) ) i_rsp_floo_router ( .clk_i, .rst_ni, .test_enable_i, - .xy_id_i ( id_i ), + .xy_id_i ( id_i ), .id_route_map_i, .valid_i ( rsp_valid_in ), .ready_o ( rsp_ready_out ), @@ -229,31 +316,44 @@ module floo_nw_router #( .valid_o ( rsp_valid_out ), .ready_i ( rsp_ready_in ), 
.data_o ( rsp_out ), - .credit_o ( rsp_credit_out) /* unused */ + .credit_o ( rsp_credit_out), /* unused */ + .offload_req_op_o ( ), + .offload_req_operand1_o ( ), + .offload_req_operand2_o ( ), + .offload_req_valid_o ( ), + .offload_req_ready_i ( '0 ), + .offload_resp_result_i ( '0 ), + .offload_resp_valid_i ( '0 ), + .offload_resp_ready_o ( ) ); floo_router #( - .NumRoutes ( NumRoutes ), - .NumPhysChannels ( NumWidePhysChannels ), - .NumVirtChannels ( NumWideVirtChannels ), - .InFifoDepth ( InFifoDepth ), - .OutFifoDepth ( OutFifoDepth ), - .RouteAlgo ( RouteAlgo ), - .XYRouteOpt ( XYRouteOpt ), - .NumAddrRules ( NumAddrRules ), - .NoLoopback ( 1'b1 ), - .VcImpl ( VcImpl ), - .EnMultiCast ( EnMultiCast ), - .EnReduction ( 1'b0 ), - .id_t ( id_t ), - .addr_rule_t ( addr_rule_t ), - .flit_t ( floo_wide_generic_flit_t ) + .NumRoutes ( NumRoutes ), + .NumPhysChannels ( NumWidePhysChannels ), + .NumVirtChannels ( NumWideVirtChannels ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .RouteAlgo ( RouteAlgo ), + .XYRouteOpt ( XYRouteOpt ), + .NumAddrRules ( NumAddrRules ), + .NoLoopback ( NoLoopback ), + .VcImpl ( VcImpl ), + .id_t ( id_t ), + .addr_rule_t ( addr_rule_t ), + .flit_t ( floo_wide_generic_flit_t ), + .hdr_t ( hdr_t ), + .RdOperation_t ( RdWideOperation_t ), + .RdData_t ( RdWideData_t ), + .CollectiveCfg ( CollectiveWideCfg ), + .RedCfg ( RdWideCfg ), + .AxiCfgOffload ( AxiCfgW ), + .AxiCfgParallel ( '0 ) ) i_wide_req_floo_router ( .clk_i, .rst_ni, .test_enable_i, - .xy_id_i ( id_i ), + .xy_id_i ( id_i ), .id_route_map_i, .valid_i ( wide_valid_in ), .ready_o ( wide_ready_out ), @@ -262,7 +362,15 @@ module floo_nw_router #( .valid_o ( wide_valid_out ), .ready_i ( wide_ready_in ), .data_o ( wide_out ), - .credit_o ( wide_credit_out ) + .credit_o ( wide_credit_out ), + .offload_req_op_o ( offload_wide_req_op_o ), + .offload_req_operand1_o ( offload_wide_req_operand1_o ), + .offload_req_operand2_o ( offload_wide_req_operand2_o ), + 
.offload_req_valid_o ( offload_wide_req_valid_o ), + .offload_req_ready_i ( offload_wide_req_ready_i ), + .offload_resp_result_i ( offload_wide_resp_result_i ), + .offload_resp_valid_i ( offload_wide_resp_valid_i ), + .offload_resp_ready_o ( offload_wide_resp_ready_o ) ); endmodule diff --git a/hw/floo_offload_reduction.sv b/hw/floo_offload_reduction.sv new file mode 100644 index 00000000..bbbcc7b7 --- /dev/null +++ b/hw/floo_offload_reduction.sv @@ -0,0 +1,545 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Raphael Roth + +// This module describes the datapath of the offload reduction. The controller is part of the +// "floo_offload_reduction_buffer.sv" module and describes there. +// The selected operation we want to support for now are defined under floo_pkg::reduction_op_t! +// +// The main design goal was to allow a fully pipelined operation e.g. if out inputs provide each +// cycle a new elements to reduce then the underlying reduction utilization should be 100% during +// the reduction. However as this required more tracking effort we support other modes too. +// When we reduce elements from at least three different inputs we are required to have a +// partial result buffer for intermediate results. +// +// Overall three modes are supported: Generic, Stalling, Simple, see the controller for more +// documentation. +// +// For the generic system we require a tag based system to allow tracking of each elements as we +// elements from different reduction iteration in-flight. All elements which hold the same tag +// need to be reduced together. This allows to separate the tag generation and the reduction logic. +// +// Additionally ever element gets an mask which indicates which elements are already reduced in +// the reduction_data as reduction with more than two inputs require two (or more) iterations. 
+// The mask allows for an easy comparison for the final result e.g. if it is equal to the input +// mask then all required inputs are reduced together. +// +// For the full documentation see the Masterthesis of Raphael Roth + +// Restriction: +// - Currently we only support reduction of elements belonging to the same reduction stream. +// In AXI term: different beats belonging to the same burst are okay but not two different +// bursts. +// - The max number of input is currently fixed to 6. This can be extended but the size of +// the tag_t depends on it. +// - We only support symmetric configurations e.g. NumInput and NumOutput needs to be equal + +// Open Points: +// - The status of the reduction logic (e.g. FPU) is currently not evaluated by the contoller! + +`include "common_cells/assertions.svh" + +module floo_offload_reduction import floo_pkg::*; #( + /// Number of Routes (Currently support only symmetric configurations) + parameter int unsigned NumRoutes = 1, + /// Various types used by floonoc / routing + parameter type flit_t = logic, + parameter type hdr_t = logic, + parameter type id_t = logic, + /// Data payload size to extract from the floo flit + parameter type RdData_t = logic, + /// Possible reduction operation(s) + parameter type RdOperation_t = logic, + /// Parameter for the reduction configuration + parameter reduction_cfg_t RdCfg = '0, + /// Axi Configuration + parameter axi_cfg_t AxiCfg = '0 +) ( + /// Control Inputs + input logic clk_i, + input logic rst_ni, + input logic flush_i, + input id_t node_id_i, + /// Ports towards the input routes + input logic [NumRoutes-1:0] valid_i, + output logic [NumRoutes-1:0] ready_o, + input flit_t [NumRoutes-1:0] data_i, + input logic [NumRoutes-1:0][NumRoutes-1:0] output_route_i, + input logic [NumRoutes-1:0][NumRoutes-1:0] expected_input_i, + /// Ports towards the output routes + output logic [NumRoutes-1:0] valid_o, + input logic [NumRoutes-1:0] ready_i, + output flit_t [NumRoutes-1:0] data_o, + /// IF 
towards external reduction + output RdOperation_t reduction_req_type_o, + output RdData_t reduction_req_op1_o, + output RdData_t reduction_req_op2_o, + output logic reduction_req_valid_o, + input logic reduction_req_ready_i, + /// IF from external reduction + input RdData_t reduction_resp_data_i, + input logic reduction_resp_valid_i, + output logic reduction_resp_ready_o +); + +/* All local parameter */ + +// Set the complexity of the Controller +localparam bit GENERIC = (RdCfg.RdControllConf == ControllerGeneric); +localparam bit SIMPLE = (RdCfg.RdControllConf == ControllerSimple); +localparam bit STALLING = (RdCfg.RdControllConf == ControllerStalling); + +/* All Typedef Vars */ +// Index Variable to control the crossbar and the partial buffer +typedef logic [cf_math_pkg::idx_width(RdCfg.RdPartialBufferSize)-1:0] part_res_idx_t; + +// Generate the types for the mask, the tag and the red_data +typedef logic [RdCfg.RdTagBits-1:0] tag_t; +typedef logic [NumRoutes-1:0] mask_t; + +// dfferent combination between flit / data / tag / mask for the main data path +typedef struct packed { + flit_t flit; + mask_t input_exp; + mask_t output_dir; + tag_t tag; +} flit_in_out_dir_tag_t; + +typedef struct packed { + flit_t flit; + mask_t mask; + tag_t tag; +} flit_mask_tag_t; + +typedef struct packed { + RdData_t data; + mask_t mask; + tag_t tag; +} red_data_mask_tag_t; + +typedef struct packed { + RdData_t data; + mask_t mask; +} red_data_mask_t; + +typedef struct packed { + RdData_t data; + tag_t tag; +} red_data_tag_t; + +// TODO (lleone): Remove and create teh proper REQ/RSP struct for the offload interface +typedef struct packed { + RdOperation_t op; + RdData_t operand1; + RdData_t operand2; +} red_intsr_t; + +/* Variable declaration */ + +// Variable for the tag generation +tag_t [NumRoutes-1:0] fifo_tag; + +// Output signals from the input FIFO's +flit_in_out_dir_tag_t [NumRoutes-1:0] fifo_out_data; +logic [NumRoutes-1:0] fifo_out_valid; +logic [NumRoutes-1:0] 
fifo_out_ready; + +// Input signals which are already mapped to the corresponding operand +red_data_mask_tag_t [1:0] input_mapped_operands_data; +logic [1:0] input_mapped_operands_valid; +logic [1:0] input_mapped_operands_ready; + +// Signal from the partial result buffer +red_data_mask_tag_t [1:0] partial_result_buffer_data; +logic [1:0] partial_result_buffer_valid; +logic [1:0] partial_result_buffer_ready; + +// Signal after the merge between the partial result buffer and the inputs +red_data_mask_tag_t [1:0] merged_data; +logic [1:0] merged_valid; +logic [1:0] merged_ready; + +// Signal after joining the handsake +logic join_operands_valid; +logic join_operands_ready; + +// Var to determint the executed operation +RdOperation_t reduction_scheduled_operation; + +// Signal for the FPU response +red_data_mask_tag_t reduction_resp_data; + +// Singal to the partial result buffer +red_data_mask_tag_t input_partial_result_buf_data; +logic input_partial_result_buf_valid; +logic input_partial_result_buf_ready; + +// Signal for the fully reduced result +red_data_tag_t fully_reduced_data; +logic fully_reduced_valid; +logic fully_reduced_ready; + +// Signal toward the output of the reduction logic +flit_mask_tag_t final_flit; +logic final_valid; +logic final_ready; + +// Control Signal to either merge the partial buffer or the inputs +logic [1:0] ctrl_sel_part_res; + +// Control Signal for the output demultiplexer +logic ctrl_demux; + +// Selector for the partial result buffer +part_res_idx_t [1:0] ctrl_sel_buffer_idx; + +// Spyglass signals from the partial result buffer +tag_t [RdCfg.RdPartialBufferSize-1:0] spyglass_tag; +logic [RdCfg.RdPartialBufferSize-1:0] spyglass_valid; + +// Output instruction for the functional unit +red_intsr_t reduction_instr_out, reduction_intsr_out_cut; +logic reduction_req_valid_out, reduction_req_ready_in; +RdData_t reduction_resp_data_in; +logic reduction_resp_valid_in, reduction_resp_ready_out; + +// The tag is only required for the 
Generic configuration +// For each incoming element generate the corresponding tag. +if(GENERIC == 1'b1) begin : gen_tag_generation + floo_offload_reduction_taggen #( + .NumRoutes (NumRoutes), + .TAG_T (tag_t), + .RdTagBits (RdCfg.RdTagBits) + ) i_gen_tag ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .mask_i (expected_input_i), + .valid_i (valid_i), + .ready_i (ready_o), + .tag_o (fifo_tag) + ); +end else begin : gen_bypass_tag_generation + assign fifo_tag = '0; +end + +// Fifo's for all inputs to ack the incoming data +// and to reduce unnecessary backpressure into the system. +if(RdCfg.RdFifoDepth > 0) begin : gen_input_fifo + for (genvar i = 0; i < NumRoutes; i++) begin : gen_routes + stream_fifo #( + .FALL_THROUGH (RdCfg.RdFifoFallThrough), + .DEPTH (RdCfg.RdFifoDepth), + .T (flit_in_out_dir_tag_t) + ) i_in_fifo_generic ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .testmode_i (1'b0), + .usage_o (), + .data_i ({data_i[i], expected_input_i[i], output_route_i[i], fifo_tag[i]}), + .valid_i (valid_i[i]), + .ready_o (ready_o[i]), + .data_o (fifo_out_data[i]), + .valid_o (fifo_out_valid[i]), + .ready_i (fifo_out_ready[i]) + ); + end +end else begin : gen_no_input_fifo + for (genvar i = 0; i < NumRoutes; i++) begin : gen_routes + assign fifo_out_data[i] = {data_i[i], expected_input_i[i], output_route_i[i], fifo_tag[i]}; + assign fifo_out_valid[i] = valid_i[i]; + assign ready_o[i] = fifo_out_ready[i]; + end +end + +// Controller which runs the hole reduction +floo_offload_reduction_controller #( + .NumRoutes (NumRoutes), + .RdPartialBufferSize (RdCfg.RdPartialBufferSize), + .RdPipelineDepth (RdCfg.RdPipelineDepth), + .RdData_t (RdData_t), + .RdOperation_t (RdOperation_t), + .tag_t (tag_t), + .mask_t (mask_t), + .flit_t (flit_t), + .hdr_t (hdr_t), + .data_tag_t (red_data_tag_t), + .data_mask_tag_t (red_data_mask_tag_t), + .flit_mask_tag_t (flit_mask_tag_t), + .flit_in_out_dir_tag_t (flit_in_out_dir_tag_t), + .idx_part_res_t 
(part_res_idx_t), + .GENERIC (GENERIC), + .SIMPLE (SIMPLE), + .STALLING (STALLING), + .RdSupportAxi (RdCfg.RdSupportAxi), + .AxiCfg (AxiCfg), + .RdEnableBypass (RdCfg.RdEnableBypass) +) i_reduction_controller ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .head_fifo_flit_i (fifo_out_data), + .head_fifo_valid_i (fifo_out_valid), + .head_fifo_ready_o (fifo_out_ready), + .operand_data_o (input_mapped_operands_data), + .operand_valid_o (input_mapped_operands_valid), + .operand_ready_i (input_mapped_operands_ready), + .reduction_req_operation_o (reduction_scheduled_operation), + .reduction_req_mask_i (merged_data[0].mask | merged_data[1].mask), + .reduction_req_valid_i (reduction_req_valid_out), + .reduction_req_ready_i (reduction_req_ready_in), + .fully_red_data_i (fully_reduced_data), + .fully_red_valid_i (fully_reduced_valid), + .fully_red_ready_o (fully_reduced_ready), + .reduction_resp_mask_i (reduction_resp_data.mask), + .reduction_resp_tag_i (reduction_resp_data.tag), + .reduction_resp_valid_i (reduction_resp_valid_in), + .reduction_resp_ready_i (reduction_resp_ready_out), + .final_flit_o (final_flit), + .final_valid_o (final_valid), + .final_ready_i (final_ready), + .buf_spyglass_tag_i (spyglass_tag), + .buf_spyglass_valid_i (spyglass_valid), + .select_partial_result_idx_o (ctrl_sel_buffer_idx), + .ctrl_part_res_mux_o (ctrl_sel_part_res), + .ctrl_output_demux_o (ctrl_demux) +); + +// Generate the MUX to include the partial buffer +if((GENERIC == 1'b1) || (STALLING == 1'b1)) begin : gen_mux_partial_result + for (genvar i = 0; i < 2; i++) begin : gen_mux_partial_result_loop + stream_mux #( + .DATA_T (red_data_mask_tag_t), + .N_INP (2) + ) i_merge_part_res_and_input ( + .inp_data_i ({partial_result_buffer_data[i], input_mapped_operands_data[i]}), + .inp_valid_i ({partial_result_buffer_valid[i], input_mapped_operands_valid[i]}), + .inp_ready_o ({partial_result_buffer_ready[i], input_mapped_operands_ready[i]}), + .inp_sel_i (ctrl_sel_part_res[i]), + 
.oup_data_o (merged_data[i]), + .oup_valid_o (merged_valid[i]), + .oup_ready_i (merged_ready[i]) + ); + end +end else begin : gen_bypass_mux_partial_result + assign merged_data = input_mapped_operands_data; + assign merged_valid = input_mapped_operands_valid; + assign input_mapped_operands_ready = merged_ready; + assign partial_result_buffer_ready = '0; +end + +// Join the Handshake for the operands control paths +stream_join #( + .N_INP (2) +) i_join_controlpath_operands ( + .inp_valid_i (merged_valid), + .inp_ready_o (merged_ready), + .oup_valid_o (join_operands_valid), + .oup_ready_i (join_operands_ready) +); + +// Connect the HS for the output request to the FPU +assign reduction_req_valid_out = join_operands_valid; +assign join_operands_ready = reduction_req_ready_in; + +// Output the operands here +assign reduction_instr_out.operand1 = merged_data[0].data; +assign reduction_instr_out.operand2 = merged_data[1].data; +assign reduction_instr_out.op = reduction_scheduled_operation; + +// Note: At this position in the dataflow of this file lies the external reduction hardware +// After some (5) cycles the request comes back as response. +// The external Reduction requires at least 1 cycle to avoid hw-loops! + +// We buffer the tag internally rather than pass it to the outside +// TODO: Add assertion that if "reduction_req_valid_o" is set that both tags are equal!
+if(GENERIC == 1'b1) begin : gen_fifo_for_tag + fifo_v3 #( + .FALL_THROUGH (1'b0), + .dtype (tag_t), + .DEPTH (RdCfg.RdPipelineDepth+2) + ) i_fifo_mask_parallel_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .testmode_i (1'b0), + .full_o (), + .empty_o (), + .usage_o (), + .data_i (merged_data[0].tag), + .push_i (reduction_req_ready_in & reduction_req_valid_out), // active fpu req hs + .data_o (reduction_resp_data.tag), + .pop_i (reduction_resp_valid_in & reduction_resp_ready_out) // active fpu resp hs + ); +end else begin + assign reduction_resp_data.tag = '0; +end + +// We buffer the mask internally rather than pass it to the outside +// The mask is or-connected because the results are "added" +// The mask field determins which input element is already +// reduced in the given element. +// from partial buffer: more than 1 bit set +// from input: only 1 bit set +// TODO: Add assertion that if "reduction_req_valid_o" is set that no bit position is set +// in both mask as this would mean we have already added the element once! 
+if((GENERIC == 1'b1) || (STALLING == 1'b1)) begin : gen_fifo_for_mask + fifo_v3 #( + .FALL_THROUGH (1'b0), + .dtype (mask_t), + .DEPTH (RdCfg.RdPipelineDepth+2) + ) i_fifo_mask_parallel_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .testmode_i (1'b0), + .full_o (), + .empty_o (), + .usage_o (), + .data_i (merged_data[0].mask | merged_data[1].mask), + .push_i (reduction_req_ready_in & reduction_req_valid_out), // active fpu req hs + .data_o (reduction_resp_data.mask), + .pop_i (reduction_resp_valid_in & reduction_resp_ready_out) // active fpu resp hs + ); +end else begin + assign reduction_resp_data.mask = '0; +end + +// Merge the response from the reduction with the internal tag / mask storage +assign reduction_resp_data.data = reduction_resp_data_in; + +// Demux the output of the fpu +// TODO (lleone): Change if !SIMPLE +if((GENERIC == 1'b1) || (STALLING == 1'b1)) begin : gen_demux_partial_result + stream_demux #( + .N_OUP (2) + ) i_stream_demux_output_fpu ( + .inp_valid_i (reduction_resp_valid_in), + .inp_ready_o (reduction_resp_ready_out), + .oup_sel_i (ctrl_demux), + .oup_valid_o ({fully_reduced_valid, input_partial_result_buf_valid}), + .oup_ready_i ({fully_reduced_ready, input_partial_result_buf_ready}) + ); +end else begin + assign fully_reduced_valid = reduction_resp_valid_in; + assign reduction_resp_ready_out = fully_reduced_ready; + assign input_partial_result_buf_valid = 1'b0; +end + +// Assign the data belonging to the mux +assign input_partial_result_buf_data = reduction_resp_data; +assign fully_reduced_data.data = reduction_resp_data.data; +assign fully_reduced_data.tag = reduction_resp_data.tag; + +// Dynamically fork the data into the correct output direction +// (Currently only 1 output direction is set by the dyn fork. +// Potentially we could support here a reduce and multicast operation +// if more than one output is set.
+ +// TODO: By introducing a cut on these signal used by the multiplexer we could separate +// the reduction logic from the rest of the router (timing wise at least). +// Additionally the input fifo's would need to be configured as not fall through! +stream_fork_dynamic #( + .N_OUP (NumRoutes) +) i_dynamic_fork ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (final_valid), + .ready_o (final_ready), + .sel_i (final_flit.mask), + .sel_valid_i (final_valid), + .sel_ready_o (), + .valid_o (valid_o), + .ready_i (ready_i) +); + +// Dublicate the output data for all output IF +always_comb begin : gen_dublicate_output_data + data_o = '0; + for(int i = 0; i < NumRoutes;i++) begin + data_o[i] = final_flit.flit; + end +end + +// Generate the partial result buffer only if we are in the GENERIC or the STALLING case +if((GENERIC == 1'b1) || (STALLING == 1'b1)) begin : gen_partial_result_buffer + floo_offload_reduction_buffer #( + .data_mask_tag_t (red_data_mask_tag_t), + .tag_t (tag_t), + .NElements (RdCfg.RdPartialBufferSize), + .NOutPorts (2) + ) i_buf_part_result ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .inp_data_i (input_partial_result_buf_data), + .inp_valid_i (input_partial_result_buf_valid), + .inp_ready_o (input_partial_result_buf_ready), + .oup_data_o (partial_result_buffer_data), + .oup_valid_o (partial_result_buffer_valid), + .oup_ready_i (partial_result_buffer_ready), + .inp_sel_valid_i (ctrl_sel_part_res), + .inp_sel_i (ctrl_sel_buffer_idx), + .spyglass_valid_o (spyglass_valid), + .spyglass_tag_o (spyglass_tag) + ); +end else begin + assign input_partial_result_buf_ready = 1'b0; + assign partial_result_buffer_data = '0; + assign partial_result_buffer_valid = '0; + assign spyglass_valid = '0; + assign spyglass_tag = '0; +end + +// TODO (lleone): Create a REQ/RSP struct for teh following interface +// and replace all the spill regsiters with just one for REQ and one for RSP + spill_register #( + .T (red_intsr_t), + .Bypass 
(!RdCfg.CutOffloadIntf) + ) i_offload_cut_req ( + .clk_i, + .rst_ni, + .data_i (reduction_instr_out), + .valid_i (reduction_req_valid_out), + .ready_o (reduction_req_ready_in), + .data_o (reduction_intsr_out_cut), + .valid_o (reduction_req_valid_o), + .ready_i (reduction_req_ready_i) + ); + + spill_register #( + .T (RdData_t), + .Bypass (!RdCfg.CutOffloadIntf) + ) i_offload_cut_rsp ( + .clk_i, + .rst_ni, + .data_i (reduction_resp_data_i), + .valid_i (reduction_resp_valid_i), + .ready_o (reduction_resp_ready_o), + .data_o (reduction_resp_data_in), + .valid_o (reduction_resp_valid_in), + .ready_i (reduction_resp_ready_out) + ); + +// TODO(lleone): When uniforming the offload interface, get rid of this part, since the cut will be of the type of the interface +assign reduction_req_type_o = reduction_intsr_out_cut.op; +assign reduction_req_op1_o = reduction_intsr_out_cut.operand1; +assign reduction_req_op2_o = reduction_intsr_out_cut.operand2; + +/* ASSERTION Checks */ +// The fp reduction supports up to 6 operands +`ASSERT_INIT(Number_Input_Route_Invalid, !(NumRoutes > 6)) +// Currently we only support reduction extension with a pipeline depth of at least 1 cycle as otherwise loops could be generated!
+`ASSERT_INIT(ReductionPipelineDepth, !(RdCfg.RdPipelineDepth == 0)) +// The size needs to be at least 2 for the partial buffer for the generic / stalling processor +`ASSERT_INIT(PartialBufferSize, !((GENERIC | STALLING) && (RdCfg.RdPartialBufferSize < 2))) +// We can only run GENERIC or SIMPLE or STALLING +`ASSERT_INIT(Invalid_Configuration_1, !(GENERIC & SIMPLE)) +`ASSERT_INIT(Invalid_Configuration_2, !(STALLING & SIMPLE)) +`ASSERT_INIT(Invalid_Configuration_3, !(GENERIC & STALLING)) +`ASSERT_INIT(Invalid_Configuration_4, (GENERIC | STALLING | SIMPLE)) + +endmodule diff --git a/hw/floo_offload_reduction_buffer.sv b/hw/floo_offload_reduction_buffer.sv new file mode 100644 index 00000000..f68c7b82 --- /dev/null +++ b/hw/floo_offload_reduction_buffer.sv @@ -0,0 +1,128 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Raphael Roth + +// This module creates a buffer in which all elements can be accessed from the outside. +// The module is implemented as a 0-cycle buffer where the current output is selected +// by an index input. This index is guarded with a valid signal so that the data only +// change if a valid is applied. + +`include "common_cells/registers.svh" +`include "common_cells/assertions.svh" + +module floo_offload_reduction_buffer #( + /// Parameter type for reduction + parameter type data_mask_tag_t = logic, + /// Tag type for the spyglass + parameter type tag_t = logic, + /// Number of elements for the partial result buffer + parameter integer NElements = 0, + /// Number of outputs for the buffer + parameter integer NOutPorts = 1, + /// Dependent parameters, DO NOT OVERRIDE! + /// TODO(lleone): There is a function for this + parameter integer LogNElements = (NElements > 32'd1) ?
unsigned'($clog2(NElements)) : 1'b1 +) ( + /// Control Inputs + input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// All Input Connections + input data_mask_tag_t inp_data_i, + input logic inp_valid_i, + output logic inp_ready_o, + /// All Output Connections + output data_mask_tag_t [NOutPorts-1:0] oup_data_o, + output logic [NOutPorts-1:0] oup_valid_o, + input logic [NOutPorts-1:0] oup_ready_i, + /// Selections + input logic [NOutPorts-1:0] inp_sel_valid_i, + input logic [NOutPorts-1:0][LogNElements-1:0] inp_sel_i, + /// Spyglass to all entries of the Buffer + output logic [NElements-1:0] spyglass_valid_o, + output tag_t [NElements-1:0] spyglass_tag_o +); + +/* All Typedef Vars */ + +// a line of the buffer +typedef struct packed { + data_mask_tag_t data; + logic f_valid; +} buff_entry_t; + +/* Variable declaration */ +buff_entry_t [NElements-1:0] buffer_d, buffer_q; // Buffer to hold the data +logic empty_field_found; +logic [NElements-1:0] ready_sig_buf; + +/* Description */ + +always_comb begin : partial_result_buffer + buffer_d = buffer_q; // Init the buffer with the old data + + // All Output signal + inp_ready_o = 1'b0; + oup_data_o = '0; + oup_valid_o = '0; + spyglass_tag_o = '0; + spyglass_valid_o = '0; + + // Intermidiate signal + empty_field_found = 1'b0; + ready_sig_buf = '0; + + // Store the Data into the Buffer + for(int i = 0; i < NElements; i++) begin + if((buffer_d[i].f_valid == 1'b0) && (empty_field_found == 1'b0) && (inp_valid_i == 1'b1)) begin + // Lock the Entry + empty_field_found = 1'b1; + + // Ack the handshake + inp_ready_o = 1'b1; + + // Copy the actual data + buffer_d[i].data = inp_data_i; + buffer_d[i].f_valid = 1'b1; + end + end + + // Implement the Spyglass here! 
+ for(int i = 0; i < NElements; i++) begin + spyglass_tag_o[i] = buffer_d[i].data.tag; + spyglass_valid_o[i] = buffer_d[i].f_valid; + end + + // Assign the output for each defined port + for(int i = 0; i < NOutPorts;i++) begin + if(inp_sel_valid_i[i] == 1'b1) begin + oup_data_o[i] = buffer_d[inp_sel_i[i]].data; + oup_valid_o[i] = buffer_d[inp_sel_i[i]].f_valid; + ready_sig_buf[inp_sel_i[i]] = oup_ready_i[i]; + end + end + + // If we receive any valid handshake on any IF then reset the valid flag + for(int i = 0; i < NElements; i++) begin + if((ready_sig_buf[i] == 1'b1) && (buffer_d[i].f_valid == 1'b1)) begin + buffer_d[i].f_valid = 1'b0; + end + end + + // Reset Buffer if flush is asserted & avoid piping data to the output + if(flush_i == 1'b1) begin + buffer_d = '0; + oup_valid_o = '0; + end +end + +// Store the Buffer +`FF(buffer_q, buffer_d, '0, clk_i, rst_ni) + +// We require at least a size of 2 for the partial result buffer (otherwise deadlock potential) +`ASSERT_INIT(WrongNumberOfElements, !(NElements < 2)) + + +endmodule diff --git a/hw/floo_offload_reduction_controller.sv b/hw/floo_offload_reduction_controller.sv new file mode 100644 index 00000000..480b53ad --- /dev/null +++ b/hw/floo_offload_reduction_controller.sv @@ -0,0 +1,924 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Raphael Roth + +// This module controls the whole reduction by setting all Mux / DeMux / Buff Select into its +// appropriate position. To control the reduction(s) we have one buffer which stores all the +// required metadata. +// +// We have three different configurations: Generic, Stalling & Simple +// +// Generic: Most sufficicated option! +// Allows to have multiple reduction ongoing with the downside of requiring a TAG +// for each element. This mode can achieve a 100% FPU utilization (5 Stages) given +// enough data to reduce. 
+// To achieve 100% FPU utilization in all cases the partial
+// buffer size should be as deep as the FPU pipeline.
+//
+// Stalling: Good balance between area overhead & performance
+// Allows one ongoing reduction if more than two flits are involved.
+// It stalls all incoming elements until the one
+// it is currently working on is finished therefore no tag requirements.
+// Can not achieve a 100% FPU utilization if more than two inputs are involved in
+// the reduction. The partial buffer can be reduced to its minimum size of 2.
+//
+// Simple: least hw overhead
+// All incoming reductions can consist of only two elements therefore no partial buffer
+// is required. However the user writing the software has to guarantee that only
+// two elements are used in each reduction. This requires that the sw author is aware of
+// the physical implementation of the NoC. No tag or mask is required as we can start
+// the reduction as soon as we have two elements on the input.
+//
+// To guarantee the ordering in the generic implementation this module implements a priority scheme
+// e.g. the first buffer entry has the most priority, then the second, then the third etc.
+// It works only on elements from lower priority if the higher ones can not schedule
+// any operation.
+//
+// This module must be aware of the underlying communication protocol embedded in the FlooNoc.
+// Any header like information (e.g. AXI AW transmission) needs to be forwarded just once
+// without reducing anything. This is implemented by a bypass option.
+
+// Limits:
+// - We can not handle out-of-order
+// - We can not handle multiple reductions when they do not belong to the same subset of addresses
+// (Due to the tag generation)
+
+// Open Points:
+// - AW flits stall the whole process > We could potentially send the AW as soon as we receive
+// the first AW packet and then delete the others rather than waiting until all AW flits are
+// available.
+
+// Disclaimer:
+// Sorry for the mess in the code ;) I had to add too much configuration option(s)!
+
+`include "common_cells/registers.svh"
+`include "common_cells/assertions.svh"
+`include "axi/typedef.svh"
+`include "floo_noc/typedef.svh"
+
+module floo_offload_reduction_controller #(
+  /// Number of Routes
+  parameter int unsigned NumRoutes = 1,
+  /// Partial buffer size for partial results
+  /// used in Generic / Stalling configuration
+  parameter int unsigned RdPartialBufferSize = 3,
+  /// Pipeline depth of the external reduction logic
+  parameter int unsigned RdPipelineDepth = 3,
+  /// Data payload size to extract from the floo flit
+  parameter type RdData_t = logic,
+  /// Possible reduction operation(s)
+  parameter type RdOperation_t = logic,
+  /// Various types used by floonoc / routing
+  parameter type tag_t = logic,
+  parameter type mask_t = logic,
+  parameter type flit_t = logic,
+  parameter type hdr_t = logic,
+  parameter type data_tag_t = logic,
+  parameter type data_mask_tag_t = logic,
+  parameter type flit_mask_tag_t = logic,
+  parameter type flit_in_out_dir_tag_t = logic,
+  /// Type to address the entries of the partial result buffer
+  parameter type idx_part_res_t = logic,
+  /// Controller configuration
+  parameter bit GENERIC = 1'b1,
+  parameter bit SIMPLE = 1'b0,
+  parameter bit STALLING = 1'b0,
+  /// Defines if the underlying protocol is AXI
+  parameter bit RdSupportAxi = 1'b1,
+  /// Axi Configuration
+  parameter floo_pkg::axi_cfg_t AxiCfg = '0,
+  /// Define if we support a bypass or not (for AXI AW header)
+  parameter bit RdEnableBypass = 1'b1
+) (
+  /// Control Signals
+  input logic clk_i,
+  input logic rst_ni,
+  input logic flush_i,
+  /// Input from the fifos
+  input flit_in_out_dir_tag_t [NumRoutes-1:0] head_fifo_flit_i,
+  input logic [NumRoutes-1:0] head_fifo_valid_i,
+  output logic [NumRoutes-1:0] head_fifo_ready_o,
+  /// Output Operands (Support only 2 operands)
+  output data_mask_tag_t [1:0] operand_data_o,
+  output logic [1:0] operand_valid_o,
+  input logic [1:0] operand_ready_i,
+  /// Metadata reduction req
+  output RdOperation_t reduction_req_operation_o,
+  input mask_t reduction_req_mask_i,
+  input logic reduction_req_valid_i,
+  input logic reduction_req_ready_i,
+  /// Final Input from the reduction offload (fully reduced)
+  input data_tag_t fully_red_data_i,
+  input logic fully_red_valid_i,
+  output logic fully_red_ready_o,
+  /// Metadata reduction resp
+  input mask_t reduction_resp_mask_i,
+  input tag_t reduction_resp_tag_i,
+  input logic reduction_resp_valid_i,
+  input logic reduction_resp_ready_i,
+  /// Flit provided to the output of the reduction logic
+  output flit_mask_tag_t final_flit_o,
+  output logic final_valid_o,
+  input logic final_ready_i,
+  /// Spyglass from the partial result buffer
+  input tag_t [RdPartialBufferSize-1:0] buf_spyglass_tag_i,
+  input logic [RdPartialBufferSize-1:0] buf_spyglass_valid_i,
+  /// Control Output for the index of the partial result buffer
+  output idx_part_res_t [1:0] select_partial_result_idx_o,
+  /// Control Signal for the Muxes / DeMuxes
+  output logic [1:0] ctrl_part_res_mux_o,
+  output logic ctrl_output_demux_o
+);
+
+/* All local parameter */
+// Buffer Size to control all ongoing reduction
+// @ GENERIC: Needs to be at least RdPartialBufferSize which is RdPipelineDepth+1
+// @ STALLING: We only have one reduction ongoing so 1 is enough
+// @ SIMPLE: We do not need any buffer so 1 to avoid questa panic (will be optimized away as never used)
+localparam int unsigned RdBufferSize = (GENERIC) ? (RdPartialBufferSize) : 1;
+
+localparam bit [NumRoutes-1:0] ONES = 1;
+
+/* All Typedef Vars */
+
+// Generate the axi / floo types to extract all relevant information
+// If we want to support another protocol rather than AXI then add it here
+typedef logic [AxiCfg.AddrWidth-1:0] axi_addr_t;
+typedef logic [AxiCfg.InIdWidth-1:0] axi_in_id_t;
+typedef logic [AxiCfg.OutIdWidth-1:0] axi_out_id_t;
+typedef logic [AxiCfg.UserWidth-1:0] axi_user_t;
+typedef logic [AxiCfg.DataWidth-1:0] axi_data_t;
+typedef logic [AxiCfg.DataWidth/8-1:0] axi_strb_t;
+
+`AXI_TYPEDEF_ALL_CT(axi, axi_req_t, axi_rsp_t, axi_addr_t, axi_in_id_t, axi_data_t, axi_strb_t, axi_user_t)
+`AXI_TYPEDEF_AW_CHAN_T(axi_out_aw_chan_t, axi_addr_t, axi_out_id_t, axi_user_t)
+`FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi, AxiCfg, hdr_t)
+
+// Typedef to encompass an ongoing reduction in the buffer
+// TODO(raroth): Try to store only the header inside the buffer_t and not the whole flit!
+// It looks like the Synth can not optimize away the unused FlipFlop.
+// I assumed this but I think it won't do it.
+// It will get an uglier code but It can save some Area!
+typedef struct packed {
+  // Copy (one) flit for all metadata in the package
+  flit_t header;
+  // Final reduction mask e.g. from which input i need to reduce input flits
+  mask_t final_mask;
+  // Assigned tag by the generator
+  tag_t tag;
+  // Output direction (N - E - S - W - L) of the flit
+  mask_t output_dir;
+  // Is the entry valid
+  logic f_valid;
+  // forward directly with the bypass
+  logic f_forwarding;
+} buffer_t;
+
+// Index for the input
+typedef logic [cf_math_pkg::idx_width(NumRoutes)-1:0] idx_input_t;
+
+/* Variable declaration */
+
+// Buffer to hold all reduction info's
+buffer_t [RdBufferSize-1:0] buffer_q, buffer_d;
+
+// Stalled input signals
+flit_in_out_dir_tag_t [NumRoutes-1:0] stalling_flit;
+logic [NumRoutes-1:0] stalling_valid;
+logic [NumRoutes-1:0] stalling_ready;
+
+// Signal to insert a new reduction in the buffer (serialized approach to reduce required logic)
+// Iterates over all input and set to 1 if we found a tag that is not yet inside the buffer
+flit_in_out_dir_tag_t [NumRoutes-1:0] unkown_incoming_flit;
+flit_in_out_dir_tag_t new_incoming_flit;
+logic [NumRoutes-1:0] unkown_incoming_valid;
+logic new_incoming_valid;
+
+// Indicates if we have inserted the new data into the buffer or not
+logic f_insert_data_in_buffer;
+
+// Flags to find two valid operands for a reduction
+logic f_op1_found;
+logic f_op2_found;
+
+// Temporary signals for all selection(s)
+idx_input_t [1:0] tmp_sel_input;
+idx_part_res_t [1:0] tmp_sel_part_res_buf;
+logic [1:0] tmp_part_res_mux;
+
+// Locked in signals to prevent changes during backpressure
+logic locked_d, locked_q;
+idx_input_t [1:0] selected_input_d, selected_input_q;
+idx_part_res_t [1:0] selected_partial_result_buffer_d, selected_partial_result_buffer_q;
+logic [1:0] selected_partial_result_mux_d, selected_partial_result_mux_q;
+RdOperation_t selected_op_d, selected_op_q;
+tag_t selected_tag_d, selected_tag_q;
+
+// Bypass Signal
+flit_mask_tag_t bypass_flit;
+logic bypass_valid;
+logic bypass_ready;
+
+// internal generated signals which hold the output mask for the current reduction response
+flit_t fully_red_flit;
+mask_t fully_red_mask;
+flit_t metadata_out_flit;
+mask_t metadata_out_mask;
+
+// Var used for the simple controller
+flit_t req_header;
+mask_t req_output_mask;
+logic simple_reduction_ongoing_n;
+
+// Signal to retire the elements from the buffer
+logic retire_element;
+logic stalling_reduction_ongoing_n;
+logic backpressure_fpu_resp;
+
+/* Module Declaration */
+
+// If the stalling mode is enabled then we have to stall the inputs
+// e.g. we deassert the valid without acknowledgement to the outside world
+// and only resets of the final flit leaves the reduction logic
+if(STALLING) begin : gen_stalling
+  for(genvar i = 0; i < NumRoutes;i++) begin
+    floo_offload_reduction_stalling #() i_stalling_module (
+      .clk_i (clk_i),
+      .rst_ni (rst_ni),
+      .flush_i (flush_i),
+      .src_valid_i (head_fifo_valid_i[i]),
+      .src_ready_o (head_fifo_ready_o[i]),
+      .stalling_i (retire_element | (bypass_valid & bypass_ready)),
+      .dst_valid_o (stalling_valid[i]),
+      .dst_ready_i (stalling_ready[i])
+    );
+  end
+  assign stalling_flit = head_fifo_flit_i;
+end else begin : gen_no_stalling
+  assign stalling_flit = head_fifo_flit_i;
+  assign stalling_valid = head_fifo_valid_i;
+  assign head_fifo_ready_o = stalling_ready;
+end
+
+// Check if on any input we have new data available. Each input is checked against
+// each buffer entry if the tag is available. However only one input will be forwarded
+// into the buffer.
+if(GENERIC || STALLING) begin : gen_filter_unkown_flit_tags
+  // Search if any element on the input can be inserted into the buffer
+  always_comb begin
+    // Init all Vars
+    unkown_incoming_flit = '0;
+    unkown_incoming_valid = '0;
+
+    // Loop over all inputs
+    for(int k = 0; k < NumRoutes; k++) begin
+      // Is the incoming flit valid?
+      if(stalling_valid[k] == 1'b1) begin
+        // This input can be inserted if it is not already in the buffer
+        unkown_incoming_valid[k] = 1'b1;
+        // Assign the actual data here
+        unkown_incoming_flit[k] = stalling_flit[k];
+        // Go through the whole buffer and check if the element is already inside or not
+        for(int j = 0; j < RdBufferSize; j++) begin
+          if((stalling_flit[k].tag == buffer_q[j].tag) && (buffer_q[j].f_valid == 1'b1)) begin
+            unkown_incoming_valid[k] = 1'b0;
+          end
+        end
+      end
+    end
+  end
+end else begin
+  assign unkown_incoming_flit = '0;
+  assign unkown_incoming_valid = 1'b0;
+end
+
+// Select one of the unknown flits to be inserted in the buffer next. (Prio. lower indexes)
+// Both loop's could be combined but maybe there could be a better way to find the lsb
+// indexes here
+if(GENERIC || STALLING) begin : gen_incoming_data
+  // Search if any element on the input can be inserted into the buffer
+  always_comb begin
+    // Init all Vars
+    new_incoming_flit = '0;
+    new_incoming_valid = 1'b0;
+
+    // Loop over unknown inputs
+    for(int k = 0; k < NumRoutes; k++) begin
+      // Is the incoming flit valid and is not already one selected?
+      if((unkown_incoming_valid[k] == 1'b1) && (new_incoming_valid == 1'b0)) begin
+        // If we find a valid one - select our selection
+        new_incoming_valid = 1'b1;
+        new_incoming_flit = unkown_incoming_flit[k];
+      end
+    end
+  end
+end else begin
+  assign new_incoming_flit = '0;
+  assign new_incoming_valid = 1'b0;
+end
+
+// Detect if the system does apply backpressure
+// The problem is that we want to fill the FPU pipeline. However if backpressure is applied we can
+// only insert an element if at least one element originates in the partial result buffer.
+// Otherwise we could deadlock the system.
+assign backpressure_fpu_resp = reduction_resp_valid_i & (~reduction_resp_ready_i);
+
+// The control part can be split into 4 distinctive stages (with additional substages)
+// 1. Stage: Populate the buffer with new data
+// 2. Stage: Schedule a reduction when possible
+// 3. Stage: Schedule a direct passthrough if possible (AXI AW)
+// 4. Stage: Retire a buffer entry if the corresponding tag leaves the reduction logic
+// 5. Stage: If one higher prio buffer entry is free then push the buffer by one position
+
+// The entries of the buffer are prioritized by the index. Operations from the first index
+// are prioritized over the ones from the second, then from the third etc.
+// This only happens if we use the generic configuration as the stalling one has only
+// a single buffer entry (b.c. it works only on one reduction at the time).
+
+// TODO(raroth): optimize the timing for the generic controller here. The "locked_d" variable serializes
+// the evaluation of the meta data buffer. If we remove this and introduce a priority
+// arbiter at the output to select the most "pressing" reduction we could restore the timing
+if(GENERIC || STALLING) begin : gen_controller_stalling_generic
+  always_comb begin
+    // Init all Vars
+    buffer_d = buffer_q;
+    locked_d = locked_q;
+    selected_input_d = selected_input_q;
+    selected_partial_result_buffer_d = selected_partial_result_buffer_q;
+    selected_partial_result_mux_d = selected_partial_result_mux_q;
+    selected_op_d = selected_op_q;
+    selected_tag_d = selected_tag_q;
+
+    // All ready signals for the input(s)
+    stalling_ready = '0;
+
+    // Output signal to the reduction
+    operand_data_o = '0;
+    operand_valid_o = '0;
+
+    // Init control signal for partial result / mux
+    select_partial_result_idx_o = '0;
+    ctrl_part_res_mux_o = '0;
+    reduction_req_operation_o = floo_pkg::F_Add;
+
+    // Flags
+    f_op1_found = 1'b0;
+    f_op2_found = 1'b0;
+    f_insert_data_in_buffer = 1'b0;
+
+    // Temporary selector signals
+    tmp_sel_input = '0;
+    tmp_sel_part_res_buf = '0;
+    tmp_part_res_mux = '0;
+
+    // Signal to directly bypass the reduction
+    bypass_flit = '0;
+    bypass_valid = 1'b0;
+
+    // Iterate over all buffer entries - handle by prio
+    for(int i = 0; i < RdBufferSize; i++) begin
+      // Reset Var for the loop
+      f_op1_found = 1'b0;
+      f_op2_found = 1'b0;
+      tmp_sel_input = '0;
+      tmp_sel_part_res_buf = '0;
+      tmp_part_res_mux = '0;
+
+      // 1. Stage: Accept new Data into the Buffer if we have free space and a valid entry
+      if(buffer_d[i].f_valid == 1'b0) begin
+
+        // Check if we can insert a new element
+        if((new_incoming_valid == 1'b1) && (f_insert_data_in_buffer == 1'b0)) begin
+          // Lock in such a way that only one buffer entry can accept the data
+          f_insert_data_in_buffer = 1'b1;
+          // Insert the data into the selected entry
+          buffer_d[i].header = new_incoming_flit.flit;
+          buffer_d[i].final_mask = new_incoming_flit.input_exp;
+          buffer_d[i].output_dir = new_incoming_flit.output_dir;
+          buffer_d[i].tag = new_incoming_flit.tag;
+          buffer_d[i].f_valid = 1'b1;
+          // Check if we have to directly forward the flit
+          buffer_d[i].f_forwarding = (new_incoming_flit.flit.hdr.collective_op == floo_pkg::SeqAW) ? 1'b1 : 1'b0;
+          if((buffer_d[i].f_forwarding == 1'b1) && (RdEnableBypass == 1'b0)) begin
+            $error($time, "An AW flit got to a reduction controller which does not support bypass");
+          end
+        end
+      end
+
+      // 2.1 Stage: Try to schedule an operation from the partial result buffer when:
+      // - Entry in Buffer is valid
+      // - No higher prioritized buffer already has scheduled an operation
+      // - The reduction is not an AW transaction
+      if( (buffer_d[i].f_valid == 1'b1) && (locked_d == 1'b0) &&
+          (buffer_d[i].f_forwarding == 0)) begin
+
+        // First iterate over the partial result buffer
+        for(int j = 0; j < RdPartialBufferSize;j++) begin
+          if((buf_spyglass_tag_i[j] == buffer_d[i].tag) && (buf_spyglass_valid_i[j] == 1'b1)) begin
+            if(f_op1_found == 1'b0) begin
+              // Select the appropriate entry in the partial result buffer
+              tmp_sel_part_res_buf[0] = j;
+              // Switch the Mux0 from the input to the partial result buffer
+              tmp_part_res_mux[0] = 1'b1;
+              // lock the first op
+              f_op1_found = 1'b1;
+            end else if(f_op2_found == 1'b0) begin
+              // Select the appropriate entry in the partial result buffer
+              tmp_sel_part_res_buf[1] = j;
+              // Switch the Mux1 from the input to the partial result buffer
+              tmp_part_res_mux[1] = 1'b1;
+              // lock the second op
+              f_op2_found = 1'b1;
+            end
+          end
+        end
+      end
+
+      // 2.2 Stage: Try to schedule an operation from the inputs when:
+      // - Entry in Buffer is valid
+      // - No higher prioritized buffer already has scheduled an operation
+      // - The reduction is not an AW transaction
+      // - No backpressure is applied to the FPU response or f_op1_found is 1
+      // and f_op2_found is 0 (otherwise deadlock potential!)
+      if( (buffer_d[i].f_valid == 1'b1) &&
+          (locked_d == 1'b0) &&
+          (buffer_d[i].f_forwarding == 0) &&
+          ((backpressure_fpu_resp == 1'b0) || ((f_op1_found == 1'b1) && (f_op2_found == 1'b0)))) begin
+
+        // Iterate over all inputs
+        for(int j = 0; j < NumRoutes;j++) begin
+          if((stalling_flit[j].tag == buffer_d[i].tag) && (stalling_valid[j] == 1'b1)) begin
+            if(f_op1_found == 1'b0) begin
+              // Select the appropriate input (Mux0 is per default input selection)
+              tmp_sel_input[0] = j;
+              // lock the first op
+              f_op1_found = 1'b1;
+            end else if(f_op2_found == 1'b0) begin
+              // Select the appropriate input (Mux1 is per default input selection)
+              tmp_sel_input[1] = j;
+              // lock the second op
+              f_op2_found = 1'b1;
+            end
+          end
+        end
+      end
+
+      // 2.3 Stage: Schedule an operation if:
+      // - Both operands are found
+      // - No locked in operation
+      if( (f_op1_found == 1'b1) &&
+          (f_op2_found == 1'b1) &&
+          (locked_d == 1'b0)) begin
+
+        // lock the output signals
+        locked_d = 1'b1;
+        // Copy the required signal
+        selected_input_d = tmp_sel_input;
+        selected_partial_result_buffer_d = tmp_sel_part_res_buf;
+        selected_partial_result_mux_d = tmp_part_res_mux;
+        selected_op_d = buffer_d[i].header.hdr.collective_op;
+        selected_tag_d = buffer_d[i].tag;
+      end
+
+      // 3 Stage: Send an AW flit to the output if:
+      // - buffer entry 0 to ensure ordering
+      // - The reduction is an AW transaction
+      // - All required AW flits are aligned at the input
+      // - If stalling: Only when the FPU doesn't have an element in flight, otherwise reordering will occur!
+      if( (buffer_d[i].f_valid == 1'b1) &&
+          (buffer_d[i].f_forwarding == 1) &&
+          ((stalling_valid & buffer_d[i].final_mask) == buffer_d[i].final_mask) &&
+          ((stalling_reduction_ongoing_n == 1'b1) || (!STALLING)) &&
+          (i == 0) &&
+          (RdEnableBypass == 1'b1)) begin
+
+        // Assign valid & data signal to the output
+        bypass_valid = 1'b1;
+        bypass_flit.flit = buffer_d[i].header;
+        bypass_flit.mask = buffer_d[i].output_dir;
+        bypass_flit.tag = buffer_d[i].tag;
+        // Multicast the bypass ready signal to the inputs which require the signal
+        stalling_ready = (buffer_d[i].final_mask & {(NumRoutes){bypass_ready}});
+      end
+
+      // 4 Stage: Retire an element if:
+      // - Valid Buffer Element
+      // - Tag matches the one that leaves the reduction logic
+      // - Handshake on the output
+      //
+      // Note:
+      // To guarantee ordering we only should retire from the 0'th entry!
+      // However to avoid deadlocks with not retired instruction I allow
+      // to retire from any position. TODO: Solve by introducing assertion
+      if( (buffer_d[i].f_valid == 1'b1) &&
+          (buffer_d[i].tag == final_flit_o.tag) &&
+          ((retire_element == 1'b1) || ((bypass_valid == 1'b1) && (bypass_ready == 1'b1) && STALLING))) begin
+
+        // Reset the valid flag of the buffer
+        buffer_d[i].f_valid = 1'b0;
+        // ATTENTION! DO NOT OVERWRITE THE buffer_d[i].mask & .header fields!
+        // Otherwise the stalling implementation won't work
+        if(i > 0) begin
+          $error($time, "We retired an element other from buffer entry 0. This should not happen.");
+        end
+      end
+    end
+
+    // 5 Stage: Copy the data to a higher prio slot if it is free and we are valid
+    // (Stalling case already handled by having i only 0!)
+    // TODO(lleone): Can you change the loop to start from i = 1?
+    for(int i = 0; i < RdBufferSize; i++) begin
+      if(i != 0) begin
+        if((buffer_d[i-1].f_valid == 1'b0) && (buffer_d[i].f_valid == 1'b1)) begin
+          buffer_d[i-1] = buffer_d[i]; // copy the data (incl. valid bit)
+          buffer_d[i] = '0; // delete all old data
+        end
+      end
+    end
+
+    // Handle all locked in signals!
+    if(locked_d == 1'b1) begin
+      // Handle both operands
+      for(int i = 0; i < 2; i++) begin
+        // Use data provided by the partial result buffer
+        if(selected_partial_result_mux_d[i] == 1'b1) begin : fetch_result_from_partial_buffer
+          select_partial_result_idx_o[i] = selected_partial_result_buffer_d[i];
+        // Use data provided by the input(s)
+        end else begin : fetch_result_from_input
+          // data: extract the data from the AXI W channel in the selected input
+          // mask: shift the 00001 according to the selected input
+          // tag: just get it from the locked in version
+          if(RdSupportAxi) begin
+            operand_data_o[i] = {extractAXIWdata(stalling_flit[selected_input_d[i]].flit), ONES << selected_input_d[i], selected_tag_d};
+          end
+          // Set the valid bit
+          operand_valid_o[i] = 1'b1;
+          // Forward the ready bit without influencing already existing ready bits
+          // on other inputs. We can schedule a bypass and an operation in the same
+          // cycle.
+          // We either shift 00001 or 00000 to the left according to the selected input
+          stalling_ready = stalling_ready | ((ONES & {(NumRoutes){operand_ready_i[i]}}) << selected_input_d[i]);
+        end
+      end
+      // Set the selected OP
+      reduction_req_operation_o = selected_op_d;
+      // Set the mux
+      ctrl_part_res_mux_o = selected_partial_result_mux_d;
+    end
+
+    // Release the lock if we recognize a valid handshake
+    if((reduction_req_valid_i == 1'b1) && (reduction_req_ready_i == 1'b1)) begin
+      locked_d = 1'b0;
+    end
+  end
+end else begin
+  // Set all not required vars to 0
+  assign select_partial_result_idx_o = '0;
+  assign ctrl_part_res_mux_o = '0;
+  assign selected_partial_result_buffer_d = '0;
+  assign selected_partial_result_mux_d = '0;
+  assign selected_tag_d = '0;
+  assign selected_op_d = '0;
+end
+
+// Determine when an element from the buffer should be retired. In the stalling case we track
+// if the element is to be forwarded to the output after this iteration.
+if(GENERIC) begin : gen_generic_retirement
+  assign retire_element = final_valid_o & final_ready_i;
+end else if(STALLING) begin : gen_stalling_retirement
+  assign retire_element = ((buffer_d[0].final_mask == reduction_req_mask_i) && (reduction_req_valid_i == 1'b1) && (reduction_req_ready_i == 1'b1)) ? 1'b1 : 1'b0;
+end else begin
+  assign retire_element = 1'b0;
+end
+
+// Simple controller which is only able to combine two flits
+// TODO: Add assertion to the decoded mask so that at most only two bits can be set!
+if(SIMPLE) begin : gen_simple_controller
+  always_comb begin
+
+    // Init all Vars
+    buffer_d = buffer_q;
+    locked_d = locked_q;
+    selected_input_d = selected_input_q;
+
+    // All ready signals for the input(s)
+    stalling_ready = '0;
+
+    // Output signal to the reduction
+    operand_data_o = '0;
+    operand_valid_o = '0;
+
+    // Init control signal for partial result / mux
+    reduction_req_operation_o = '0;
+
+    // Signal to directly bypass the reduction
+    bypass_flit = '0;
+    bypass_valid = 1'b0;
+
+    // Simple controller specific vars
+    req_header = '0;
+    req_output_mask = '0;
+
+    // Set initial value for the op found signal
+    tmp_sel_input = '0;
+    f_op1_found = 1'b0;
+    f_op2_found = 1'b0;
+
+    // 1.1 Stage: Search for schedulable operands when:
+    // - Input is valid
+    // - Currently no operation locked in
+    for(int i = 0; i < NumRoutes; i++) begin
+      // Find the first operand
+      if((stalling_valid[i] == 1'b1) && (f_op1_found == 1'b0) && (locked_d == 1'b0)) begin
+        // Select the appropriate input (No Mux in simple case)
+        tmp_sel_input[0] = i;
+        // lock the first op
+        f_op1_found = 1'b1;
+      // Find the second operand
+      end else if((stalling_valid[i] == 1'b1) && (f_op1_found == 1'b1) && (f_op2_found == 1'b0) && (locked_d == 1'b0)) begin
+        // Select the appropriate input (No Mux in simple case)
+        tmp_sel_input[1] = i;
+        // lock the second op
+        f_op2_found = 1'b1;
+      end
+    end
+
+    // 1.2 Stage: Schedule an operation if:
+    // - Both operands are found
+    // - No locked in operation
+    if( (f_op1_found == 1'b1) &&
+        (f_op2_found == 1'b1) &&
+        (locked_d == 1'b0)) begin
+
+      // lock the output signals
+      locked_d = 1'b1;
+      // Copy the required signal
+      selected_input_d = tmp_sel_input;
+    end
+
+    // 1.3 Stage: Forward the data to the FPU or the bypass
+    if(locked_d == 1'b1) begin
+      // Handle the case for a bypassable flit
+      if(stalling_flit[selected_input_d[0]].flit.hdr.collective_op == floo_pkg::SeqAW) begin
+        // Stall sending the bypass until the pipeline is empty to avoid reordering
+        if(simple_reduction_ongoing_n) begin
+          // AW flit found - direct forward to the output
+          bypass_valid = 1'b1;
+          // Forward the entire AW flit
+          bypass_flit.flit = stalling_flit[selected_input_d[0]].flit;
+          bypass_flit.mask = stalling_flit[selected_input_d[0]].output_dir;
+          // Forward the ready signal to all involved inputs
+          stalling_ready = (stalling_flit[selected_input_d[0]].input_exp & {(NumRoutes){bypass_ready}});
+        end
+      end else begin
+        // Iterate over all operands and prepare the data
+        stalling_ready = '0;
+        for(int i = 0; i < 2; i++) begin
+          if(RdSupportAxi) begin
+            operand_data_o[i].data = extractAXIWdata(stalling_flit[selected_input_d[i]].flit);
+          end
+
+          // Forward the handshaking
+          stalling_ready = stalling_ready | ((ONES & {(NumRoutes){operand_ready_i[i]}}) << selected_input_d[i]);
+          // Schedule the operation
+          operand_valid_o[i] = 1'b1;
+        end
+        // Select the ongoing operand
+        reduction_req_operation_o = stalling_flit[selected_input_d[0]].flit.hdr.collective_op;
+        // Forward the header of the flit
+        req_header = stalling_flit[selected_input_d[0]].flit;
+        // Forward the output selection mask
+        req_output_mask = stalling_flit[selected_input_d[0]].output_dir;
+      end
+    end
+
+    // 1.4 Stage: Release lock if operation was accepted
+    if((reduction_req_valid_i == 1'b1) && (reduction_req_ready_i == 1'b1)) begin
+      locked_d = 1'b0;
+    end
+
+    // 1.5 Stage: Release lock if bypass was accepted
+    if((bypass_valid == 1'b1) && (bypass_ready == 1'b1)) begin
+      locked_d = 1'b0;
+    end
+  end
+end else begin
+  assign req_header = '0;
+  assign req_output_mask = '0;
+end
+
+// If we want to support a bypass then use an arb-tree to include the bypass!
+if(RdEnableBypass == 1'b1) begin : gen_bypass_arb_tree
+  stream_arbiter_flushable #(
+    .DATA_T (flit_mask_tag_t),
+    .N_INP (2)
+  ) i_output_arbiter (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .inp_data_i ({{fully_red_flit, fully_red_mask, fully_red_data_i.tag}, bypass_flit}),
+    .inp_valid_i ({fully_red_valid_i, bypass_valid}),
+    .inp_ready_o ({fully_red_ready_o, bypass_ready}),
+    .oup_data_o (final_flit_o),
+    .oup_valid_o (final_valid_o),
+    .oup_ready_i (final_ready_i)
+  );
+end else begin
+  assign final_flit_o = {fully_red_flit, fully_red_mask, fully_red_data_i.tag};
+  assign final_valid_o = fully_red_valid_i;
+  assign fully_red_ready_o = final_ready_i;
+end
+
+// Generate the header and the output mask for the fully reduced data
+// @ Generic: Iterate through the buffer and try to find a matching tag
+// then extract the header and the output mask
+// @ Stalling: When we know that the element leaves the reduction logic after the next
+// reduction then we fetch the header / output mask and store them inside
+// a designated fifo.
+// @ Simple: Store the header / output dir directly inside a fifo when
+// the request is placed!
+if(GENERIC) begin
+  assign stalling_reduction_ongoing_n = 1'b0; // Sig. only used in the stalling case - tied off here
+  always_comb begin
+    metadata_out_flit = '0;
+    metadata_out_mask = '0;
+
+    for(int i = 0; i < RdBufferSize; i++) begin
+      if(buffer_q[i].f_valid && (buffer_q[i].tag == fully_red_data_i.tag) && fully_red_valid_i) begin
+        metadata_out_flit = buffer_q[i].header;
+        metadata_out_mask = buffer_q[i].output_dir;
+      end
+    end
+  end
+end else if(STALLING) begin
+  // Fifo to store the header of the element during the FPU reduction
+  fifo_v3 #(
+    .FALL_THROUGH (1'b0),
+    .dtype (flit_t),
+    .DEPTH (RdPipelineDepth+2)
+  ) i_fifo_header (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .testmode_i (1'b0),
+    .full_o (),
+    .empty_o (stalling_reduction_ongoing_n),
+    .usage_o (),
+    .data_i (buffer_d[0].header),
+    // push header only if we know that this is the last iteration of the flit e.g. it
+    // leaves the reduction logic afterwards
+    .push_i (retire_element),
+    .data_o (metadata_out_flit),
+    // pop header on active fpu resp hs and active output hs
+    // We need to include the resp hs as otherwise a bypass flit could remove an element
+    .pop_i (final_valid_o & final_ready_i & reduction_resp_valid_i & reduction_resp_ready_i)
+  );
+
+  // Fifo to store the output direction of the element during the FPU reduction
+  fifo_v3 #(
+    .FALL_THROUGH (1'b0),
+    .DATA_WIDTH (NumRoutes),
+    .DEPTH (RdPipelineDepth+2)
+  ) i_fifo_outdir (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .testmode_i (1'b0),
+    .full_o (),
+    .empty_o (),
+    .usage_o (),
+    .data_i (buffer_d[0].output_dir),
+    // push mask only if we know that this is the last iteration of the flit e.g. it
+    // leaves the reduction logic afterwards
+    .push_i (retire_element),
+    .data_o (metadata_out_mask),
+    // pop mask on active fpu resp hs and active output hs
+    // We need to include the resp hs as otherwise a bypass flit could remove an element
+    .pop_i (final_valid_o & final_ready_i & reduction_resp_valid_i & reduction_resp_ready_i)
+  );
+end else begin
+  // Fifo to store the header of the element during the FPU reduction
+  fifo_v3 #(
+    .FALL_THROUGH (1'b0),
+    .dtype (flit_t),
+    .DEPTH (RdPipelineDepth+2)
+  ) i_fifo_header (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .testmode_i (1'b0),
+    .full_o (),
+    .empty_o (simple_reduction_ongoing_n),
+    .usage_o (),
+    .data_i (req_header),
+    .push_i (reduction_req_valid_i & reduction_req_ready_i), // push mask on active fpu req hs
+    .data_o (metadata_out_flit),
+    .pop_i (reduction_resp_valid_i & reduction_resp_ready_i) // pop mask on active fpu resp hs
+  );
+
+  // Fifo to store the output direction of the element during the FPU reduction
+  fifo_v3 #(
+    .FALL_THROUGH (1'b0),
+    .DATA_WIDTH (NumRoutes),
+    .DEPTH (RdPipelineDepth+2)
+  ) i_fifo_outdir (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .testmode_i (1'b0),
+    .full_o (),
+    .empty_o (),
+    .usage_o (),
+    .data_i (req_output_mask),
+    .push_i (reduction_req_valid_i & reduction_req_ready_i), // push mask on active fpu req hs
+    .data_o (metadata_out_mask),
+    .pop_i (reduction_resp_valid_i & reduction_resp_ready_i) // pop mask on active fpu resp hs
+  );
+end
+
+// Parse the mask
+// Combine the metadata flit together with the result from the reduction.
+always_comb begin
+  fully_red_flit = '0;
+  if(RdSupportAxi == 1'b1) begin
+    fully_red_flit = insertAXIWdata(metadata_out_flit, fully_red_data_i.data);
+  end
+  fully_red_mask = metadata_out_mask;
+end
+
+// Generate the signal for the demux which either forwards the reduction response
+// to the partial buffer or towards the output (if fully reduced)
+if(GENERIC) begin : gen_response_demux_generic
+  logic [RdBufferSize-1:0] temp_match;
+  for(genvar i = 0; i < RdBufferSize; i++) begin
+    // Only allow if we found a matching final mask and tag with a valid entry
+    assign temp_match[i] = (buffer_q[i].f_valid && (buffer_q[i].final_mask == reduction_resp_mask_i) && (buffer_q[i].tag == reduction_resp_tag_i)) ? 1'b1 : 1'b0;
+  end
+  assign ctrl_output_demux_o = (|temp_match) & reduction_resp_valid_i;
+end else if(STALLING) begin : gen_response_demux_stalling
+  // Fifo to store if the element should be forwarded to the output
+  fifo_v3 #(
+    .FALL_THROUGH (1'b0),
+    .DATA_WIDTH (1),
+    .DEPTH (RdPipelineDepth+2)
+  ) i_fifo_outdir (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (flush_i),
+    .testmode_i (1'b0),
+    .full_o (),
+    .empty_o (),
+    .usage_o (),
+    .data_i (retire_element),
+    .push_i (reduction_req_valid_i & reduction_req_ready_i), // push mask on active fpu req hs
+    .data_o (ctrl_output_demux_o),
+    .pop_i (reduction_resp_valid_i & reduction_resp_ready_i) // pop mask on active fpu resp hs
+  );
+end else begin
+  // No buffering - always forward it
+  assign ctrl_output_demux_o = 1'b1;
+end
+
+// AXI Specific function!
+// Insert data into AXI specific W frame!
+function automatic flit_t insertAXIWdata(flit_t metadata, RdData_t data);
+  floo_axi_w_flit_t w_flit;
+  // Parse the entire flit
+  w_flit = floo_axi_w_flit_t'(metadata);
+  // Copy the new data
+  w_flit.payload.data = data;
+  return flit_t'(w_flit);
+endfunction
+
+// Extract data from AXI specific W frame!
+function automatic RdData_t extractAXIWdata(flit_t metadata);
+  floo_axi_w_flit_t w_flit;
+  // Parse the entire flit
+  w_flit = floo_axi_w_flit_t'(metadata);
+  // Return the W data
+  return w_flit.payload.data;
+endfunction
+
+// Store the data in the buffer
+`FF(buffer_q, buffer_d, '0, clk_i, rst_ni)
+
+// Store all locked in signals
+`FF(locked_q, locked_d, '0, clk_i, rst_ni)
+`FF(selected_input_q, selected_input_d, '0, clk_i, rst_ni)
+`FF(selected_partial_result_buffer_q, selected_partial_result_buffer_d, '0, clk_i, rst_ni) // Only generic and stalling
+`FF(selected_partial_result_mux_q, selected_partial_result_mux_d, '0, clk_i, rst_ni) // Only generic and stalling
+`FF(selected_op_q, selected_op_d, floo_pkg::F_Add, clk_i, rst_ni) // Only generic and stalling
+`FF(selected_tag_q, selected_tag_d, '0, clk_i, rst_ni) // Only generic and stalling
+
+/* ASSERTION Checks */
+// We can only run GENERIC or SIMPLE or STALLING
+`ASSERT_INIT(Invalid_Configuration_1, !(GENERIC & SIMPLE))
+`ASSERT_INIT(Invalid_Configuration_2, !(STALLING & SIMPLE))
+`ASSERT_INIT(Invalid_Configuration_3, !(GENERIC & STALLING))
+`ASSERT_INIT(Invalid_Configuration_4, (GENERIC | STALLING | SIMPLE))
+
+// Currently the AXI support must be enabled
+`ASSERT_INIT(Support_AXI, RdSupportAxi)
+
+endmodule
diff --git a/hw/floo_offload_reduction_stalling.sv b/hw/floo_offload_reduction_stalling.sv
new file mode 100644
index 00000000..6fce79eb
--- /dev/null
+++ b/hw/floo_offload_reduction_stalling.sv
@@ -0,0 +1,101 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Raphael Roth
+
+// This module allows to stall a valid / ready handshake by delaying the ready signal
+// to the source. Any valid signal acknowledged on the destination side will lead to
+// the deassertion of said valid signal. The handshake to the source can be controlled
+// by an external stalling signal.
+ +// The stalling signal is not preemtive e.g. it needs to be asserted in either the cycle where +// the dst handshake occurs or after. A stalling signal befor will be ignored! + +`include "common_cells/registers.svh" + +module floo_offload_reduction_stalling #() ( + /// Control Inputs + input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// All Input Connections + input logic src_valid_i, + output logic src_ready_o, + /// Stop stalling the valid signal + input logic stalling_i, + /// All Output Connections + output logic dst_valid_o, + input logic dst_ready_i +); + +/* All local parameter */ + +/* All Typedef Vars */ + +// Var to track the state of the handshake +typedef enum logic [1:0] { + s_idle = 2'd0, + s_forward = 2'd1, + s_stalling = 2'd2 +} state_t; + +/* Variable declaration */ +state_t sm_d, sm_q; + +/* Module Declaration */ +always_comb begin + // Init all Vars + sm_d = sm_q; + + dst_valid_o = 1'b0; + src_ready_o = 1'b0; + + // Small State Machine + case(sm_d) + s_idle: begin + dst_valid_o = src_valid_i; // forward the valid signal + + if((src_valid_i == 1'b1) && (dst_ready_i == 1'b0)) begin + sm_d = s_forward; + end else if((src_valid_i == 1'b1) && (dst_ready_i == 1'b1) && (stalling_i == 1'b0)) begin + sm_d = s_stalling; + end else if((src_valid_i == 1'b1) && (dst_ready_i == 1'b1) && (stalling_i == 1'b1)) begin + src_ready_o = 1'b1; + sm_d = s_idle; + end + end + s_forward: begin + dst_valid_o = src_valid_i; // forward the valid signal + + if((src_valid_i == 1'b1) && (dst_ready_i == 1'b1) && (stalling_i == 1'b0)) begin + sm_d = s_stalling; + end else if((src_valid_i == 1'b1) && (dst_ready_i == 1'b1) && (stalling_i == 1'b1)) begin + src_ready_o = 1'b1; + sm_d = s_idle; + end + end + s_stalling: begin + dst_valid_o = 1'b0; // the valid signal was already acked + + if(stalling_i == 1'b1) begin + src_ready_o = 1'b1; + sm_d = s_idle; + end + end + endcase + + // Reset the state + if(flush_i == 1'b1) begin + sm_d = s_idle; + dst_valid_o = 
1'b0;
+    src_ready_o = 1'b0;
+  end
+end
+
+
+// Buffer the locked in signal
+`FF(sm_q, sm_d, s_idle, clk_i, rst_ni)
+
+
+endmodule
diff --git a/hw/floo_offload_reduction_taggen.sv b/hw/floo_offload_reduction_taggen.sv
new file mode 100644
index 00000000..d30173e2
--- /dev/null
+++ b/hw/floo_offload_reduction_taggen.sv
@@ -0,0 +1,248 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Raphael Roth
+
+// To be able to have multiple inflight reductions at the same time we need to track each element
+// belonging to an individual reduction. Only elements with equal tag will be reduced together.
+// This separation into a separate module allows to reduce the tracking effort inside the rest
+// of the system. Depending on the system restriction we can have a more sophisticated tag
+// generator implementation. In the most general case we would support reduction of
+// out-of-order arriving elements (NOT SUPPORTED!)
+
+// Current Implementation:
+// We want to support the most general input pattern without the overhead of out-of-order
+// tracking. The main problem is that the tag can never be out of sync with respect to all
+// inputs. If one element is expected from a certain input direction then it is only allowed
+// to increment the Tag if the element actually arrives (and not sooner!), therefore we count
+// pending elements on each input. If no element is expected from one direction then the Tag
+// should be incremented immediately.
+ +// Example: +// We have 3 different inputs (A,B & C) for two reductions: +// - Reduction 1 with 2 (1.1 + 1.2) Flits from dir A & B +// - Reduction 2 with 3 (2.1 + 2.2 + 2.3) Flits from dir B & C + +// Cycle 0 A > Flit 1.1A arrives --> gets Tag 1 --> internal Tag set to 2 +// B > Flit expected but not here yet --> internal Tag remains 1 (pending counter = 1) +// C > No Flit expected --> internal Tag set to 2 +// +// Cycle 1 A > Flit 1.2A arrives --> gets Tag 2 --> internal Tag set to 3 +// B > Flit 1.1B arrives --> gets Tag 1 --> internal Tag set to 2 (pending counter = 1) +// C > No Flit expected --> internal Tag set to 3 +// +// Cycle 2 None +// +// Cycle 3 A > No Flit --> internal Tag remains 3 +// B > Flit 1.2B arrives --> gets Tag 2 --> internal Tag set to 3 (pending counter = 0) +// C > No Flit --> internal Tag remains 3 +// +// Cycle 4 None +// +// Cycle 5 A > No Flit expected --> internal Tag set to 4 +// B > Flit 2.1B arrives --> gets Tag 3 --> internal Tag set to 4 +// C > Flit 2.1C arrives --> gets Tag 3 --> internal Tag set to 4 +// +// Cycle 6 None* +// +// Cycle 7 A > No Flit expected --> internal Tag set to 5 +// B > Flit 2.2B arrives --> gets Tag 4 --> internal Tag set to 5 +// C > Flit expected but not here yet --> internal Tag remains 4 (pending counter = 1) +// +// Cycle 8 A > No Flit expected --> internal Tag set to 6 +// B > Flit 2.3B arrives --> gets Tag 5 --> internal Tag set to 6 +// C > Flit expected but not here yet --> internal Tag remains 4 (pending counter = 2) +// +// Cycle 9 A > No Flit --> internal Tag remains 6 +// B > No Flit --> internal Tag remains 6 +// C > Flit 2.2C arrives --> gets Tag 4 --> internal Tag set to 5 (pending counter = 1) +// +// Cycle 10 A > No Flit --> internal Tag remains 6 +// B > No Flit --> internal Tag remains 6 +// C > Flit 2.3C arrives --> gets Tag 5 --> internal Tag set to 6 (pending counter = 0) +// +// * At this point we have finished reduction threfor all internal tag are required to be on the +// 
same internal level because we do not know from where the next flits will arrive from.
+
+// Restriction:
+// - With the current implementation it is impossible to handle two different incoming reduction
+//   (different target address) requests in the same cycle. However it should work if a pending
+//   element comes in together with a new reduction request (Not tested!).
+// - All inputs need to be strictly in order.
+
+// Open Points:
+// - Check if the module works with backpressure or not. Maybe necessary to introduce output
+//   stage if valid is asserted but not accepted by ready yet.
+// - Evaluate the target address to allow for more than one incoming reduction at the same time
+
+`include "common_cells/registers.svh"
+
+module floo_offload_reduction_taggen #(
+  /// Number of input routes
+  parameter int unsigned NumRoutes = 1,
+  /// Typedef for the Tag
+  parameter type TAG_T = logic,
+  /// Bit-Width of the TAG_T
+  parameter int unsigned RdTagBits = 1
+) (
+  /// Control Inputs
+  input logic clk_i,
+  input logic rst_ni,
+  input logic flush_i,
+  /// All Input directions
+  input logic [NumRoutes-1:0][NumRoutes-1:0] mask_i,
+  input logic [NumRoutes-1:0] valid_i,
+  input logic [NumRoutes-1:0] ready_i,
+  /// Generated Tag for each output
+  output TAG_T [NumRoutes-1:0] tag_o
+);
+
+/* All local parameter */
+localparam int unsigned MaxNumberofOutstandingRed = 1 << RdTagBits;
+
+/* All Typedef Vars */
+
+/* Variable declaration */
+logic [NumRoutes-1:0] inc_pending;
+logic [NumRoutes-1:0] dec_pending;
+logic [NumRoutes-1:0] outstanding_pending;
+
+logic [NumRoutes-1:0][NumRoutes-1:0] gen_mask_with_pending;
+
+logic [NumRoutes-1:0] inc_tag;
+logic [NumRoutes-1:0] inc_tag_pending_src;
+logic [NumRoutes-1:0] general_mask;
+
+TAG_T [NumRoutes-1:0] tag_q, tag_d;
+
+logic [NumRoutes-1:0] handshake;
+
+logic new_reduction_incoming;
+
+/* Module Declaration */
+
+// determine if we have an active valid handshake
+assign handshake = valid_i & ready_i;
+
+// Generate Credit
Counter once per input
+for (genvar i = 0; i < NumRoutes; i++) begin : gen_pending_tracker
+  credit_counter #(
+    .NumCredits (MaxNumberofOutstandingRed),
+    .InitCreditEmpty (1'b1)
+  ) i_credit_counter (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .credit_o (),
+    .credit_give_i (inc_pending[i]),
+    .credit_take_i (dec_pending[i]),
+    .credit_init_i (1'b0),
+    .credit_left_o (outstanding_pending[i]), // == 1'b1 if credits are available
+    .credit_crit_o (), // Giving one more credit will fill the credits
+    .credit_full_o ()
+  );
+end
+
+// TODO(lleone): Transpose input mask to gen_mask_with_pending. WHY? IS IT REALLY NECESSARY?
+// Generate the mask - if no pending incoming req then forward the mask, otherwise set to 0!
+for (genvar i = 0; i < NumRoutes; i++) begin
+  for (genvar j = 0; j < NumRoutes; j++) begin
+    assign gen_mask_with_pending[j][i] =
+      ((outstanding_pending[i] == 1'b0) && (handshake[i] == 1'b1)) ? mask_i[i][j] : 1'b0;
+  end
+end
+
+// The general mask indicates if the router expects an element on this input. The or-connection
+// between all inputs is to receive the first handshake on any interface available.
+// The mask is only taken into consideration in the general mask if no pending element exists on
+// the input as the strict in-order requirement determines that the next incoming element
+// belongs to an "old" reduction.
+
+// Here is also the problematic part when two different reductions arrive at the same time:
+// the generated mask would be the combination of the two and the tag would be mixed up!
+
+// Generate the General Mask (OR-Connect all 1 bit / 2 bit etc.)
+for (genvar i = 0; i < NumRoutes; i++) begin : gen_reduce_bitwise_outer + assign general_mask[i] = |gen_mask_with_pending[i]; +end + +// Generate the Signal where we indicate if a new reduction is incoming +assign new_reduction_incoming = (|handshake) & (|general_mask); + +always_comb begin + // Init all Vars + inc_pending = '0; + dec_pending = '0; + inc_tag = '0; + inc_tag_pending_src = '0; + + // Iterate over all inputs + for (int i = 0; i < NumRoutes;i++) begin + // Increment the Tag if we have a valid handshake and the bit in the general mask is set + // (Element expected from this input and element is actually there) + if((general_mask[i] == 1'b1) && (handshake[i] == 1'b1) && (new_reduction_incoming == 1'b1)) begin + // Edge case: On another input we have new incoming request but we have also a pending one + // with the same mask on this input therefore the received entry is the pending one + // (handled further down) and not the "new" one - so increment the pending one + if(outstanding_pending[i] == 1'b1) begin + inc_pending[i] = 1'b1; + end else begin + inc_tag[i] = 1'b1; + end + end + + // Increment the Pending for this Input when the general mask bit is set but we do not have a hs + // (Element expected from this input but element is not there) + if((general_mask[i] == 1'b1) && (handshake[i] == 1'b0) && (new_reduction_incoming == 1'b1)) begin + inc_pending[i] = 1'b1; + end + + // Increment the Tag if the general mask bit is clear but somewhere exists a hs + // (No Element expected from this input - make sure to only increment by 1) + // However if this entry is backpressured then add a pending + if((general_mask[i] == 1'b0) && (new_reduction_incoming == 1'b1)) begin + if(valid_i[i] == 1'b0) begin + inc_tag[i] = 1'b1; + end else begin + inc_pending[i] = 1'b1; + end + end + + // Decrement the Pending for this Input if we have a pending incoming element and a valid hs + // (Element arrives from a erlier handled request but was pending) + 
if((outstanding_pending[i] == 1'b1) && (handshake[i] == 1'b1)) begin
+      dec_pending[i] = 1'b1;
+      inc_tag_pending_src[i] = 1'b1;
+    end
+  end
+end
+
+// TODO(lleone): WHY NOT USING A NORMAL COUNTER? In this code you might increment twice if both signals are asserted? If so use delta counter?
+// Generate the Tag's here!
+always_comb begin
+  // Init all Vars
+  tag_d = tag_q;
+
+  // Iterate over all inputs
+  for (int i = 0; i < NumRoutes;i++) begin
+
+    // Increment the Tag
+    if(inc_tag[i] == 1'b1) begin
+      tag_d[i] = tag_d[i] + 1;
+    end
+
+    // Increment the Tag again if we have a second HS
+    if(inc_tag_pending_src[i] == 1'b1) begin
+      tag_d[i] = tag_d[i] + 1;
+    end
+  end
+end
+
+// Assign the output tag
+assign tag_o = tag_q;
+
+// buffer the tag
+`FF(tag_q, tag_d, '0, clk_i, rst_ni)
+
+/* ASSERTION Checks */
+endmodule
diff --git a/hw/floo_output_arbiter.sv b/hw/floo_output_arbiter.sv
index 5df88ab9..977a7010 100644
--- a/hw/floo_output_arbiter.sv
+++ b/hw/floo_output_arbiter.sv
@@ -3,46 +3,69 @@
 // SPDX-License-Identifier: SHL-0.51
 //
 // Chen Wu
+// Raphael Roth
+// Lorenzo Leone
+
+// The purpose of the slave ports is to merge data from ports which are not mapped in the "normal" way.
+// An example would be the output of the reduction logic!
+// These ports cannot be reduced!
`include "common_cells/assertions.svh" module floo_output_arbiter import floo_pkg::*; #( - /// Number of input ports + /// Number of total input ports parameter int unsigned NumRoutes = 1, + /// Number of paraellel reduction capable ports + parameter int unsigned NumParallelRedRoutes = 0, + /// Collective ops configuration + parameter collect_op_be_cfg_t CollectOpCfg = CollectiveSupportDefaultCfg, /// Type definitions - parameter type flit_t = logic, - parameter type payload_t = logic, - parameter payload_t NarrowRspMask = '0, - parameter payload_t WideRspMask = '0, - parameter type id_t = logic + parameter type flit_t = logic, + parameter type hdr_t = logic, + parameter type id_t = logic, + /// Do we support local loopback e.g. should the logic expect the local flit or not + parameter bit RdSupportLoopback = 1'b0, + /// AXI dependent parameter + parameter bit RdSupportAxi = 1'b1, + parameter axi_cfg_t AxiCfg = '0 ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, /// Current XY-coordinate of the router - input id_t xy_id_i, + input id_t xy_id_i, /// Input ports - input logic [NumRoutes-1:0] valid_i, - output logic [NumRoutes-1:0] ready_o, - input flit_t [NumRoutes-1:0] data_i, + input logic [NumRoutes-1:0] valid_i, + output logic [NumRoutes-1:0] ready_o, + input flit_t [NumRoutes-1:0] data_i, /// Output port - output logic valid_o, - input logic ready_i, - output flit_t data_o + output logic valid_o, + input logic ready_i, + output flit_t data_o ); - flit_t reduce_data_out, unicast_data_out; - logic [NumRoutes-1:0] reduce_valid_in, unicast_valid_in, reduce_ready_out, unicast_ready_out; - logic reduce_valid_out, unicast_valid_out, reduce_ready_in, unicast_ready_in; + flit_t reduce_data_out, unicast_data_out; + logic [NumRoutes-1:0] reduce_valid_in, unicast_valid_in; + logic [NumRoutes-1:0] reduce_ready_out, unicast_ready_out; + logic reduce_valid_out, unicast_valid_out; + logic reduce_ready_in, unicast_ready_in; + + logic 
[NumRoutes-1:0] reduce_mask; - logic [NumRoutes-1:0] reduce_mask; + localparam bit EnParallelReduction = (NumParallelRedRoutes > 1) ? 1'b1 : 1'b0; - // Determine which input ports are to be reduced - for (genvar i = 0; i < NumRoutes; i++) begin : gen_reduce_mask - assign reduce_mask[i] = (data_i[i].hdr.commtype == ParallelReduction); + // Determine which input ports are to be reduced in parallel + // ignore the local ports + always_comb begin: gen_reduce_mask + reduce_mask = '0; + if (EnParallelReduction) begin + for (int i = 0; i < NumParallelRedRoutes; i++) begin + reduce_mask[i] = (is_parallel_reduction_op(data_i[i].hdr.collective_op)); + end + end end - // Arbitrate unicasts + // Arbitrate unicasts and sequential reductions already computed by the offload unit assign unicast_valid_in = valid_i & ~reduce_mask; floo_wormhole_arbiter #( @@ -60,45 +83,76 @@ module floo_output_arbiter import floo_pkg::*; ); // Arbitrate reductions - assign reduce_valid_in = valid_i & reduce_mask; - - floo_reduction_arbiter #( - .NumRoutes ( NumRoutes ), - .flit_t ( flit_t ), - .payload_t ( payload_t ), - .id_t ( id_t ), - .NarrowRspMask ( NarrowRspMask ), - .WideRspMask ( WideRspMask ) - ) i_reduction_arbiter ( - .xy_id_i, - .data_i, - .valid_i ( reduce_valid_in ), - .ready_o ( reduce_ready_out ), - .valid_o ( reduce_valid_out ), - .ready_i ( reduce_ready_in ), - .data_o ( reduce_data_out ) - ); + if (EnParallelReduction) begin: gen_parallel_reduction + // Var to sparate Non-Slave ports if they have to go to the reduction arbiter! 
+  flit_t [NumParallelRedRoutes-1:0] parallel_red_data;
+  logic [NumParallelRedRoutes-1:0] parallel_red_valid;
+  logic [NumParallelRedRoutes-1:0] parallel_red_ready;
-  // Arbitrate between wormhole and reduction arbiter
-  // Reductions have higher priority than unicasts (index 0)
-  stream_arbiter #(
-    .N_INP (2),
-    .ARBITER("prio"),
-    .DATA_T (flit_t)
-  ) i_stream_arbiter (
-    .clk_i,
-    .rst_ni,
-    .inp_data_i ({unicast_data_out, reduce_data_out}),
-    .inp_valid_i({unicast_valid_out, reduce_valid_out}),
-    .inp_ready_o({unicast_ready_in, reduce_ready_in}),
-    .oup_data_o (data_o),
-    .oup_valid_o(valid_o),
-    .oup_ready_i(ready_i)
-  );
+    // Arbiter to be instantiated for reduction operations.
+    // Responses from a multicast request are also treated as reductions.
+    // TODO: fix these flags here - RdCfg... is used (mostly) in the offload
+    //       reduction rather than the parallel reduction - maybe we could make
+    //       another configuration?
+    assign reduce_valid_in = valid_i & reduce_mask;
+
+    // The reduction support only the "original" configuration of NumRoutes!
+ // Therefore NumRoutes port are connected into the reduction arbiter + assign parallel_red_data = data_i[NumParallelRedRoutes-1:0]; + assign parallel_red_valid = reduce_valid_in[NumParallelRedRoutes-1:0]; + assign reduce_ready_out[NumParallelRedRoutes-1:0] = parallel_red_ready; + if(NumRoutes > NumParallelRedRoutes) begin + assign reduce_ready_out[NumRoutes-1:NumParallelRedRoutes] = '0; + end + + floo_reduction_arbiter #( + .NumRoutes ( NumParallelRedRoutes ), + .CollectOpCfg ( CollectOpCfg ), + .flit_t ( flit_t ), + .hdr_t ( hdr_t ), + .id_t ( id_t ), + .RdSupportLoopback ( RdSupportLoopback ), + .RdSupportAxi ( RdSupportAxi ), + .AxiCfg ( AxiCfg ) + ) i_reduction_arbiter ( + .xy_id_i, + .data_i ( parallel_red_data ), + .valid_i ( parallel_red_valid ), + .ready_o ( parallel_red_ready ), + .valid_o ( reduce_valid_out ), + .ready_i ( reduce_ready_in ), + .data_o ( reduce_data_out ) + ); - assign ready_o = (reduce_valid_out)? reduce_ready_out : unicast_ready_out; + // Arbitrate between wormhole and reduction arbiter + // Reductions have higher priority than unicasts (index 0) + stream_arbiter #( + .N_INP (2), + .ARBITER("prio"), + .DATA_T (flit_t) + ) i_stream_arbiter ( + .clk_i, + .rst_ni, + .inp_data_i ({unicast_data_out, reduce_data_out}), + .inp_valid_i({unicast_valid_out, reduce_valid_out}), + .inp_ready_o({unicast_ready_in, reduce_ready_in}), + .oup_data_o (data_o), + .oup_valid_o(valid_o), + .oup_ready_i(ready_i) + ); + + assign ready_o = (reduce_valid_out)? 
reduce_ready_out : unicast_ready_out; + + end else begin : gen_no_parallel_reduction + assign data_o = unicast_data_out; + assign valid_o = unicast_valid_out; + assign unicast_ready_in = ready_i; + assign ready_o = unicast_ready_out; + end // Cannot have an output valid without at least one input valid `ASSERT(ValidOutInvalidIn, valid_o |-> |valid_i) + `ASSERT_INIT(InvalidNumParallelRedRoutes, !(NumParallelRedRoutes == 1), + "Number of parallel reduction routes cannot be 1") endmodule diff --git a/hw/floo_pkg.sv b/hw/floo_pkg.sv index cbb048a2..af5922fa 100644 --- a/hw/floo_pkg.sv +++ b/hw/floo_pkg.sv @@ -129,35 +129,76 @@ package floo_pkg; OffloadReduction = 2'd3 } collect_comm_e; - /// Different offloadable reduction - typedef enum logic [3:0] { - R_Select = 4'b0000, // Select the first incoming flit - F_Add = 4'b0100, // FP Addition - F_Mul = 4'b0101, // FP Multiplication - F_Min = 4'b0110, // FP Min - F_Max = 4'b0111, // FP Max - A_Add = 4'b1000, // Atomic Add (signed) - A_Mul = 4'b1001, // (Non-) Atomic (signed) - A_Min_S = 4'b1010, // Atomic Min (signed) - A_Min_U = 4'b1110, // Atomic Min (unsigned) - A_Max_S = 4'b1011, // Atomic Max (signed) - A_Max_U = 4'b1111 // Atomic Max (unsigned) - } reduction_offload_op_e; - - /// Different instantanous reduction - typedef enum logic [3:0] { - SelectAW = 4'b0000, // Select the first incoming flit - CollectB = 4'b0001, // Collect the B responses from an AXI transmission - LSBAnd = 4'b0010 // AND Connect the LSB of the payload (useful for barrier ops) - } reduction_parallel_op_e; + /// TODO(lleone): delet this portion of code + // /// The types of collective communication + // typedef enum logic [1:0] { + // /// Normal communication + // Unicast = 2'd0, + // /// Multicast communication + // Multicast = 2'd1, + // /// Parallel reduction operations + // ParallelReduction = 2'd2, + // /// Offload Reduction + // OffloadReduction = 2'd3 + // } collect_comm_e; + + // /// Different offloadable reduction + // typedef enum logic 
[3:0] { + // R_Select = 4'b0000, // Select the first incoming flit + // F_Add = 4'b0100, // FP Addition + // F_Mul = 4'b0101, // FP Multiplication + // F_Min = 4'b0110, // FP Min + // F_Max = 4'b0111, // FP Max + // A_Add = 4'b1000, // Atomic Add (signed) + // A_Mul = 4'b1001, // (Non-) Atomic (signed) + // A_Min_S = 4'b1010, // Atomic Min (signed) + // A_Min_U = 4'b1110, // Atomic Min (unsigned) + // A_Max_S = 4'b1011, // Atomic Max (signed) + // A_Max_U = 4'b1111 // Atomic Max (unsigned) + // } reduction_offload_op_e; + + // /// Different instantanous reduction + // typedef enum logic [3:0] { + // SelectAW = 4'b0000, // Select the first incoming flit + // CollectB = 4'b0001, // Collect the B responses from an AXI transmission + // LSBAnd = 4'b0010 // AND Connect the LSB of the payload (useful for barrier ops) + // } reduction_parallel_op_e; + /// Union for both Datatype(s) - because they need to have the same size for the chimney /// The chimney needs this information as it does not know if we support an offload reduction /// or an parallel reduction. - typedef union packed { - reduction_offload_op_e op_offload; - reduction_parallel_op_e op_parallel; - } reduction_op_t; + // typedef union packed { + // reduction_offload_op_e op_offload; + // reduction_parallel_op_e op_parallel; + // } reduction_op_t; + + /// List of supported collective operations in the NoC + /// These are "micro" collective operations. For example an AXI + /// multicast is split into a generic multicast + reduction + /// of teh B responses (CollectB). 
+ /// The internal micro operations must be in teh MSB to make sure + /// the user will never issue those + typedef enum logic [3:0] { + Unicast = 4'b0000, // Unicast operation + Multicast = 4'b0001, // Multicast communication + LSBAnd = 4'b0010, // AND Connect the LSB of the payload + F_Add = 4'b0011, // FP Addition + F_Mul = 4'b0100, // FP Multiplication + F_Min = 4'b0101, // FP Min + F_Max = 4'b0110, // FP Max + A_Add = 4'b0111, // Atomic Add (signed) + A_Mul = 4'b1000, // (Non-) Atomic (signed) + A_Min_S = 4'b1001, // Atomic Min (signed) + A_Min_U = 4'b1010, // Atomic Min (unsigned) + A_Max_S = 4'b1011, // Atomic Max (signed) + A_Max_U = 4'b1100, // Atomic Max (unsigned) + SelectAW = 4'b1101, // Select first incoming AW flit + CollectB = 4'b1110, // Collect B responses for AXI transmisison + // TODO(lleone): Remove this operation and chenag the offload logic to make + // handle the selectAW from the parallel reduction harware + SeqAW = 4'b1111 // Select the first incoming flit from a sequential reduction + } collect_op_e; /// The types of AXI channels in narrow-wide AXI network interfaces typedef enum logic [3:0] { @@ -198,6 +239,108 @@ package floo_pkg; int unsigned OutIdWidth; } axi_cfg_t; + /// Collective macro operations to support in the NoC + /// In this context collective operations are macro + /// operations, i.e. multicast, reduction etc... + /// The user does not have to care about the hidden + /// transfers required to implement these macro collective. 
+ /// This is the type the user can set [Frontend] + typedef struct packed { + /// Enable multicast transcation support on the narrow router + bit EnNarrowMulticast; + /// Enable multicast transcation support on the wide router + bit EnWideMulticast; + /// Enable LSB and operation support + bit EnLSBAnd; + /// Enable FP addition support + bit EnF_Add; + /// Enable FP multiplier support + bit EnF_Mul; + /// Enable FP minimum calculation support + bit EnF_Min; + /// Enable FP maximum calculationn support + bit EnF_Max; + /// Enable INT addition support + bit EnA_Add; + /// Enable INT multiplier support + bit EnA_Mul; + /// Enable INT signed minimum calculation support + bit EnA_Min_S; + /// Enable INT unsigned minimum calculation support + bit EnA_Min_U; + /// Enable INT signed maximum calculation support + bit EnA_Max_S; + /// Enable INT unsigned maximum calculation support + bit EnA_Max_U; + } collect_op_fe_cfg_t; + + /// Collective micro operations to support in the NoC + /// This flags can be used at the FlooNoC level to enable/disable + /// features like the support for collectB, or selectAW etc... to + /// maximize the granularity of the hardware configuration. + /// For instance a system featuring FlooNoC with multicast support + /// needs partial support for parallel reduction, EnCollectB = true + /// but EnLSBAnd = false. 
This level of granularity is hidden to the user, + /// and it's used internally by the NoC [Backend] + typedef struct packed { + bit EnMulticast;// Multicast communication + bit EnLSBAnd; // AND Connect the LSB of the payload + bit EnF_Add; // FP Addition + bit EnF_Mul; // FP Multiplication + bit EnF_Min; // FP Min + bit EnF_Max; // FP Max + bit EnA_Add; // Atomic Add (signed) + bit EnA_Mul; // (Non-) Atomic (signed) + bit EnA_Min_S; // Atomic Min (signed) + bit EnA_Min_U; // Atomic Min (unsigned) + bit EnA_Max_S; // Atomic Max (signed) + bit EnA_Max_U; // Atomic Max (unsigned) + bit EnSelectAW; // Select first incoming AW flit + bit EnCollectB; // Collect B responses for AXI transmisison + } collect_op_be_cfg_t; + + typedef logic [3:0] collect_op_t; + + /// Controller configuration + typedef enum logic [1:0] { + /// Simple configuration + ControllerSimple = 2'd0, + /// Stalling configuration + ControllerStalling = 2'd1, + /// Generic configuration + ControllerGeneric = 2'd2 + } floo_red_controller_e; + + /// Configuration for the offload reduction logic + typedef struct packed { + /// configuration for the controller + floo_red_controller_e RdControllConf; + /// input fifo configuration + bit RdFifoFallThrough; + int unsigned RdFifoDepth; + /// pipeline depth of the offload unit + int unsigned RdPipelineDepth; + /// partial buffer size + int unsigned RdPartialBufferSize; + /// required tag bit if generic controller is used + int unsigned RdTagBits; + /// is the underlying protocl AXI + bit RdSupportAxi; + /// enable the bypass (required for AXI-AW) + bit RdEnableBypass; + /// support loopback for the local link - collective will + /// be forwarded to the local port too. 
+ bit RdSupportLoopback; + /// Cut offload interface + bit CutOffloadIntf; + } reduction_cfg_t; + + /// Configuration to specify how extensive collective support is enabled + typedef struct packed { + collect_op_fe_cfg_t OpCfg; + reduction_cfg_t RedCfg; + } collective_cfg_t; + /// Configuration to pass routing information to the routers /// as well as network interfaces typedef struct packed { @@ -221,14 +364,8 @@ package floo_pkg; /// The number of routes for every routing table, /// Only used if `RouteAlgo == SourceRouting` int unsigned NumRoutes; - /// Whether to enable the multicast feature in the NoC - bit EnMultiCast; - /// Whether to use the parallel reduction on the narrow req link - bit EnParallelReduction; - /// Whether to use the offload reduction on the narrow req link - bit EnNarrowOffloadReduction; - /// Whether to use the offload reduction on the wide link - bit EnWideOffloadReduction; + /// Configuration to support collective operations + collective_cfg_t CollectiveCfg; } route_cfg_t; /// Configuration for the network interface (chimney) @@ -268,6 +405,61 @@ package floo_pkg; bit CutRsp; } chimney_cfg_t; + /// Default macro collective operations supported in the NoC - all disabled + localparam collect_op_fe_cfg_t CollectiveOpDefaultCfg = '{ + EnNarrowMulticast : 1'b0, + EnWideMulticast : 1'b0, + EnLSBAnd : 1'b0, + EnF_Add : 1'b0, + EnF_Mul : 1'b0, + EnF_Min : 1'b0, + EnF_Max : 1'b0, + EnA_Add : 1'b0, + EnA_Mul : 1'b0, + EnA_Min_S : 1'b0, + EnA_Min_U : 1'b0, + EnA_Max_S : 1'b0, + EnA_Max_U : 1'b0 + }; + + /// Default micro collective operations supported in the NoC - all disabled + localparam collect_op_be_cfg_t CollectiveSupportDefaultCfg = '{ + EnMulticast : 1'b0, + EnLSBAnd : 1'b0, + EnF_Add : 1'b0, + EnF_Mul : 1'b0, + EnF_Min : 1'b0, + EnF_Max : 1'b0, + EnA_Add : 1'b0, + EnA_Mul : 1'b0, + EnA_Min_S : 1'b0, + EnA_Min_U : 1'b0, + EnA_Max_S : 1'b0, + EnA_Max_U : 1'b0, + EnSelectAW : 1'b0, + EnCollectB : 1'b0 + }; + + /// The default configuration 
for the offload reduction unit + localparam reduction_cfg_t ReductionDefaultCfg = '{ + RdControllConf: ControllerGeneric, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 2, + RdPipelineDepth: 5, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + /// The default configuration for collective operations + localparam collective_cfg_t CollectiveDefaultCfg = '{ + OpCfg: CollectiveOpDefaultCfg, + RedCfg: ReductionDefaultCfg + }; + /// The default configuration for the network interface localparam chimney_cfg_t ChimneyDefaultCfg = '{ EnSbrPort: 1'b1, @@ -292,10 +484,7 @@ package floo_pkg; IdAddrOffset: 0, NumSamRules: 0, NumRoutes: 0, - EnMultiCast: 1'b0, - EnParallelReduction: 1'b0, - EnNarrowOffloadReduction: 1'b0, - EnWideOffloadReduction: 1'b0 + CollectiveCfg: CollectiveDefaultCfg }; /// The AXI channel to link mapping in a single-AXI network interface @@ -431,4 +620,112 @@ package floo_pkg; get_nw_chan_width(cfg_n, cfg_w, ch); endfunction + + /********************************************************** + * Collective Communication Support * + **********************************************************/ + /* These functions help to abstract the complexity of the NoC and the + / implementation schemes for collective communication. + / The user is responsible to declare only which macro level collective + / operations are supported in the NoC (collect_op_fe_cfg_t). + / The NoC implementation will then derive which type of hardware support + / is required (e.g. multicast, collectB, selectAW, etc...). This info + / is implementation specific and must be transparent to the user. 
+ */ + ///--------------------------------------------------------- + /// Helper functions to calculate which macro operations are supported + /// and which type of hardware support is required + + /// Calculates if the NoC needs support for Narrow parallel reduction + function automatic bit is_en_parallel_reduction(collect_op_fe_cfg_t cfg); + return (cfg.EnLSBAnd); + endfunction + + /// Calculates if the NoC needs support for Narrow sequential reduction + function automatic bit is_en_narrow_seq_reduction(collect_op_fe_cfg_t cfg); + return (cfg.EnA_Add | cfg.EnA_Mul | cfg.EnA_Min_S | + cfg.EnA_Min_U | cfg.EnA_Max_S | cfg.EnA_Max_U + ); + endfunction + + /// Calculates if the NoC needs support for Narrow Sequential reduction + function automatic bit is_en_narrow_reduction(collect_op_fe_cfg_t cfg); + return (is_en_narrow_seq_reduction(cfg) | is_en_parallel_reduction(cfg) + ); + endfunction + + /// Calculates if the NoC needs support for Wide Sequential reduction + /// there is no need to separate between parallel and sequential for the + /// wide because only wide sequential is supported + function automatic bit is_en_wide_reduction(collect_op_fe_cfg_t cfg); + return (cfg.EnF_Add | cfg.EnF_Mul | + cfg.EnF_Min | cfg.EnF_Max + ); + endfunction + + /// Calculate if narrow collective support is enabled + function automatic bit is_en_narrow_collective(collect_op_fe_cfg_t cfg); + return (cfg.EnNarrowMulticast | is_en_narrow_reduction(cfg)); + endfunction + + /// Calculate if wide collective support is enabled + function automatic bit is_en_wide_collective(collect_op_fe_cfg_t cfg); + return (cfg.EnWideMulticast | is_en_wide_reduction(cfg)); + endfunction + + /// Calculate if there is need for collective support + function automatic bit is_en_collective(collect_op_fe_cfg_t cfg); + return (is_en_wide_collective(cfg) | is_en_narrow_collective(cfg)); + endfunction + + ///--------------------------------------------------------- + /// Helper functions to calculate which micro 
transaction are supported + /// and which type of hardware support is required + function automatic bit en_sequential_support(collect_op_be_cfg_t cfg); + return (cfg.EnF_Add | cfg.EnF_Mul | cfg.EnF_Min | cfg.EnF_Max | + cfg.EnA_Add | cfg.EnA_Mul | cfg.EnA_Min_S | cfg.EnA_Min_U | + cfg.EnA_Max_S | cfg.EnA_Max_U + ); + endfunction + + function automatic bit en_parallel_support(collect_op_be_cfg_t cfg); + return (cfg.EnLSBAnd | cfg.EnCollectB | cfg.EnSelectAW); + endfunction + + function automatic bit en_multicast_support(collect_op_be_cfg_t cfg); + return (cfg.EnMulticast); + endfunction + + ///--------------------------------------------------------- + /// Helper functions to translate internal opcodes in macro transactions + /// Evaluate if the incoming operation is a multicast operation + function automatic bit is_multicast_op(collect_op_e op); + return (op == Multicast); + endfunction + + /// Evaluate if the incoming operation is a reduction operation + function automatic bit is_reduction_op(collect_op_e op); + case (op) + F_Add, F_Mul, F_Min, F_Max, LSBAnd, SelectAW, SeqAW, + A_Add, A_Mul, A_Min_S, A_Min_U, A_Max_S, + A_Max_U: return 1'b1; + default: return 1'b0; + endcase + endfunction + + /// Evaluate if the incoming operation is a parallel reduction + function automatic bit is_parallel_reduction_op(collect_op_e op); + return (op == LSBAnd | op == CollectB | op == SelectAW); + endfunction + + /// Evaluate if the incoming operation is a sequential reduction + function automatic bit is_sequential_reduction_op(collect_op_e op); + case (op) + F_Add, F_Mul, F_Min, F_Max, SeqAW, + A_Add, A_Mul, A_Min_S, A_Min_U, A_Max_S, + A_Max_U: return 1'b1; + default: return 1'b0; + endcase + endfunction + endpackage diff --git a/hw/floo_reduction_arbiter.sv b/hw/floo_reduction_arbiter.sv index 3bd11dfe..683d9077 100644 --- a/hw/floo_reduction_arbiter.sv +++ b/hw/floo_reduction_arbiter.sv @@ -3,19 +3,27 @@ // SPDX-License-Identifier: SHL-0.51 // // Author: Chen Wu +// Raphael 
Roth
+
+`include "axi/typedef.svh"
+`include "floo_noc/typedef.svh"
 
 module floo_reduction_arbiter import floo_pkg::*; #(
   /// Number of input ports
-  parameter int unsigned NumRoutes = 1,
+  parameter int unsigned NumRoutes       = 1,
+  /// Collective ops configuration
+  parameter collect_op_be_cfg_t CollectOpCfg = CollectiveSupportDefaultCfg,
   /// Type definitions
-  parameter type flit_t = logic,
-  parameter type payload_t = logic,
-  // Masks used to select which bits of the payload are part of the response,
-  // allowing extraction of relevant bits and detection of any participant errors.
-  parameter payload_t NarrowRspMask = '0,
-  parameter payload_t WideRspMask = '0,
-  parameter type id_t = logic
+  parameter type flit_t = logic,
+  parameter type hdr_t = logic,
+  parameter type id_t = logic,
+  /// Do we support local loopback e.g. should the logic expect the local flit or not
+  parameter bit RdSupportLoopback = 1'b0,
+  /// AXI dependent parameter for collective support
+  /// When performing collective, data bits need to be extracted from the payload
+  parameter bit RdSupportAxi = 1'b1,
+  parameter axi_cfg_t AxiCfg = '0
 ) (
   /// Current XY-coordinate of the router
   input  id_t xy_id_i,
@@ -29,66 +37,180 @@ module floo_reduction_arbiter import floo_pkg::*;
   output flit_t  data_o
 );
 
+  `FLOO_TYPEDEF_AXI_FROM_CFG(axi, AxiCfg)
+  `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_in, AxiCfg, hdr_t)
+
+  // We calculate the different reduction in parallel and select the result at the output
+  flit_t data_forward_flit;
+  flit_t data_collectB;
+  flit_t data_LSBAnd;
+
+  collect_op_e incoming_red_op;
+
+  // Logic bit to connect all LSB together
+  logic lsb;
+  logic [1:0] resp;
+
   // calculated expected input source lists for each input flit
-  logic [NumRoutes-1:0] in_route_mask;
+  logic [NumRoutes-1:0][NumRoutes-1:0] in_route_mask;
+  logic [NumRoutes-1:0] red_valid_in;
+  logic [NumRoutes-1:0][NumRoutes-1:0] ready_out;
 
   typedef logic [cf_math_pkg::idx_width(NumRoutes)-1:0] arb_idx_t;
   arb_idx_t 
input_sel;
 
-  // Use a leading zero counter to find the first valid input to reduce
+  // TODO (lleone): The handshake between the input and output should be implemented with
+  // a stream fork. The one in common cell is not suitable for this condition because
+  // it connects the input stream to ALL of output ports. We would need a stream fork
+  // that connects the input stream to ANY of the output ports.
+  assign ready_o = ready_out[input_sel];
+  for (genvar i = 0; i < NumRoutes; i++) begin : gen_invalid_data
+    // Compute list of possible input sources for each input port
+    // This module determines which inputs are expected to participate in the reduction
+    floo_route_xymask #(
+      .NumRoutes ( NumRoutes ),
+      .flit_t    ( flit_t    ),
+      .id_t      ( id_t      ),
+      .FwdMode   ( 0         )  // We enable the backward mode for reduction
+    ) i_route_xymask (
+      .channel_i    ( data_i[i]        ),
+      .xy_id_i      ( xy_id_i          ),
+      .route_sel_o  ( in_route_mask[i] )
+    );
+
+    floo_reduction_sync #(
+      .NumRoutes          ( NumRoutes          ),
+      .RdSupportLoopback  ( RdSupportLoopback  ),
+      .arb_idx_t          ( arb_idx_t          ),
+      .flit_t             ( flit_t             ),
+      .id_t               ( id_t               )
+    ) i_reduction_sync (
+      .sel_i            ( arb_idx_t'(i)    ),
+      .data_i           ( data_i           ),
+      .valid_i          ( valid_i          ),
+      .ready_o          ( ready_out[i]     ),
+      .xy_id_i          ( xy_id_i          ),
+      .valid_o          ( red_valid_in[i]  ),
+      .ready_i          ( ready_i          ),
+      .in_route_mask_i  ( in_route_mask[i] )
+    );
+  end
+
+  // Use a leading zero counter to find the first valid reduction input
   lzc #(
     .WIDTH(NumRoutes)
   ) i_lzc (
-    .in_i    ( valid_i   ),
-    .cnt_o   ( input_sel ),
+    .in_i    ( red_valid_in ),
+    .cnt_o   ( input_sel    ),
     .empty_o ()
   );
 
-  floo_reduction_sync #(
-    .NumRoutes  ( NumRoutes ),
-    .arb_idx_t  ( arb_idx_t ),
-    .flit_t     ( flit_t    ),
-    .id_t       ( id_t      )
-  ) i_reduction_sync (
-    .sel_i            ( input_sel     ),
-    .data_i           ( data_i        ),
-    .valid_i          ( valid_i       ),
-    .xy_id_i          ( xy_id_i       ),
-    .valid_o          ( valid_o       ),
-    .in_route_mask_o  ( in_route_mask )
-  );
-
-  payload_t ReduceMask;
-  assign ReduceMask = data_i[input_sel].hdr.axi_ch==NarrowB? 
NarrowRspMask : WideRspMask; + // Select the incoming reduction operation + assign incoming_red_op = data_i[input_sel].hdr.collective_op; - logic [1:0] resp; + // ---------------------------- + // Reduction op implementations + // ---------------------------- - // Reduction operation + // TODO(lleone): Guard with a Cfg parameter that tells you which are the supported operations + // Collect B response operation always_comb begin : gen_reduced_B - data_o = data_i[input_sel]; + data_collectB = data_i[input_sel]; resp = '0; // We check every input port from which we expect a response for (int i = 0; i < NumRoutes; i++) begin - if(in_route_mask[i]) begin + if(in_route_mask[input_sel][i]) begin // Select only the bits of the payload that are part of the response // and check if at least one of the participants sent an error. - automatic int j = 0; - for (int k = 0; k < $bits(ReduceMask); k++) begin - if (ReduceMask[k]) begin - resp[j] = data_i[i].payload[k]; - j++; - end - end - // If one of the responses is an error, we return an error - // otherwise we return the first response + resp = extractAxiBResp(data_i[i]); if(resp == axi_pkg::RESP_SLVERR) begin - data_o = data_i[i]; + data_collectB = data_i[i]; break; end end end end - assign ready_o = (ready_i & valid_o)? 
valid_i & in_route_mask : '0; + // Forward flits directly - Just choose to forward the selected one + always_comb begin : gen_forward + data_forward_flit = '0; + if (CollectOpCfg.EnLSBAnd) data_forward_flit = data_i[input_sel]; + end + + // And all the LSB + always_comb begin : gen_and_lsb + data_LSBAnd = '0; + if (CollectOpCfg.EnLSBAnd) begin + data_LSBAnd = data_i[input_sel]; + lsb = 1'b1; + + // We check every input port from which we expect a response + for (int i = 0; i < NumRoutes; i++) begin + if(in_route_mask[input_sel][i]) begin + // Extract the last bit from the data + if(RdSupportAxi) begin + axi_data_t axi_w_data; + axi_w_data = extractAxiWData(data_i[i]); + lsb &= axi_w_data[0]; + end + end + end + + // Assign the bit again + if(RdSupportAxi) begin + data_LSBAnd = insertAxiWlsb(data_LSBAnd, lsb); + end + end + end + + // Select which parallel operation to output + always_comb begin + // Assign inital value + data_o = '0; + case ({incoming_red_op, 1'b1}) + {SelectAW, CollectOpCfg.EnLSBAnd}: data_o = data_forward_flit; + {LSBAnd, CollectOpCfg.EnLSBAnd}: data_o = data_LSBAnd; + {CollectB, 1'b1}: data_o = data_collectB; + default:; + endcase + end + + // Connect the valid and ready signals + assign valid_o = red_valid_in[input_sel]; + // TODO (lleone): Delete this line + // assign ready_o = (ready_i & valid_o) ? valid_i & in_route_mask[input_sel] : '0; + + // ----------------------------- + // AXI Specific Helper functions + // ----------------------------- + + //TODO(lleone): Move those functions into floo_pkg + // Insert data into AXI specific W frame! + function automatic flit_t insertAxiWlsb(flit_t metadata, logic data); + floo_axi_w_flit_t w_flit; + // Parse the entire flit + w_flit = floo_axi_w_flit_t'(metadata); + // Copy the new data + w_flit.payload.data[0] = data; + return flit_t'(w_flit); + endfunction + + // Extract data from AXI specific W frame! 
+ function automatic axi_data_t extractAxiWData(flit_t metadata); + floo_axi_w_flit_t w_flit; + // Parse the entire flit + w_flit = floo_axi_w_flit_t'(metadata); + // Return the W data + return w_flit.payload.data; + endfunction + + // Extract B response from AXI specific B frame! + function automatic axi_pkg::resp_t extractAxiBResp(flit_t metadata); + floo_axi_b_flit_t b_flit; + // Parse the entire flit + b_flit = floo_axi_b_flit_t'(metadata); + // Return the B response + return b_flit.payload.resp; + endfunction endmodule diff --git a/hw/floo_reduction_sync.sv b/hw/floo_reduction_sync.sv index 0d7e2e5e..836803c8 100644 --- a/hw/floo_reduction_sync.sv +++ b/hw/floo_reduction_sync.sv @@ -3,11 +3,14 @@ // SPDX-License-Identifier: SHL-0.51 // // Author: Chen Wu +// Raphael Roth module floo_reduction_sync import floo_pkg::*; #( /// Number of input ports parameter int unsigned NumRoutes = 1, + /// Do we support local loopback e.g. should the logic expect the local flit or not + parameter bit RdSupportLoopback = 1'b0, /// Type definitions parameter type arb_idx_t = logic, parameter type flit_t = logic, @@ -16,43 +19,47 @@ module floo_reduction_sync import floo_pkg::*; input arb_idx_t sel_i, input flit_t [NumRoutes-1:0] data_i, input logic [NumRoutes-1:0] valid_i, + output logic [NumRoutes-1:0] ready_o, input id_t xy_id_i, output logic valid_o, - output logic [NumRoutes-1:0] in_route_mask_o + input logic ready_i, + input logic [NumRoutes-1:0] in_route_mask_i ); - logic [NumRoutes-1:0] compare_same, same_and_valid; - logic all_reduction_srcs_valid; - - // Compute the input mask based on the selected input port's destination and mask fields. - // This determines which input ports are expected to participate in the reduction. 
- floo_route_xymask #( - .NumRoutes ( NumRoutes ), - .flit_t ( flit_t ), - .id_t ( id_t ), - .FwdMode ( 0 ) // We enable the backward mode for reduction - ) i_route_xymask ( - .channel_i ( data_i[sel_i] ), - .xy_id_i ( xy_id_i ), - .route_sel_o ( in_route_mask_o ) - ); + logic [NumRoutes-1:0] filtered_valid_in, filtered_local; + + + logic [NumRoutes-1:0] filtered_route_mask; + // The incoming mask is combinatorial. The valid is used to make sure the mask used in the following logic + // is actually from a valid flit. + assign filtered_route_mask = in_route_mask_i & {NumRoutes{valid_i[sel_i]}}; - for (genvar in = 0; in < NumRoutes; in++) begin : gen_routes - // Compare whether the `mask` and `dst_id` are equal to the selected input port - assign compare_same[in] = ((data_i[in].hdr.mask == data_i[sel_i].hdr.mask) && - (data_i[in].hdr.dst_id == data_i[sel_i].hdr.dst_id)); - // Determine if this input should be considered valid for the reduction: - // If we are at the dst node and the port is the local one, we don’t wait for a - // response/reduction since it will stay locally [NoLoopBack]. - assign same_and_valid[in] = (data_i[sel_i].hdr.dst_id == xy_id_i && in == Eject) || - (compare_same[in] & valid_i[in]); + // Filter valids from the expected input sources. If the collective targets + // the local node and loopback is unsupported, also mark the local port as valid + // so the flit can reach the endpoint and avoid deadlock. 
+  for (genvar in = 0; in < NumRoutes; in++) begin : gen_valid
+    // Only valid from same reduction streams are propagated
+    assign filtered_valid_in[in] = valid_i[in] && (data_i[in].hdr.dst_id == data_i[sel_i].hdr.dst_id) &&
+                                   (data_i[in].hdr.collective_mask == data_i[sel_i].hdr.collective_mask);
+
+    // Mask local port if loopback is not supported
+    if (!RdSupportLoopback) begin
+      assign filtered_local[in] = filtered_valid_in[in] ||
+                                  (data_i[sel_i].hdr.dst_id == xy_id_i && in == Eject);
+    end else begin
+      assign filtered_local[in] = filtered_valid_in[in];
+    end
   end
 
-  // Reduction is valid only if all expected inputs [in_route_mask_o] are valid.
-  // Inputs not involved in the reduction are ignored [~(in_route_mask_o)].
-  assign all_reduction_srcs_valid = &(same_and_valid | ~in_route_mask_o);
+  stream_join_dynamic #(
+    .N_INP ( NumRoutes )
+  ) i_stream_join_dynamic (
+    .inp_valid_i  ( filtered_local      ),
+    .inp_ready_o  ( ready_o             ),
+    .sel_i        ( filtered_route_mask ),
+    .oup_valid_o  ( valid_o             ),
+    .oup_ready_i  ( ready_i             )
+  );
 
-  // To have a valid output at least one input must be valid.
-  assign valid_o = (in_route_mask_o == '0)? 1'b0 : (|valid_i & all_reduction_srcs_valid);
 endmodule
diff --git a/hw/floo_reduction_unit.sv b/hw/floo_reduction_unit.sv
new file mode 100644
index 00000000..ee65ca86
--- /dev/null
+++ b/hw/floo_reduction_unit.sv
@@ -0,0 +1,331 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Lorenzo Leone
+//
+// This module is used to handle arithmetic reduction streams that need to be offloaded
+// to a functional unit. It selects the first two valid inputs and issues them to the FU.
+// It then takes care of forwarding the incoming result back to the correct output.
+//
+// Limitations:
+// This module is not AXI agnostic, it works only with AXI compliant data streams. 
+// The protocol information is necessary to extract the data bits from the payload. + +`include "common_cells/assertions.svh" +`include "common_cells/registers.svh" +`include "floo_noc/typedef.svh" +`include "axi/typedef.svh" + + +module floo_reduction_unit + import floo_pkg::*; + #( + parameter int unsigned NumInputs = 0, + parameter int unsigned NumOutputs = 0, + parameter type flit_t = logic, + parameter type hdr_t = logic, + parameter type id_t = logic, + parameter type reduction_data_t = logic, + /// Parameters for the reduction configuration + parameter reduction_cfg_t RedCfg = '0, + /// Axi Configuration + parameter floo_pkg::axi_cfg_t AxiCfg = '0 + )( + input logic clk_i, + input logic rst_ni, + input id_t xy_id_i, + input logic [NumInputs-1:0] valid_i, + output logic [NumInputs-1:0] ready_o, + input flit_t [NumInputs-1:0] data_i, + output logic [NumOutputs-1:0] valid_o, + input logic [NumOutputs-1:0] ready_i, + output flit_t [NumOutputs-1:0] data_o, + /// One-hot mask to route result to the output + input logic [NumInputs-1:0][NumOutputs-1:0] routed_out_mask_i, + /// One-hot mask to indicate expected inputs + input logic [NumInputs-1:0][NumInputs-1:0] in_mask_i, + output logic operands_valid_o, + input logic operands_ready_i, + output reduction_data_t operand1_o, + output reduction_data_t operand2_o, + output collect_op_e operation_o, + input logic result_valid_i, + output logic result_ready_o, + input reduction_data_t result_i + ); + + `FLOO_TYPEDEF_AXI_FROM_CFG(axi, AxiCfg) + `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_in, AxiCfg, hdr_t) + + typedef logic [cf_math_pkg::idx_width(NumInputs)-1:0] input_sel_t; + typedef logic [cf_math_pkg::idx_width(NumOutputs)-1:0] out_select_t; + + typedef struct packed { + collect_op_e op; + reduction_data_t operand1; + reduction_data_t operand2; + } red_intsr_t; + + // Select signals for the input data + input_sel_t operand1_sel; + input_sel_t operand2_sel; + collect_op_e incoming_op; + + logic [NumInputs-1:0] 
mask_operand1; + logic [NumInputs-1:0] mask_operand2; + + // Selected flit. Still generic floo flit type + flit_t operand1_flit, operand2_flit; + + // Signals towards multiple functional units + logic operands_valid_out; + logic operands_ready_in; + + // Signals towards the offload interface + logic offload_operands_valid_out; + logic offload_operands_ready_in; + red_intsr_t instr_out, instr_out_cut; + floo_axi_w_flit_t w_flit_operand1, w_flit_operand2; + floo_axi_w_flit_t w_flit_result; + + // Signals towards selectAW unit + logic aw_valid_out; + logic aw_ready_in; + flit_t aw_out; + + // Signals from the response offload interface + reduction_data_t result_data_in; + logic result_valid_in; + logic result_ready_out; + flit_t result_flit_in; + + // Output flit after the mux + flit_t result_flit_out; + logic result_flit_valid_out; + logic result_flit_ready_in; + logic result_mux_sel; + + // Metadata for latency tolerant controller + flit_t metadata_flit_out; + logic [NumOutputs-1:0] metadata_route_out_dir; + out_select_t out_select; + + ///------------------------/// + /// Outgoing operands flow /// + ///------------------------/// + + // Leading zero counter to chose the first valid operand + lzc #( + .WIDTH(NumInputs) + ) i_lzc_opn1 ( + .in_i ( valid_i ), + .cnt_o ( operand1_sel ), + .empty_o ( ) + ); + + floo_reduction_sync #( + .NumRoutes ( NumInputs ), + .RdSupportLoopback ( RedCfg.RdSupportLoopback ), + .arb_idx_t ( input_sel_t ), + .flit_t ( flit_t ), + .id_t ( id_t ) + ) i_reduction_sync ( + .sel_i ( operand1_sel ), + .data_i ( data_i ), + .valid_i ( valid_i ), + .ready_o ( ready_o ), + .xy_id_i ( xy_id_i ), + .in_route_mask_i ( in_mask_i[operand1_sel] ), + .valid_o ( operands_valid_out ), + .ready_i ( operands_ready_in ) + ); + + // The first operand is always the one selected from the lzc module + assign operand1_flit = data_i[operand1_sel]; + assign incoming_op = data_i[operand1_sel].hdr.collective_op; + + + assign mask_operand1 = {NumInputs'(1)} << 
operand1_sel; + assign mask_operand2 = in_mask_i[operand1_sel] & ~mask_operand1; + // This zero counter is used to select the second operand looking at the input mask + lzc #( + .WIDTH(NumInputs) + ) i_lzc_opn2 ( + .in_i ( mask_operand2 ), + .cnt_o ( operand2_sel ), + .empty_o ( ) + ); + + assign operand2_flit = data_i[operand2_sel]; + + // Stream demux to arbitrate between different functional units: + // - Output 1: Offload unit + // - Output 2: SelectAW unit + stream_demux #( + .N_OUP ( 2 ) + ) i_operands_demux ( + .inp_valid_i ( operands_valid_out ), + .inp_ready_o ( operands_ready_in ), + .oup_sel_i ( incoming_op == SeqAW ), + .oup_valid_o ( {aw_valid_out, offload_operands_valid_out} ), + .oup_ready_i ( {aw_ready_in, offload_operands_ready_in} ) + ); + + + assign w_flit_operand1 = floo_axi_w_flit_t'(operand1_flit); + assign w_flit_operand2 = floo_axi_w_flit_t'(operand2_flit); + + assign instr_out.operand1 = w_flit_operand1.payload.data; + assign instr_out.operand2 = w_flit_operand2.payload.data; + assign instr_out.op = operand2_flit.hdr.collective_op; + + // For the select AW we don't need any operations except for assigning one of + assign aw_out = operand1_flit; + + // Store incoming hdr + payload info for the response path + // The data bits are useless since the result coming from the + // functional unit will be the actual data. For this reason we hardcode the data to 0 + // at the fifo input to make sure that those FFs are then optimized away. + + // To avoid combinational loop, the push of the fifo must not depend on the ready + // because the latter depends from the output of the fifo itself that is combinatorial + // in case of FALL_THROUGH. 
+ + logic already_pushed_q; + logic valid_operand_handshake; + + assign valid_operand_handshake = operands_valid_out & operands_ready_in; + + `FFLARNC(already_pushed_q, 1'b1, operands_valid_out && (~already_pushed_q), + valid_operand_handshake, 1'b0, clk_i, rst_ni) + fifo_v3 #( + .FALL_THROUGH (1'b1), + .dtype (flit_t), + .DEPTH (RedCfg.RdPipelineDepth+2) + ) i_fifo_flit ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (1'b0), + .testmode_i (1'b0), + .full_o (), + .empty_o (), + .usage_o (), + .data_i (operand1_flit), // store the flit of the first operand + .push_i (operands_valid_out & (~already_pushed_q)), // push when handshake on the input operands + .data_o (metadata_flit_out), + .pop_i (result_flit_valid_out & result_flit_ready_in) // pop mask when handshake on the result + ); + // Fifo to store the output direction of the element during the FPU reduction + fifo_v3 #( + .FALL_THROUGH (1'b1), + .DATA_WIDTH (NumInputs), + .DEPTH (RedCfg.RdPipelineDepth+2) + ) i_fifo_route_dir ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (1'b0), + .testmode_i (1'b0), + .full_o (), + .empty_o (), // Not needed, this fifo is always sinc with the flit one + .usage_o (), + .data_i (routed_out_mask_i[operand1_sel]), // store the route out of the first operand + .push_i (operands_valid_out & (~already_pushed_q)), // push when handshake on the input operands + .data_o (metadata_route_out_dir), + .pop_i (result_flit_valid_out & result_flit_ready_in) // pop mask when handshake on the result + ); + + // TODO (lleone): Create a REQ/RSP struct for the following interface + // and replace all the spill registers with just one for REQ and one for RSP + spill_register #( + .T (red_intsr_t), + .Bypass (!RedCfg.CutOffloadIntf) + ) i_offload_cut_req ( + .clk_i, + .rst_ni, + .data_i (instr_out), + .valid_i (offload_operands_valid_out), + .ready_o (offload_operands_ready_in), + .data_o (instr_out_cut), + .valid_o (operands_valid_o), + .ready_i (operands_ready_i) + ); + + // TODO(lleone): 
When uniforming the offload interface, get rid of this part, isnce the cur will be of the type of the interface + assign operation_o = instr_out_cut.op; + assign operand1_o = instr_out_cut.operand1; + assign operand2_o = instr_out_cut.operand2; + + + ///-------------------------/// + /// Incoming responses flow /// + ///-------------------------/// + + spill_register #( + .T (reduction_data_t), + .Bypass (!RedCfg.CutOffloadIntf) + ) i_offload_cut_rsp ( + .clk_i, + .rst_ni, + .data_i (result_i), + .valid_i (result_valid_i), + .ready_o (result_ready_o), + .data_o (result_data_in), + .valid_o (result_valid_in), + .ready_i (result_ready_out) + ); + + // TODO(lleone): Make sure this logic is actually optimized away in PnR + // Apply the result from the offload unit to the stored flit + always_comb begin: gen_result_flit + w_flit_result = floo_axi_w_flit_t'(metadata_flit_out); + w_flit_result.payload.data = result_data_in; + end + + assign result_flit_in = flit_t'(w_flit_result); + + + ///-------------------------/// + /// Output responses flow /// + ///-------------------------/// + + assign result_mux_sel = metadata_flit_out.hdr.collective_op == SeqAW; + stream_mux #( + .DATA_T ( flit_t ), + .N_INP ( 2 ) + ) i_result_mux ( + .inp_data_i ( {aw_out, result_flit_in} ), + .inp_valid_i ( {aw_valid_out, result_valid_in} ), + .inp_ready_o ( {aw_ready_in, result_ready_out} ), + .inp_sel_i ( result_mux_sel ), + .oup_data_o ( result_flit_out ), + .oup_valid_o ( result_flit_valid_out ), + .oup_ready_i ( result_flit_ready_in ) + ); + + // Output destination lzc + lzc #( + .WIDTH(NumOutputs) + ) i_lzc_result_out ( + .in_i ( metadata_route_out_dir ), + .cnt_o ( out_select ), + .empty_o ( ) + ); + + stream_demux #( + .N_OUP ( NumOutputs ) + ) i_result_demux ( + .inp_valid_i ( result_flit_valid_out ), + .inp_ready_o ( result_flit_ready_in ), + .oup_sel_i ( out_select ), + .oup_valid_o ( valid_o ), + .oup_ready_i ( ready_i ) + ); + assign data_o = {NumOutputs{result_flit_out}}; + + 
`ASSERT(ReductionFrom2MoreInputs, + !(|valid_i) || ($countones(in_mask_i[operand1_sel]) == 0) || + ($countones(in_mask_i[operand1_sel]) == 2), + clk_i, !rst_ni, + "Incoming sequential reduction from more than 2 inputs is not supported") +endmodule diff --git a/hw/floo_route_comp.sv b/hw/floo_route_comp.sv index c2584b22..65d0709e 100644 --- a/hw/floo_route_comp.sv +++ b/hw/floo_route_comp.sv @@ -36,6 +36,8 @@ module floo_route_comp output id_t mask_o ); + localparam bit EnCollective = floo_pkg::is_en_collective(RouteCfg.CollectiveCfg.OpCfg); + // Use an address decoder to map the address to a destination ID. // The `rule_t` struct has to have the fields `idx`, `start_addr` and `end_addr`. // `SourceRouting` is a special case, where the the `idx` is the actual (pre-computed) route. @@ -73,7 +75,7 @@ module floo_route_comp .en_default_idx_i ( 1'b0 ), .default_idx_i ( '0 ) ); - if (RouteCfg.EnMultiCast && RouteCfg.UseIdTable && + if (EnCollective && RouteCfg.UseIdTable && (RouteCfg.RouteAlgo == floo_pkg::XYRouting)) begin : gen_mcast_mask floo_mask_decode #( @@ -106,7 +108,7 @@ module floo_route_comp assign id_o.port_id = '0; assign id_o.x = addr_i[RouteCfg.XYAddrOffsetX +: $bits(id_o.x)]; assign id_o.y = addr_i[RouteCfg.XYAddrOffsetY +: $bits(id_o.y)]; - if(RouteCfg.EnMultiCast) begin : gen_mcast_mask + if(EnCollective) begin : gen_mcast_mask assign mask_o.x = mask_i[RouteCfg.XYAddrOffsetX +: $bits(id_o.x)]; assign mask_o.y = mask_i[RouteCfg.XYAddrOffsetY +: $bits(id_o.y)]; assign mask_o.port_id = '0; diff --git a/hw/floo_route_select.sv b/hw/floo_route_select.sv index 789dca66..c03378da 100644 --- a/hw/floo_route_select.sv +++ b/hw/floo_route_select.sv @@ -3,6 +3,7 @@ // SPDX-License-Identifier: SHL-0.51 // // Michael Rogenmoser +// Raphael Roth `include "common_cells/registers.svh" @@ -27,7 +28,9 @@ module floo_route_select /// Various types used in the routing algorithm parameter type flit_t = logic, parameter type addr_rule_t = logic, - parameter type id_t = 
logic[IdWidth-1:0] + parameter type id_t = logic[IdWidth-1:0], + /// Inversed SRC / DST if we want to support Multicast on the B response + parameter bit InversedSrcDst = 1'b0 ) ( input logic clk_i, input logic rst_ni, @@ -44,9 +47,15 @@ module floo_route_select output logic [RouteSelWidth-1:0] route_sel_id_o ); + // Selected route defined by th alg. logic [NumRoutes-1:0] route_sel; logic [RouteSelWidth-1:0] route_sel_id; + // We need to calc the multicast and the unicast route in parallel + // and mux them depending on the flit header! + logic [NumRoutes-1:0] route_sel_multicast; + logic [NumRoutes-1:0] route_sel_unicast; + if (RouteAlgo == IdTable) begin : gen_id_table // Routing based on an ID table passed into the router (TBD parameter or signal) // Assumes an ID field present in the flit_t @@ -102,43 +111,54 @@ module floo_route_select // One-hot encoding of the decoded route + // If we enable multicast then generate the output routes here seperatly if (EnMultiCast) begin : gen_mcast_route_sel floo_route_xymask #( - .NumRoutes ( NumRoutes ), - .flit_t ( flit_t ), - .id_t ( id_t ), - .FwdMode ( 1 ) + .NumRoutes ( NumRoutes ), + .flit_t ( flit_t ), + .id_t ( id_t ), + .FwdMode ( 1'b1 ) ) i_route_xymask ( .channel_i ( channel_i ), .xy_id_i ( xy_id_i ), - .route_sel_o ( route_sel ) + .route_sel_o ( route_sel_multicast ) ); - assign route_sel_id = '0; // Not defined in multicast - end else begin : gen_route_sel - id_t id_in; - assign id_in = id_t'(channel_i.hdr.dst_id); - always_comb begin - route_sel_id = East; - if (id_in.x == xy_id_i.x && id_in.y == xy_id_i.y) begin - route_sel_id = Eject + channel_i.hdr.dst_id.port_id; - end else if (id_in.x == xy_id_i.x) begin - if (id_in.y < xy_id_i.y) begin - route_sel_id = South; - end else begin - route_sel_id = North; - end + end else begin : gen_no_mcast + assign route_sel_multicast = '0; // No MCast supported + end + + // Calculate here the unicast output mask + id_t id_in; + assign id_in = id_t'(channel_i.hdr.dst_id); 
+ always_comb begin + route_sel_id = East; + if (id_in.x == xy_id_i.x && id_in.y == xy_id_i.y) begin + route_sel_id = Eject + channel_i.hdr.dst_id.port_id; + end else if (id_in.x == xy_id_i.x) begin + if (id_in.y < xy_id_i.y) begin + route_sel_id = South; end else begin - if (id_in.x < xy_id_i.x) begin - route_sel_id = West; - end else begin - route_sel_id = East; - end + route_sel_id = North; + end + end else begin + if (id_in.x < xy_id_i.x) begin + route_sel_id = West; + end else begin + route_sel_id = East; end - route_sel = '0; - route_sel[route_sel_id] = 1'b1; end + route_sel_unicast = '0; + route_sel_unicast[route_sel_id] = 1'b1; + end + + // Depending on the flit header choose the correct route + if(EnMultiCast) begin + assign route_sel = (channel_i.hdr.collective_op == Multicast) ? route_sel_multicast : route_sel_unicast; + end else begin + assign route_sel = route_sel_unicast; end + // Assign the data directly to the output assign channel_o = channel_i; end else begin : gen_err diff --git a/hw/floo_route_xymask.sv b/hw/floo_route_xymask.sv index ad0521bd..887c2000 100644 --- a/hw/floo_route_xymask.sv +++ b/hw/floo_route_xymask.sv @@ -2,63 +2,133 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 // -// Author: Chen Wu +// Author: +// - Chen Wu +// - Raphael Roth -module floo_route_xymask - import floo_pkg::*; -#( - /// Number of output ports - parameter int unsigned NumRoutes = 0, +// This module is the heartpiece for collective operation in the FlooNoC. When running a +// multicast / reduction it either determines the output direction of the filt +// (e.g. in which direction a copy of the filt has to be sent) or the expected +// input direction (e.g. which input provides a flit). 
+ +// Limitations: +// - It only supports xy routing +// - It only supports 5 in/out routes + +`include "common_cells/assertions.svh" + +module floo_route_xymask import floo_pkg::*; #( + /// Number of collective routes, either output or input + parameter int unsigned NumRoutes = 0, /// The type of mask to be computed /// 1: Determine output directions of the forward path in Multicast /// 0: Determine input directions of the backward path in Multicast i.e the reduction - parameter bit FwdMode = 1'b1, - /// Various types - parameter type flit_t = logic, - parameter type id_t = logic + parameter bit FwdMode = 1'b1, + /// type for data flit + parameter type flit_t = logic, + /// type for local id (router id) + parameter type id_t = logic ) ( // The input flit (only the header is used) input flit_t channel_i, // The current XY-coordinate of the router input id_t xy_id_i, - // The calculated mask for the multicast/reduction + // The calculated onehot maks for the multicast/reduction output logic [NumRoutes-1:0] route_sel_o ); - logic [NumRoutes-1:0] route_sel; + // General Concept: In XY-Routing all flits travel first in X - direction until they arrive at the columne of the destination + // and then travel in Y direction until they reach the destination. In the Multicast case we have to forward + // them until the "most far away" x/y position (dst_id_max) determint by the mask! 
+ + // We need to handle 4 different cases in this module: + // --------------------------------------------------- + // @ Multicast + // + // Request - src (Single) + // - dst (Multiple) + // --> generate destination mask + // + // Collect B - src (Multiple) + // - dst (Single) + // --> generate expected input mask + // --------------------------------------------------- + // @ Reduction + // + // Reduction - src (Multiple) + // - dst (Single) + // --> generate expected input mask + // + // distribute B resp - src (Single) + // - dst (Multiple) + // --> generate destination mask + // --------------------------------------------------- + + // Two cases overlap themself e.g. when we want to have an expected input mask + // we go from multiple sources to one destination (reduction). With the output mask it is + // the opposite with single source to multiple destinations (multicast). + + // To improve readability of the code we generate both mask in parallel and only + // mux them at the output. + +/* Variable declaration */ + // generated routes + logic [NumRoutes-1:0] route_output; + logic [NumRoutes-1:0] route_expected_input; + + // Var for easier signal assignments + id_t dst_id; + id_t src_id; + id_t mask; - id_t dst_id, mask_in, src_id; + // Var to hold the maxium distribution distance for both source and destination id_t dst_id_max, dst_id_min; - logic x_matched, y_matched; + id_t src_id_max, src_id_min; - // In the forward path, we use the normal `dst_id` to compute the mask. - // In the backward path, we use the `src_id` which was the original - // `dst_id` in from the forward path. - assign dst_id = (FwdMode)? channel_i.hdr.dst_id : channel_i.hdr.src_id; - assign src_id = (FwdMode)? channel_i.hdr.src_id : channel_i.hdr.dst_id; - // TODO(fischeti): Clarify with Chen why `ParallelReduction` are excluded - assign mask_in = (FwdMode && channel_i.hdr.commtype==ParallelReduction)? 
- '0 : channel_i.hdr.mask; + // Var indicates if the current router lies in the same x/y axis as the source / destination + logic x_matched_output; + logic y_matched_output; + logic x_matched_expected_input; + logic y_matched_expected_input; + + // Signal assigments + assign dst_id = channel_i.hdr.dst_id; + assign src_id = channel_i.hdr.src_id; + assign mask = channel_i.hdr.collective_mask; // We compute minimum and maximum destination IDs, to decide whether // we need to send left and/or right resp. up and/or down. - assign dst_id_max.x = dst_id.x | mask_in.x; - assign dst_id_max.y = dst_id.y | mask_in.y; - assign dst_id_min.x = dst_id.x & (~mask_in.x); - assign dst_id_min.y = dst_id.y & (~mask_in.y); - - // `x/y_matched` means whether the current coordinate is a - // receiver of the the multicast. - assign x_matched = &(mask_in.x | ~(xy_id_i.x ^ dst_id.x)); - assign y_matched = &(mask_in.y | ~(xy_id_i.y ^ dst_id.y)); - - always_comb begin - route_sel = '0; - if (FwdMode) begin : gen_out_mask - // If both x and y are matched, we eject the flit - if (x_matched && y_matched) begin - route_sel[Eject] = 1; + assign dst_id_max.x = dst_id.x | mask.x; + assign dst_id_max.y = dst_id.y | mask.y; + assign dst_id_min.x = dst_id.x & (~mask.x); + assign dst_id_min.y = dst_id.y & (~mask.y); + + // We compute minimum and maximum source IDs, to decide whether + // we need to send left and/or right resp. up and/or down. + assign src_id_max.x = src_id.x | mask.x; + assign src_id_max.y = src_id.y | mask.y; + assign src_id_min.x = src_id.x & (~mask.x); + assign src_id_min.y = src_id.y & (~mask.y); + + // `x/y_matched_output` means whether the current coordinate is a receiver of the the multicast. + assign x_matched_output = &(mask.x | ~(xy_id_i.x ^ dst_id.x)); + assign y_matched_output = &(mask.y | ~(xy_id_i.y ^ dst_id.y)); + + // `x/y_matched_expected_input` means the current coordinate provides one element to the reduction. 
+ assign x_matched_expected_input = &(mask.x | ~(xy_id_i.x ^ src_id.x)); + assign y_matched_expected_input = &(mask.y | ~(xy_id_i.y ^ src_id.y)); + + + // Generate the output mask + if(FwdMode) begin : gen_output_mask + always_comb begin + route_output = '0; + + // If both direction match then the local port is member of the distribution + if(x_matched_output && y_matched_output) begin + route_output[Eject] = 1'b1; end + // If the multicast was issued from an endpoint in the same row // i.e. the same Y-coordinate, we forward it to `East` if: // 1. The request is incoming from `West` or `Eject` and @@ -66,53 +136,82 @@ module floo_route_xymask // The same applies to the `West` direction. if (xy_id_i.y == src_id.y) begin if (xy_id_i.x >= src_id.x && xy_id_i.x < dst_id_max.x) begin - route_sel[East] = 1; + route_output[East] = 1; end if (xy_id_i.x <= src_id.x && xy_id_i.x > dst_id_min.x) begin - route_sel[West] = 1; + route_output[West] = 1; end end + // If there are multicast destinations in the current column, // We inject it to `North` if: // 1. The request is incoming from `South` or `Eject` and // 2. There are more multicast destinations in the `North` direction // The same applies to the `South` direction. 
- if (x_matched) begin + if (x_matched_output) begin if (xy_id_i.y >= src_id.y && xy_id_i.y < dst_id_max.y) begin - route_sel[North] = 1; + route_output[North] = 1; end if (xy_id_i.y <= src_id.y && xy_id_i.y > dst_id_min.y) begin - route_sel[South] = 1; + route_output[South] = 1; end end end + end - // TODO(fischeti): Clarify with Chen why `YXRouting` is used - // for the backward path - else begin : gen_in_mask - // If we previously ejected the flit, we expect one again - if (x_matched && y_matched) begin - route_sel[Eject] = 1; + // Generate the expected input mask + if(!FwdMode) begin : gen_expected_input_mask + always_comb begin + route_expected_input = '0; + + // If both direction match then the local port is a member of the distribution + if(x_matched_expected_input && y_matched_expected_input) begin + route_expected_input[Eject] = 1'b1; end - // This is the same as the forward path, but we use the - // `YXRouting` algorithm to compute the mask. - if (xy_id_i.x == src_id.x) begin - if (xy_id_i.y >= src_id.y && xy_id_i.y < dst_id_max.y) begin - route_sel[North] = 1; + + // In the case of an reduction we want to collect the source responses first in the x direction. + // e.g. the North / South can only be selected if we are in the correct dst columne. + // We expect a packet from the north if the current y id is higher/equal as the destination but still + // inside the expected maximum range of the source reduction. Same for the South! 
+ if(xy_id_i.x == dst_id.x) begin + if((xy_id_i.y >= dst_id.y) && (xy_id_i.y < src_id_max.y)) begin + route_expected_input[North] = 1'b1; end - if (xy_id_i.y <= src_id.y && xy_id_i.y > dst_id_min.y) begin - route_sel[South] = 1; + if((xy_id_i.y <= dst_id.y) && (xy_id_i.y > src_id_min.y)) begin + route_expected_input[South] = 1'b1; end end - if (y_matched) begin - if (xy_id_i.x >= src_id.x && xy_id_i.x < dst_id_max.x) begin - route_sel[East] = 1; + + // If we have multiple sources in the same row we first have to collect them in x direction + // therefor expecting inputs from either the east or west direction. + // For all members of a rows involved in the reduction the flag y_matched_expected_input is set! + // We expect a packet from the east if the current x id is higher/equal as the destination but still + // inside the expected maximum range of the source reduction. Same for the West! + if(y_matched_expected_input) begin + if((xy_id_i.x >= dst_id.x) && (xy_id_i.x < src_id_max.x)) begin + route_expected_input[East] = 1'b1; end - if (xy_id_i.x <= src_id.x && xy_id_i.x > dst_id_min.x) begin - route_sel[West] = 1; + if((xy_id_i.x <= dst_id.x) && (xy_id_i.x > src_id_min.x)) begin + route_expected_input[West] = 1'b1; end end end end - assign route_sel_o = route_sel; + + // Eiter assign the expected input or the output depending on the Mode + assign route_sel_o = (FwdMode) ? route_output : route_expected_input; + + // We only support five input/output routes + `ASSERT_INIT(NoMultiCastSupport, NumRoutes == 5) + + // TODO(colluca): fix code and uncomment + // always_comb begin + // // Check that module does nothing when unsupported + // `ASSERT_I( + // NoReductionInForward, + // (FwdMode ? 
is_multicast_op(channel_i.hdr.collective_op) || (route_sel_o == '0) : + // !is_multicast_op(channel_i.hdr.collective_op) || (route_sel_o == '0)), + // "Mask should be 0 on unsupported operation.") + // end + endmodule diff --git a/hw/floo_router.sv b/hw/floo_router.sv index e4e11dca..e393842a 100644 --- a/hw/floo_router.sv +++ b/hw/floo_router.sv @@ -4,6 +4,7 @@ // // Michael Rogenmoser // Lorenzo Leone +// Raphael Roth `include "common_cells/assertions.svh" `include "common_cells/registers.svh" @@ -13,29 +14,29 @@ module floo_router import floo_pkg::*; #( /// Number of ports - parameter int unsigned NumRoutes = 0, + parameter int unsigned NumRoutes = 0, /// More fine-grained control over number of input ports - parameter int unsigned NumInput = NumRoutes, + parameter int unsigned NumInput = NumRoutes, /// More fine-grained control over number of output ports - parameter int unsigned NumOutput = NumRoutes, + parameter int unsigned NumOutput = NumRoutes, /// Number of virtual channels - parameter int unsigned NumVirtChannels = 0, + parameter int unsigned NumVirtChannels = 0, /// Number of physical channels - parameter int unsigned NumPhysChannels = 1, + parameter int unsigned NumPhysChannels = 1, /// Depth of input FIFOs - parameter int unsigned InFifoDepth = 0, + parameter int unsigned InFifoDepth = 0, /// Depth of output FIFOs - parameter int unsigned OutFifoDepth = 0, + parameter int unsigned OutFifoDepth = 0, /// Routing algorithm - parameter route_algo_e RouteAlgo = IdTable, + parameter route_algo_e RouteAlgo = IdTable, /// Parameters, only used for ID-based and XY routing - parameter int unsigned IdWidth = 0, - parameter type id_t = logic[IdWidth-1:0], + parameter int unsigned IdWidth = 0, + parameter type id_t = logic[IdWidth-1:0], /// Used for ID-based routing - parameter int unsigned NumAddrRules = 1, + parameter int unsigned NumAddrRules = 1, /// Configuration parameters for special network topologies /// Disables Y->X connections in XYRouting - parameter 
bit XYRouteOpt = 1'b1, + parameter bit XYRouteOpt = 1'b1, /// Disables loopback connections parameter bit NoLoopback = 1'b1, /// Select VC implementation @@ -45,11 +46,20 @@ module floo_router /// Enable reduction feature parameter bit EnReduction = 1'b0, /// Various types - parameter type addr_rule_t = logic, - parameter type flit_t = logic, - parameter type payload_t = logic, - parameter payload_t NarrowRspMask = '0, - parameter payload_t WideRspMask = '0 + parameter type addr_rule_t = logic, + parameter type flit_t = logic, + parameter type hdr_t = logic, + /// Offload reduction parameter + /// Possible operation for offloading (must match type in header) + parameter type RdOperation_t = logic, + /// Data type of the offload reduction + parameter type RdData_t = logic, + /// Parameter for the reduction configuration + parameter collect_op_be_cfg_t CollectiveCfg = CollectiveSupportDefaultCfg, + parameter reduction_cfg_t RedCfg = '0, + /// AXI configurations + parameter axi_cfg_t AxiCfgOffload = '0, + parameter axi_cfg_t AxiCfgParallel = '0 ) ( input logic clk_i, input logic rst_ni, @@ -67,19 +77,54 @@ module floo_router output logic [NumOutput-1:0][NumVirtChannels-1:0] valid_o, input logic [NumOutput-1:0][NumVirtChannels-1:0] ready_i, output flit_t [NumOutput-1:0][NumPhysChannels-1:0] data_o, - input logic [NumOutput-1:0][NumVirtChannels-1:0] credit_i + input logic [NumOutput-1:0][NumVirtChannels-1:0] credit_i, + /// IF towards the offload logic + output RdOperation_t offload_req_op_o, + output RdData_t offload_req_operand1_o, + output RdData_t offload_req_operand2_o, + output logic offload_req_valid_o, + input logic offload_req_ready_i, + /// IF from external FPU + input RdData_t offload_resp_result_i, + input logic offload_resp_valid_i, + output logic offload_resp_ready_o ); // TODO MICHAERO: assert NumPhysChannels <= NumVirtChannels + + // Generate some local parameters to understand which type of collective support + // is required in the specific router 
instance + localparam bit EnSequentialReduction = en_sequential_support(CollectiveCfg); + localparam bit EnParallelReduction = en_parallel_support(CollectiveCfg); + localparam bit EnMultiCast = en_multicast_support(CollectiveCfg); + + // When a offloadable reduction is dedected then the data will be brunched off infront + // of the router crossbar. The reduction logic will reduce the incoming flits and deliver + // a single flit instead. When finished the result will be merged as an extra port into + // the output arbiter. + // Generate local Number of routes + localparam int unsigned localNumInputs = (EnSequentialReduction == 1'b1) ? (NumInput + 1) : (NumInput); + localparam int unsigned NumParallelRedRoutes = (EnParallelReduction) ? NumInput : 0 ; + + // Generate the vars to handle the input of the router flit_t [NumInput-1:0][NumVirtChannels-1:0] in_data, in_routed_data; logic [NumInput-1:0][NumVirtChannels-1:0] in_valid, in_ready; - logic [NumInput-1:0][NumVirtChannels-1:0][NumOutput-1:0] route_mask; // Credit generation for virtual channel support logic [NumInput-1:0][NumVirtChannels-1:0] credit_gnt_q, credit_gnt_d; + // Signals to connect input only virtual channel 0 to offload reduction logic + logic [NumInput-1:0] red_offload_valid_in, red_offload_ready_in; + flit_t [NumInput-1:0] red_offload_data_in; + logic [NumInput-1:0][NumOutput-1:0] red_offload_route_selected; + logic [NumInput-1:0][NumInput-1:0] red_offload_expected_in_route_loopback; + + // SIgnals top connect offload reduction logic to output virtual channel 0 + logic [NumOutput-1:0] red_offload_valid_out, red_offload_ready_out; + flit_t [NumOutput-1:0] red_offload_data_out; + // Router input part for (genvar in = 0; in < NumInput; in++) begin : gen_input for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt_input @@ -124,7 +169,6 @@ module floo_router .clk_i, .rst_ni, .test_enable_i, - .xy_id_i ( xy_id_i ), .id_route_map_i ( id_route_map_i ), .channel_i ( in_data [in][v] ), @@ -146,7 
+190,174 @@ module floo_router end end + // Var for the "normal" dataflow without any reduction + logic [NumInput-1:0][NumVirtChannels-1:0] cross_valid, cross_ready; + + // Vars to branch the reduction off the main path (No virtual channel support for reduction) + logic [NumInput-1:0][NumVirtChannels-1:0] red_valid_in, red_ready_in; + logic [NumInput-1:0][NumVirtChannels-1:0][NumOutput-1:0] red_route_selected; + flit_t [NumInput-1:0][NumVirtChannels-1:0] red_data_in; + + // Vars for the data comming from the reduction + logic [NumOutput-1:0][NumVirtChannels-1:0] red_valid_out, red_ready_out; + flit_t [NumOutput-1:0][NumVirtChannels-1:0] red_data_out; + + // Vars to separate reductions with only one member + logic [NumInput-1:0][NumVirtChannels-1:0][NumInput-1:0] red_expected_in_route, red_expected_in_route_loopback; + logic [NumInput-1:0][NumVirtChannels-1:0][$clog2(NumInput):0] red_how_many_participants; + logic [NumInput-1:0][NumVirtChannels-1:0] red_single_member, offload_reduction; + logic [NumInput-1:0][NumVirtChannels-1:0] red_ignore_loopback_port; + + // If we support offload reduction and a reduction is dedected then we split the signal and forward it to the reduction + if(EnSequentialReduction == 1'b1) begin : gen_offload_reduction_demux + for (genvar in = 0; in < NumInput; in++) begin : gen_input + for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt_input + // Generate the mask for all inputs to determint if we have a reduction with only one member. + // Any reduction with one member will be directly forwarded to its destination without reduction! 
+ floo_route_xymask #( + .NumRoutes (NumInput), + .flit_t (flit_t), + .id_t (id_t), + .FwdMode (0) + ) i_gen_route_xymask ( + .channel_i (in_routed_data[in][v]), + .xy_id_i (xy_id_i), + .route_sel_o (red_expected_in_route[in][v]) + ); + + // If the option RdSupportLoopback is disabled, then the local port + // must be ignored and removed from the list of participants because the last step of + // the reduction will be handled downsteram. + always_comb begin: gen_ignore_loopback + red_ignore_loopback_port[in][v] = ((route_mask[in][v][Eject] == 1'b1) && (!RedCfg.RdSupportLoopback)); + red_expected_in_route_loopback[in][v] = red_expected_in_route[in][v]; + red_expected_in_route_loopback[in][v][Eject] = red_expected_in_route[in][v][Eject] & (~red_ignore_loopback_port[in][v]); + end + + // onehot decoding of the input direction + // bypass the reduction if only on e input member is selected (if none is selected then bypass too [should never occure but to avoid deadlocks]) + popcount #( + .INPUT_WIDTH (NumInput) + ) i_red_list_counter ( + .data_i (red_expected_in_route_loopback[in][v]), + .popcount_o (red_how_many_participants[in][v]) + ); + assign red_single_member[in][v] = (red_how_many_participants[in][v] <= 1); + + // Generate the handshaking + // Outoput 0: unicast + // Output 1: reduction + //TODO(lleone): Now in the sequential reduction list tehre is also SeqAW because in case of offload, + // the selectAW operation is anyway handled by the offload unit regardless it's not really an offload reduction. 
+ assign offload_reduction[in][v] = (~red_single_member[in][v]) & + (is_sequential_reduction_op(in_routed_data[in][v].hdr.collective_op)); + stream_demux #( + .N_OUP (2) + ) i_stream_demux ( + .inp_valid_i (in_valid[in][v]), + .inp_ready_o (in_ready[in][v]), + .oup_sel_i (offload_reduction[in][v]), + .oup_valid_o ({red_valid_in[in][v], cross_valid[in][v]}), + .oup_ready_i ({red_ready_in[in][v], cross_ready[in][v]}) + ); + // Assign the data + assign red_data_in[in][v] = in_routed_data[in][v]; + assign red_route_selected[in][v] = route_mask[in][v]; + end + end + end else begin + assign cross_valid = in_valid; + assign in_ready = cross_ready; + assign red_valid_in = '0; + assign red_data_in = '0; + assign red_route_selected = '0; + assign red_expected_in_route_loopback = '0; + end + + // TODO(lleone): For the moment we don't support reduction with only one virtual channel. + // This requirement could be relaxed in the fouture if the wide req router is split between + // AR/W and R channels. 
+ // To have reduction support, VC0 must be used for the reduction traffic + + // Reduction logic + if(EnSequentialReduction == 1'b1) begin : gen_reduction_logic + for (genvar in = 0; in < NumInput; in++) begin: gen_vc_reduction + assign red_offload_valid_in[in] = red_valid_in[in][0]; + assign red_ready_in[in][0] = red_offload_ready_in[in]; + assign red_offload_data_in[in] = red_data_in[in][0]; + assign red_offload_route_selected[in] = red_route_selected[in][0]; + assign red_offload_expected_in_route_loopback[in] = red_expected_in_route_loopback[in][0]; + end + if (EnCollVirtChannel) begin + for (genvar in = 0; in < NumInput; in++) begin: gen_vc1_tied + assign red_ready_in[in][1] = '0; // Tied to zero the ready from offload unit to VC1 + end + end + + floo_reduction_unit #( + .NumInputs (NumInput), + .NumOutputs (NumOutput), + .flit_t (flit_t), + .hdr_t (hdr_t), + .id_t (id_t), + .reduction_data_t (RdData_t), + .RedCfg (RedCfg), + .AxiCfg (AxiCfgOffload) + ) i_reduction_unit ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .xy_id_i (xy_id_i), + .valid_i (red_offload_valid_in), + .ready_o (red_offload_ready_in), + .data_i (red_offload_data_in), + .routed_out_mask_i (red_offload_route_selected), + .in_mask_i (red_offload_expected_in_route_loopback), + .valid_o (red_offload_valid_out), + .ready_i (red_offload_ready_out), + .data_o (red_offload_data_out), + .operation_o (offload_req_op_o), + .operand1_o (offload_req_operand1_o), + .operand2_o (offload_req_operand2_o), + .operands_valid_o (offload_req_valid_o), + .operands_ready_i (offload_req_ready_i), + .result_i (offload_resp_result_i), + .result_valid_i (offload_resp_valid_i), + .result_ready_o (offload_resp_ready_o) + ); + + for (genvar out = 0; out < NumOutput; out++) begin : gen_output_virt_sel + // Data path + assign red_data_out[out][0] = red_offload_data_out[out]; + assign red_valid_out[out][0] = red_offload_valid_out[out]; + assign red_offload_ready_out[out] = red_ready_out[out][0]; + end + + // Tie down all unused 
signals + if(EnCollVirtChannel) begin + for (genvar out = 0; out < NumOutput; out++) begin + assign red_data_out[out][1] = '0; + assign red_valid_out[out][1] = '0; + end + end + end else begin + assign red_offload_valid_in = '0; + assign red_offload_ready_in = '0; + assign red_offload_data_in = '0; + assign red_offload_route_selected = '0; + assign red_offload_expected_in_route_loopback = '0; + assign red_offload_valid_out = '0; + assign red_offload_ready_out = '0; + assign red_offload_data_out = '0; + assign red_data_out = '0; + assign red_valid_out = '0; + assign offload_req_op_o = '0; + assign offload_req_operand1_o = '0; + assign offload_req_operand2_o = '0; + assign offload_req_valid_o = '0; + assign offload_resp_ready_o = '0; + end + // Normal crossbar between all in / out routes logic [NumOutput-1:0][NumVirtChannels-1:0][NumInput-1:0] masked_valid, masked_ready; logic [NumInput-1:0][NumVirtChannels-1:0][NumOutput-1:0] masked_valid_transposed; logic [NumInput-1:0][NumVirtChannels-1:0][NumOutput-1:0] masked_ready_transposed; @@ -170,14 +381,14 @@ module floo_router assign masked_data[out][v][in] = '0; end else begin : gen_conn assign masked_ready_transposed[in][v][out] = masked_ready[out][v][in]; - assign masked_valid[out][v][in] = in_valid[in][v] & route_mask[in][v][out] & + assign masked_valid[out][v][in] = cross_valid[in][v] & route_mask[in][v][out] & (!EnMultiCast || ~past_handshakes_q[in][v][out]); assign masked_data[out][v][in] = in_routed_data[in][v]; end assign masked_valid_transposed[in][v][out] = masked_valid[out][v][in]; end if (!EnMultiCast) begin : gen_unicast - assign in_ready[in][v] = |(masked_ready_transposed[in][v] & route_mask[in][v]); + assign cross_ready[in][v] = |(masked_ready_transposed[in][v] & route_mask[in][v]); end else begin : gen_multicast // In the case of multicast transactions, each destination can assert the ready signal // independently and potentially at different clock cycles. 
This logic ensures that @@ -188,7 +399,7 @@ module floo_router assign current_handshakes[in][v] = masked_valid_transposed[in][v] & masked_ready_transposed[in][v]; // Handhsake received in previous cycles - assign past_handshakes_d[in][v] = (in_ready[in][v] & in_valid[in][v]) ? '0 : + assign past_handshakes_d[in][v] = (cross_ready[in][v] & cross_valid[in][v]) ? '0 : (past_handshakes_q[in][v] | current_handshakes[in][v]); // History of handshake received (past + present) assign all_handshakes[in][v] = past_handshakes_q[in][v] | current_handshakes[in][v]; @@ -198,13 +409,34 @@ module floo_router assign expected_handshakes[in][v] = route_mask[in][v] & ~ignore_routes[in][v]; // Send ready upstream only when all expected downstream handhsalkes have been received - assign in_ready[in][v] = &(all_handshakes[in][v] | ~expected_handshakes[in][v]); + assign cross_ready[in][v] = &(all_handshakes[in][v] | ~expected_handshakes[in][v]); end end end + // TODO (lleone): Move the folloiwng FF inside the multicast `FF(past_handshakes_q, past_handshakes_d, '0) + // We merge the data from the reduction module as an additional input of our output arbiter. 
+ logic [NumOutput-1:0][NumVirtChannels-1:0][localNumInputs-1:0] merged_valid, merged_ready; + flit_t [NumOutput-1:0][NumVirtChannels-1:0][localNumInputs-1:0] merged_data; + + if(EnSequentialReduction == 1'b1) begin : gen_assign_data_output + for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_con_virt + for (genvar out = 0; out < NumOutput; out++) begin : gen_con_output + assign merged_data[out][v] = {red_data_out[out][v], masked_data[out][v]}; + assign merged_valid[out][v] = {red_valid_out[out][v], masked_valid[out][v]}; + assign masked_ready[out][v] = merged_ready[out][v][localNumInputs-2:0]; + assign red_ready_out[out][v] = merged_ready[out][v][localNumInputs-1]; + end + end + end else begin + assign merged_data = masked_data; + assign merged_valid = masked_valid; + assign masked_ready = merged_ready; + end + + // Vars to handle the output of the arbiter and the optinal fifos flit_t [NumOutput-1:0][NumVirtChannels-1:0] out_data, out_buffered_data; logic [NumOutput-1:0][NumVirtChannels-1:0] out_valid, out_ready; logic [NumOutput-1:0][NumVirtChannels-1:0] out_buffered_valid, out_buffered_ready; @@ -213,46 +445,30 @@ module floo_router // arbitrate input fifos per virtual channel for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt_output - if(!EnReduction) begin : gen_wh_arb - floo_wormhole_arbiter #( - .NumRoutes ( NumInput ), - .flit_t ( flit_t ) - ) i_wormhole_arbiter ( - .clk_i, - .rst_ni, - - .valid_i ( masked_valid[out][v] ), - .ready_o ( masked_ready[out][v] ), - .data_i ( masked_data [out][v] ), - - .valid_o ( out_valid[out][v] ), - .ready_i ( out_ready[out][v] ), - .data_o ( out_data [out][v] ) - ); - end else begin : gen_red_arb - // Arbiter to be instantiated for reduction operations. - // Repsonses from a multicast request are also treated as reductions. 
- floo_output_arbiter #( - .NumRoutes ( NumInput ), - .flit_t ( flit_t ), - .payload_t ( payload_t ), - .NarrowRspMask ( NarrowRspMask ), - .WideRspMask ( WideRspMask ), - .id_t ( id_t ) - ) i_output_arbiter ( - .clk_i, - .rst_ni, - - .valid_i ( masked_valid[out][v] ), - .ready_o ( masked_ready[out][v] ), - .data_i ( masked_data [out][v] ), - .xy_id_i ( xy_id_i ), - - .valid_o ( out_valid[out][v] ), - .ready_i ( out_ready[out][v] ), - .data_o ( out_data [out][v] ) - ); - end + // Output arbiter + floo_output_arbiter #( + .NumRoutes ( localNumInputs ), + .NumParallelRedRoutes ( NumParallelRedRoutes ), + .CollectOpCfg ( CollectiveCfg ), + .flit_t ( flit_t ), + .hdr_t ( hdr_t ), + .id_t ( id_t ), + .RdSupportLoopback ( RedCfg.RdSupportLoopback ), + .RdSupportAxi ( RedCfg.RdSupportAxi ), + .AxiCfg ( AxiCfgParallel ) + ) i_output_arbiter ( + .clk_i, + .rst_ni, + + .valid_i ( merged_valid[out][v] ), + .ready_o ( merged_ready[out][v] ), + .data_i ( merged_data [out][v] ), + .xy_id_i ( xy_id_i ), + + .valid_o ( out_valid[out][v] ), + .ready_i ( out_ready[out][v] ), + .data_o ( out_data [out][v] ) + ); if (OutFifoDepth > 0) begin : gen_out_fifo (* ungroup *) @@ -334,12 +550,28 @@ module floo_router for (genvar in = 0; in < NumInput; in++) begin : gen_input for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt `ASSERT(NoLoopback, !(in_valid[in][v] && route_mask[in][v][in] && - (in_data[in][v].hdr.commtype == Unicast))) + (in_data[in][v].hdr.collective_op == Unicast))) end end end + // If you have offload reduction and more than one virtual channel, + // the reduction traffic must arrive from Virtual Channel 0 + if (EnSequentialReduction && (NumVirtChannels > 1) && EnCollVirtChannel) begin: gen_vc_red + for (genvar in = 0; in < NumInput; in++) begin + `ASSERT(CollOpReceivedOnWrongVirtChannel, !red_valid_in[in][1]) + end + end + // Multicast is currently only supported for `XYRouting` `ASSERT_INIT(NoMultiCastSupport, !(EnMultiCast && RouteAlgo != XYRouting)) + // We 
only support symmetrical configuration for the FP reduction + `ASSERT_INIT(NoSymConfig, !(EnSequentialReduction && (NumInput != NumOutput))) + // Currently the AXI support must be enabled + `ASSERT_INIT(SupportAXI, !EnSequentialReduction || RedCfg.RdSupportAxi) + // We can not support Loopback when you have reduction and the NoLoopback option is disabled + `ASSERT_INIT(SupportLoopback, !(EnSequentialReduction && RedCfg.RdSupportLoopback && NoLoopback)) + // We cannot support sequential reduction with multiple VC if EnCollVirtChannel is not set + `ASSERT_INIT(NoRedVcSupport, !(EnSequentialReduction && (NumVirtChannels > 1) && !EnCollVirtChannel)) endmodule diff --git a/hw/include/floo_noc/typedef.svh b/hw/include/floo_noc/typedef.svh index 2cea3a31..34bccbcd 100644 --- a/hw/include/floo_noc/typedef.svh +++ b/hw/include/floo_noc/typedef.svh @@ -47,18 +47,17 @@ // // For `SourceRouting`: // `FLOO_TYPEDEF_HDR_T(hdr_t, route_t, id_t, floo_pkg::axi_ch_e, logic) -`define FLOO_TYPEDEF_HDR_T(hdr_t, dst_t, src_t, ch_t, rob_idx_t, mask_t = logic, collect_comm_t = logic, reduction_t = logic) \ +`define FLOO_TYPEDEF_HDR_T(hdr_t, dst_t, src_t, ch_t, rob_idx_t, mask_t = logic, collect_op_t = logic) \ typedef struct packed { \ logic rob_req; \ rob_idx_t rob_idx; \ dst_t dst_id; \ - mask_t mask; \ + mask_t collective_mask; \ src_t src_id; \ logic last; \ logic atop; \ ch_t axi_ch; \ - collect_comm_t commtype; \ - reduction_t reduction_op; \ + collect_op_t collective_op; \ } hdr_t; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -288,7 +287,7 @@ floo_``chan_name``_chan_t ``chan_name``; \ } floo_``name``_t; - //////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with a ready-valid handshaking interface // It support virtual channeling by extending 
the handshakes // @@ -360,7 +359,7 @@ `FLOO_TYPEDEF_LINK_T(rsp, rsp_chan) \ `FLOO_TYPEDEF_LINK_T(wide, wide_chan) - //////////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////////// // Defines the all the link types with ready-valid handshaking interface // for a narrow-wide AXI interface configuration which implements a simple // virtual channeling. diff --git a/hw/synth/floo_synth_nw_chimney.sv b/hw/synth/floo_synth_nw_chimney.sv index da3385c0..c450ce17 100644 --- a/hw/synth/floo_synth_nw_chimney.sv +++ b/hw/synth/floo_synth_nw_chimney.sv @@ -8,18 +8,22 @@ module floo_synth_nw_chimney import floo_pkg::*; import floo_synth_params_pkg::*; import floo_synth_nw_pkg::*; -( + import floo_synth_collective_pkg::*; + import endpoint_axi_pkg::*; + #( + parameter bit EnCollective = 1'b0 + ) ( input logic clk_i, input logic rst_ni, input logic test_enable_i, - input axi_narrow_in_req_t axi_narrow_in_req_i, - output axi_narrow_in_rsp_t axi_narrow_in_rsp_o, - output axi_narrow_out_req_t axi_narrow_out_req_o, - input axi_narrow_out_rsp_t axi_narrow_out_rsp_i, - input axi_wide_in_req_t axi_wide_in_req_i, - output axi_wide_in_rsp_t axi_wide_in_rsp_o, - output axi_wide_out_req_t axi_wide_out_req_o, - input axi_wide_out_rsp_t axi_wide_out_rsp_i, + input endpoint_axi_pkg::narrow_out_req_t axi_narrow_in_req_i, + output endpoint_axi_pkg::narrow_out_resp_t axi_narrow_in_rsp_o, + output endpoint_axi_pkg::narrow_in_req_t axi_narrow_out_req_o, + input endpoint_axi_pkg::narrow_in_resp_t axi_narrow_out_rsp_i, + input endpoint_axi_pkg::wide_out_req_t axi_wide_in_req_i, + output endpoint_axi_pkg::wide_out_resp_t axi_wide_in_rsp_o, + output endpoint_axi_pkg::wide_in_req_t axi_wide_out_req_o, + input endpoint_axi_pkg::wide_in_resp_t axi_wide_out_rsp_i, input id_t id_i, input route_t [RouteCfg.NumRoutes-1:0] route_table_i, output floo_req_t floo_req_o, @@ 
-27,32 +31,43 @@ module floo_synth_nw_chimney input floo_req_t floo_req_i, input floo_rsp_t floo_rsp_i, output floo_wide_t floo_wide_o, - input floo_wide_t floo_wide_i + input floo_wide_double_t floo_wide_i ); +localparam floo_pkg::route_cfg_t RouteCfgColl = (EnCollective) ? CollectRouteCfg : RouteCfg; + floo_nw_chimney #( .AxiCfgN ( AxiCfgN ), .AxiCfgW ( AxiCfgW ), .ChimneyCfgN ( ChimneyCfg ), .ChimneyCfgW ( ChimneyCfg ), - .RouteCfg ( RouteCfg ), + .RouteCfg ( RouteCfgColl ), //TODO (lleone): change to enable multicast/collective .AtopSupport ( AtopSupport ), + .EnDecoupledRW ( 1'b1 ), .MaxAtomicTxns ( MaxAtomicTxns ), + // SAM? .id_t ( id_t ), .route_t ( route_t ), .dst_t ( route_t ), .hdr_t ( hdr_t ), - .axi_narrow_in_req_t ( axi_narrow_in_req_t ), - .axi_narrow_in_rsp_t ( axi_narrow_in_rsp_t ), - .axi_narrow_out_req_t ( axi_narrow_out_req_t ), - .axi_narrow_out_rsp_t ( axi_narrow_out_rsp_t ), - .axi_wide_in_req_t ( axi_wide_in_req_t ), - .axi_wide_in_rsp_t ( axi_wide_in_rsp_t ), - .axi_wide_out_req_t ( axi_wide_out_req_t ), - .axi_wide_out_rsp_t ( axi_wide_out_rsp_t ), + .sam_rule_t ( sam_multicast_rule_t ), //TODO (lleone) handled unicast case + .sam_idx_t ( sam_idx_t ), //TODO (lleone) handled unicast case + .mask_sel_t ( mask_sel_t ), //TODO (lleone) handled unicast case + .axi_narrow_in_req_t ( endpoint_axi_pkg::narrow_out_req_t), + .axi_narrow_in_rsp_t ( endpoint_axi_pkg::narrow_out_resp_t), + .axi_narrow_out_req_t ( endpoint_axi_pkg::narrow_in_req_t), + .axi_narrow_out_rsp_t ( endpoint_axi_pkg::narrow_in_resp_t), + .axi_wide_in_req_t ( endpoint_axi_pkg::wide_out_req_t), + .axi_wide_in_rsp_t ( endpoint_axi_pkg::wide_out_resp_t), + .axi_wide_out_req_t ( endpoint_axi_pkg::wide_in_req_t), + .axi_wide_out_rsp_t ( endpoint_axi_pkg::wide_in_resp_t), + .floo_req_t ( floo_req_t ), .floo_rsp_t ( floo_rsp_t ), - .floo_wide_t ( floo_wide_t ) + .floo_wide_t ( floo_wide_t ), + .floo_wide_in_t ( floo_wide_double_t ), + .user_narrow_struct_t ( collective_narrow_user_t), 
+ .user_wide_struct_t ( collective_wide_user_t) ) i_floo_nw_chimney ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), diff --git a/hw/synth/floo_synth_nw_router.sv b/hw/synth/floo_synth_nw_router.sv index 0d11e144..4d0b661e 100644 --- a/hw/synth/floo_synth_nw_router.sv +++ b/hw/synth/floo_synth_nw_router.sv @@ -8,8 +8,12 @@ module floo_synth_nw_router import floo_pkg::*; import floo_synth_params_pkg::*; import floo_synth_nw_pkg::*; + import floo_synth_collective_pkg::*; #( - parameter int unsigned NumPorts = int'(floo_pkg::NumDirections) + parameter int unsigned NumPorts = int'(floo_pkg::NumDirections), + parameter int unsigned EnCollective = 0, + parameter int unsigned EnNarrOffload = 0, + parameter int unsigned EnWideOffload = 0 ) ( input logic clk_i, input logic rst_ni, @@ -23,9 +27,82 @@ module floo_synth_nw_router output floo_req_t [NumPorts-1:0] floo_req_o, output floo_rsp_t [NumPorts-1:0] floo_rsp_o, input floo_wide_t [NumPorts-1:0] floo_wide_i, - output floo_wide_t [NumPorts-1:0] floo_wide_o + output floo_wide_double_t [NumPorts-1:0] floo_wide_o, + /// Wide IF towards the offload logic + output floo_pkg::collect_op_e offload_wide_req_op_o, + output RdDataWide_t offload_wide_req_operand1_o, + output RdDataWide_t offload_wide_req_operand2_o, + output logic offload_wide_req_valid_o, + input logic offload_wide_req_ready_i, + /// Wide IF from external FPU + input RdDataWide_t offload_wide_resp_result_i, + input logic offload_wide_resp_valid_i, + output logic offload_wide_resp_ready_o, + /// Narrow IF towards the offload logic + output floo_pkg::collect_op_e offload_narrow_req_op_o, + output RdDataNarrow_t offload_narrow_req_operand1_o, + output RdDataNarrow_t offload_narrow_req_operand2_o, + output logic offload_narrow_req_valid_o, + input logic offload_narrow_req_ready_i, + /// Narrow IF from external FPU + input RdDataNarrow_t offload_narrow_resp_result_i, + input logic offload_narrow_resp_valid_i, + output logic offload_narrow_resp_ready_o ); + +localparam 
floo_pkg::collect_op_fe_cfg_t OpCfg = CollectOpCfgList[EnCollective]; +localparam reduction_cfg_t NarrRedCfg = NarrRedCfgList[EnNarrOffload]; +localparam reduction_cfg_t WideRedCfg = WideRedCfgList[EnWideOffload]; + +if (!EnCollective) begin + floo_nw_router #( + .AxiCfgN ( AxiCfgN ), + .AxiCfgW ( AxiCfgW ), + .RouteAlgo ( RouteCfg.RouteAlgo ), + .NumRoutes ( NumPorts ), + .NumAddrRules ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .XYRouteOpt ( 1'b0 ), + .EnDecoupledRW ( 1'b1 ), + .id_t ( id_t ), + .hdr_t ( hdr_t ), + .floo_req_t ( floo_req_t ), + .floo_rsp_t ( floo_rsp_t ), + .floo_wide_t ( floo_wide_t ), + .floo_wide_out_t (floo_wide_double_t) + ) i_floo_nw_router ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_enable_i ( test_enable_i ), + .id_i ( id_i ), + .id_route_map_i ( id_route_map_i ), + .floo_req_i ( floo_req_i ), + .floo_rsp_i ( floo_rsp_i ), + .floo_req_o ( floo_req_o ), + .floo_rsp_o ( floo_rsp_o ), + .floo_wide_i ( floo_wide_i ), + .floo_wide_o ( floo_wide_o ), + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () + ); +end else begin floo_nw_router #( .AxiCfgN ( AxiCfgN ), .AxiCfgW ( AxiCfgW ), @@ -35,11 +112,22 @@ module floo_synth_nw_router .InFifoDepth ( InFifoDepth ), .OutFifoDepth ( OutFifoDepth ), .XYRouteOpt ( 1'b0 ), + .NoLoopback (1'b0), + .EnDecoupledRW (1'b1), .id_t ( id_t ), - .hdr_t ( hdr_t ), + .hdr_t ( hdr_coll_t ), .floo_req_t ( floo_req_t ), .floo_rsp_t ( floo_rsp_t ), - 
.floo_wide_t ( floo_wide_t ) + .floo_wide_t ( floo_wide_t ), + .floo_wide_out_t (floo_wide_double_t), + .RdWideOperation_t (floo_pkg::collect_op_e), + .RdNarrowOperation_t (floo_pkg::collect_op_e), + .RdWideData_t (RdDataWide_t), + .RdNarrowData_t (RdDataNarrow_t), + .CollectiveOpCfg (OpCfg), // To be modified for different synth cfg results + .RdWideCfg (WideRedCfg), // To be modified for different synth cfg results + .RdNarrowCfg (NarrRedCfg), // To be modified for different synth cfg results + .RdRespCfg (ResponseReductionCfg) ) i_floo_nw_router ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), @@ -51,7 +139,26 @@ module floo_synth_nw_router .floo_req_o ( floo_req_o ), .floo_rsp_o ( floo_rsp_o ), .floo_wide_i ( floo_wide_i ), - .floo_wide_o ( floo_wide_o ) + .floo_wide_o ( floo_wide_o ), + .offload_wide_req_op_o (offload_wide_req_op_o), + .offload_wide_req_operand1_o (offload_wide_req_operand1_o), + .offload_wide_req_operand2_o (offload_wide_req_operand2_o), + .offload_wide_req_valid_o (offload_wide_req_valid_o), + .offload_wide_req_ready_i (offload_wide_req_ready_i), + .offload_wide_resp_result_i (offload_wide_resp_result_i), + .offload_wide_resp_valid_i (offload_wide_resp_valid_i), + .offload_wide_resp_ready_o (offload_wide_resp_ready_o), + // Narrow Reduction offload port + .offload_narrow_req_op_o (offload_narrow_req_op_o), + .offload_narrow_req_operand1_o (offload_narrow_req_operand1_o), + .offload_narrow_req_operand2_o (offload_narrow_req_operand2_o), + .offload_narrow_req_valid_o (offload_narrow_req_valid_o), + .offload_narrow_req_ready_i (offload_narrow_req_ready_i), + .offload_narrow_resp_result_i (offload_narrow_resp_result_i), + .offload_narrow_resp_valid_i (offload_narrow_resp_valid_i), + .offload_narrow_resp_ready_o (offload_narrow_resp_ready_o) ); +end + endmodule diff --git a/hw/synth/floo_synth_params_pkg.sv b/hw/synth/floo_synth_params_pkg.sv index 59bb6c82..66ad8789 100644 --- a/hw/synth/floo_synth_params_pkg.sv +++ b/hw/synth/floo_synth_params_pkg.sv @@ 
-136,3 +136,308 @@ package floo_synth_nw_vc_pkg; `FLOO_TYPEDEF_NW_LINK_ALL(vc_req, vc_rsp, vc_wide, vc_req, vc_rsp, vc_wide) endpackage + +// Package to define the AXI interface at the NI for chimney synthesis +// It's taken for the standard snitch pkg interface used in Picobello +// This package is necessary to correctly configure the chimney in a +// realistic manner for a possible endpoint. +package endpoint_axi_pkg; + localparam int unsigned AtomicIdWidth = 5; + localparam int unsigned AddrWidth = 48; + localparam int unsigned NarrowDataWidth = 64; + localparam int unsigned WideDataWidth = 512; + localparam int unsigned NarrowIdWidthIn = 2; + localparam int unsigned NrNarrowMasters = 3; + localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; + localparam int unsigned NrWideMasters = 1 + 1 + 1; + localparam int unsigned WideIdWidthIn = 1; + localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [NarrowDataWidth-1:0] data_t; + typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [WideDataWidth-1:0] data_dma_t; + typedef logic [WideDataWidth/8-1:0] strb_dma_t; + typedef logic [NarrowIdWidthIn-1:0] narrow_in_id_t; + typedef logic [NarrowIdWidthOut-1:0] narrow_out_id_t; + typedef logic [WideIdWidthIn-1:0] wide_in_id_t; + typedef logic [WideIdWidthOut-1:0] wide_out_id_t; + + localparam int unsigned CollectiveWidth = 4; + + typedef struct packed { + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; + logic [AtomicIdWidth-1:0] atomic_id; + } user_narrow_t; + +// Will be extended when implementing collective operation on the wide dma link + typedef struct packed { + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; + } user_dma_t; + + localparam int unsigned NarrowUserWidth = $bits(user_narrow_t); + localparam int unsigned WideUserWidth = $bits(user_dma_t); + + // AXI interface + `AXI_TYPEDEF_ALL(narrow_in, 
addr_t, narrow_in_id_t, data_t, strb_t, user_narrow_t) + `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_narrow_t) + `AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(wide_out, addr_t, wide_out_id_t, data_dma_t, strb_dma_t, user_dma_t) +endpackage + + +// Package to define all the types and information to analyyze collective support +package floo_synth_collective_pkg; + import floo_pkg::*; + import floo_synth_params_pkg::*; + import floo_synth_nw_pkg::*; + import endpoint_axi_pkg::*; + + typedef logic [0:0] rob_idx_t; + + // TODO (lleone): Script the following configurations with Python + // Offload unit configuration + localparam reduction_cfg_t WideGenReductionCfg = '{ + RdControllConf: ControllerGeneric, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 5, + RdPartialBufferSize: 6, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + localparam reduction_cfg_t WideStallingReductionCfg = '{ + RdControllConf: ControllerStalling, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 5, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + localparam reduction_cfg_t WideSimpleReductionCfg = '{ + RdControllConf: ControllerSimple, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 5, + RdPartialBufferSize: 1, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + +localparam reduction_cfg_t NarrowGenReductionCfg = '{ + RdControllConf: ControllerGeneric, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 1, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + localparam reduction_cfg_t NarrowStallingReductionCfg = '{ + 
RdControllConf: ControllerStalling, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 1, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + localparam reduction_cfg_t NarrowSimpleReductionCfg = '{ + RdControllConf: ControllerSimple, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 1, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 + }; + + localparam reduction_cfg_t ResponseReductionCfg = '{ + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + default: '0 + }; + + // Route config with collective support enabled + // This configuration is the one to be changed in order to enable or disable + // different collective operation support + // TODO (lleone): SCript this with Python + + localparam floo_pkg::collect_op_fe_cfg_t CollectiveOpCfg = '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, + EnA_Add: 1'b1, + EnA_Mul: 1'b1, + EnA_Min_S: 1'b1, + EnA_Min_U: 1'b1, + EnA_Max_S: 1'b1, + EnA_Max_U: 1'b1 + }; + + localparam floo_pkg::collect_op_fe_cfg_t MulticastOpCfg = '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b0, + EnF_Add: 1'b0, + EnF_Mul: 1'b0, + EnF_Min: 1'b0, + EnF_Max: 1'b0, + default: '0 + }; + + localparam floo_pkg::collect_op_fe_cfg_t ParallelOpCfg = '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b0, + EnF_Mul: 1'b0, + EnF_Min: 1'b0, + EnF_Max: 1'b0, + default: '0 + }; + + localparam floo_pkg::collect_op_fe_cfg_t NarrSequentialOpCfg = '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b0, + EnF_Mul: 1'b0, + EnF_Min: 1'b0, + EnF_Max: 1'b0, + EnA_Add: 1'b1, + EnA_Mul: 1'b1, + EnA_Min_S: 1'b1, + EnA_Min_U: 1'b1, + EnA_Max_S: 1'b1, + EnA_Max_U: 1'b1 + }; + + localparam 
floo_pkg::collect_op_fe_cfg_t WideSequentialOpCfg = '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, + EnA_Add: 1'b1, + EnA_Mul: 1'b1, + EnA_Min_S: 1'b1, + EnA_Min_U: 1'b1, + EnA_Max_S: 1'b1, + EnA_Max_U: 1'b1 + }; + + localparam floo_pkg::collect_op_fe_cfg_t CollectOpCfgList [0:5] = '{ + 0: '0, + 1: CollectiveOpCfg, + 2: MulticastOpCfg, + 3: ParallelOpCfg, + 4: NarrSequentialOpCfg, + 5: WideSequentialOpCfg + }; + + localparam reduction_cfg_t NarrRedCfgList [0:3] = '{ + 0: '0, + 1: NarrowSimpleReductionCfg, + 2: NarrowStallingReductionCfg, + 3: NarrowGenReductionCfg + }; + + localparam reduction_cfg_t WideRedCfgList [0:3] = '{ + 0: '0, + 1: WideSimpleReductionCfg, + 2: WideStallingReductionCfg, + 3: WideGenReductionCfg + }; + + typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; + typedef logic[AxiCfgN.DataWidth-1:0] RdDataNarrow_t; + + // TODO(lleone): Each field must become [1:0] when testing VC + typedef struct packed { + logic [1:0] valid; + logic [1:0] ready; + floo_wide_chan_t [1:0] wide; + } floo_wide_double_t; + + `FLOO_TYPEDEF_HDR_T(hdr_coll_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) + // `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) + + // Typedef for the chimney + typedef bit [ 5:0] aw_bt; + + typedef struct packed { + logic [5:0] offset; + logic [2:0] len; + logic [2:0] grp_base_id; + } mask_sel_t; + + typedef struct packed { + id_t id; + mask_sel_t mask_x; + mask_sel_t mask_y; + } sam_idx_t; + + typedef struct packed { + sam_idx_t idx; + logic [aw_bt'(AxiCfgN.AddrWidth)-1:0] start_addr; + logic [aw_bt'(AxiCfgN.AddrWidth)-1:0] end_addr; + } sam_multicast_rule_t; + + typedef logic [aw_bt'(AxiCfgN.AddrWidth)-1:0] user_mask_t; + + typedef struct packed { + user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + logic [AtomicIdWidth-1:0] atomic; + } collective_narrow_user_t; + + typedef struct packed { + 
user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + } collective_wide_user_t; + + localparam floo_pkg::route_cfg_t CollectRouteCfg = '{ + RouteAlgo: floo_pkg::XYRouting, + UseIdTable: 1, + XYAddrOffsetX: 16, + XYAddrOffsetY: 20, + CollectiveCfg: '{ + OpCfg: CollectiveOpCfg, + RedCfg: WideSimpleReductionCfg + }, + default: '0 // Potentially enable Multicast features + }; + +endpackage diff --git a/hw/synth/snitch_cluster_pkg.sv b/hw/synth/snitch_cluster_pkg.sv new file mode 100644 index 00000000..e6557b29 --- /dev/null +++ b/hw/synth/snitch_cluster_pkg.sv @@ -0,0 +1,734 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED by clustergen.py; edit the script or configuration +// instead. + + + + + + + +`include "axi/typedef.svh" +`include "tcdm_interface/typedef.svh" +`include "dca_interface/typedef.svh" + +// verilog_lint: waive-start package-filename +package snitch_cluster_pkg; + + localparam int unsigned NrCores = 9; + localparam int unsigned NrHives = 1; + + localparam int unsigned TcdmSize = 128; + localparam int unsigned TcdmSizeNapotRounded = 1 << $clog2(TcdmSize); + localparam int unsigned BootromSize = 4; // Fixed size of 4kB + localparam int unsigned ClusterPeriphSize = 60; + localparam int unsigned ZeroMemorySize = 60; + localparam int unsigned ExtMemorySize = 4; + + localparam int unsigned AddrWidth = 48; + localparam int unsigned NarrowDataWidth = 64; + localparam int unsigned WideDataWidth = 512; + + localparam int unsigned NarrowIdWidthIn = 2; + localparam int unsigned NrNarrowMasters = 3; + localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; + + localparam int unsigned NrWideMasters = 1 + 1 + 1; + localparam int unsigned WideIdWidthIn = 1; + localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; + + localparam int unsigned 
EnableWideCollectives = 1; + localparam int unsigned EnableNarrowCollectives = 1; + + localparam int unsigned AtomicIdWidth = 5; + localparam int unsigned CollectiveWidth = 4; + + localparam int unsigned ICacheLineWidth [NrHives] = '{ + 256 +}; + localparam int unsigned ICacheLineCount [NrHives] = '{ + 128 +}; + localparam int unsigned ICacheWays [NrHives] = '{ + 2 +}; + + localparam int unsigned Hive [NrCores] = '{0, 0, 0, 0, 0, 0, 0, 0, 0}; + + localparam int unsigned TcdmAddrWidth = $clog2(TcdmSize*1024); + + typedef struct packed { + logic [0:0] reserved; + } sram_cfg_t; + + typedef struct packed { + sram_cfg_t icache_tag; + sram_cfg_t icache_data; + sram_cfg_t tcdm; + } sram_cfgs_t; + + // Define dca_req_t and dca_rsp_t + `DCA_TYPEDEF_ALL(dca, WideDataWidth) + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [NarrowDataWidth-1:0] data_t; + typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [WideDataWidth-1:0] data_dma_t; + typedef logic [WideDataWidth/8-1:0] strb_dma_t; + typedef logic [NarrowIdWidthIn-1:0] narrow_in_id_t; + typedef logic [NarrowIdWidthOut-1:0] narrow_out_id_t; + typedef logic [WideIdWidthIn-1:0] wide_in_id_t; + typedef logic [WideIdWidthOut-1:0] wide_out_id_t; + +// Generate the typedef's for the userfield's with the required subfields depending +// on the configuration + typedef struct packed { + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; + logic [AtomicIdWidth-1:0] atomic_id; + } user_narrow_t; + +// Will be extended when implementing collective operation on the wide dma link + typedef struct packed { + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; + } user_dma_t; + + localparam int unsigned NarrowUserWidth = $bits(user_narrow_t); + localparam int unsigned WideUserWidth = $bits(user_dma_t); + + `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_narrow_t) + `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_narrow_t) + 
`AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(wide_out, addr_t, wide_out_id_t, data_dma_t, strb_dma_t, user_dma_t) + + typedef logic [TcdmAddrWidth-1:0] tcdm_addr_t; + + `TCDM_TYPEDEF_ALL(tcdm_dma, tcdm_addr_t, data_dma_t, strb_dma_t, logic) + + function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_cached_regions(); + automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] cached_regions; + cached_regions = '{default: '0}; + cached_regions[0] = '{base: 48'h70000000, mask: 48'hfffff0000000}; + return cached_regions; + endfunction + + localparam snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{ + NrCachedRegionRules: 1, + CachedRegion: get_cached_regions(), + default: 0 + }; + + localparam fpnew_pkg::fpu_implementation_t FPUImplementation [9] = '{ + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // 
FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: 
fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + 
fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + 
fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }, + '{ + PipeRegs: // FMA Block + '{ + '{ 2, // FP32 + 3, // FP64 + 1, // FP16 + 1, // FP8 + 1, // FP16alt + 1 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, + 1, + 1, + 1, + 1, + 1}, // NONCOMP + '{2, + 2, + 2, + 2, + 2, + 2}, // CONV + '{3, + 3, + 3, + 3, + 3, + 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + 
fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + } + }; + + localparam snitch_ssr_pkg::ssr_cfg_t [3-1:0] SsrCfgs [9] = '{ + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{'{1, 0, 0, 1, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 1, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}, + '{1, 1, 0, 0, 1, 1, 4, 17, 17, 3, 4, 3, 8, 4, 3}}, + '{/*None*/ '0, + /*None*/ '0, + /*None*/ '0} + }; + + localparam logic [3-1:0][4:0] SsrRegs [9] = '{ + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{2, 1, 0}, + '{/*None*/ 0, /*None*/ 0, /*None*/ 0} + }; + + // Forward 
potentially optional configuration parameters + localparam logic [9:0] CfgBaseHartId = (10'h1); + localparam addr_t CfgClusterBaseAddr = (48'h20000000); + localparam addr_t CfgClusterBaseOffset = (48'h40000); + +endpackage +// verilog_lint: waive-stop package-filename diff --git a/hw/test/floo_reduction_offloads.sv b/hw/test/floo_reduction_offloads.sv new file mode 100644 index 00000000..8f4410bc --- /dev/null +++ b/hw/test/floo_reduction_offloads.sv @@ -0,0 +1,338 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Raphael Roth + +// This module allows to implement a reduction HW to simulate a reduction operation. +// Simple Testbench implementation! + +// Open Points: + +`include "common_cells/assertions.svh" + +// This Wrapper allows to wrap 8x 64 Bit (512 Bits) in parallel +module floo_reduction_wrapper import floo_pkg::*; #( + parameter type RdData_t = logic, + parameter int unsigned RdElements = 8, + parameter bit FPU_ACTIVE = 1'b0, + parameter bit ALU_ACTIVE = 1'b0, + parameter bit DEBUG_PRINT_TRACE = 1'b0 +) ( + input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// IF towards external FPU + input RdData_t reduction_req_op1_i, + input RdData_t reduction_req_op2_i, + input collect_op_t reduction_req_type_i, + input logic reduction_req_valid_i, + output logic reduction_req_ready_o, + /// IF from external FPU + output RdData_t reduction_resp_data_o, + output logic reduction_resp_valid_o, + input logic reduction_resp_ready_i +); + + // Parameter + localparam int unsigned FLEN = 64; + + // Variable + logic [RdElements] comp_req_valid; + logic [RdElements] comp_req_ready; + logic [RdElements] comp_resp_valid; + logic [RdElements] comp_resp_ready; + + // Fork the hadshaking + stream_fork #( + .N_OUP (RdElements) + ) i_dca_fork_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (reduction_req_valid_i), + .ready_o 
(reduction_req_ready_o), + .valid_o (comp_req_valid), + .ready_i (comp_req_ready) + ); + + // Implement FPU(s) + for (genvar i = 0; i < RdElements; i++) begin : gen_fpu_metadata + + // Generate the FPU + if(FPU_ACTIVE == 1'b1) begin + floo_reduction_fpu #( + .ID (i), + .DEBUG_PRINT_TRACE (DEBUG_PRINT_TRACE) + ) i_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .fpu_req_op1_i (reduction_req_op1_i[(FLEN*(i+1))-1:FLEN*i]), + .fpu_req_op2_i (reduction_req_op2_i[(FLEN*(i+1))-1:FLEN*i]), + .fpu_req_type_i (reduction_req_type_i), + .fpu_req_valid_i (comp_req_valid[i]), + .fpu_req_ready_o (comp_req_ready[i]), + .fpu_resp_data_o (reduction_resp_data_o[(FLEN*(i+1)-1):FLEN*i]), + .fpu_resp_valid_o (comp_resp_valid[i]), + .fpu_resp_ready_i (comp_resp_ready[i]) + ); + end + + // Generate the ALU + if(ALU_ACTIVE == 1'b1) begin + floo_reduction_alu #( + .ID (i), + .DEBUG_PRINT_TRACE (DEBUG_PRINT_TRACE) + ) i_alu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .alu_req_op1_i (reduction_req_op1_i[(FLEN*(i+1))-1:FLEN*i]), + .alu_req_op2_i (reduction_req_op2_i[(FLEN*(i+1))-1:FLEN*i]), + .alu_req_type_i (reduction_req_type_i), + .alu_req_valid_i (comp_req_valid[i]), + .alu_req_ready_o (comp_req_ready[i]), + .alu_resp_data_o (reduction_resp_data_o[(FLEN*(i+1)-1):FLEN*i]), + .alu_resp_valid_o (comp_resp_valid[i]), + .alu_resp_ready_i (comp_resp_ready[i]) + ); + end + + end + + // Join all the signal together + stream_join #( + .N_INP (RdElements) + ) i_dca_join_fpu ( + .inp_valid_i (comp_resp_valid), + .inp_ready_o (comp_resp_ready), + .oup_valid_o (reduction_resp_valid_o), + .oup_ready_i (reduction_resp_ready_i) + ); + + // Sanity Check + `ASSERT_INIT(Invalid_ALU_or_FPU, !((FPU_ACTIVE ^ ALU_ACTIVE) == 1'b0)) + `ASSERT_INIT(Invalid_Config, !($bits(RdData_t) != (RdElements*FLEN))) + +endmodule + +// Floating Point Reduction +module floo_reduction_fpu import floo_pkg::*; #( + parameter int unsigned ID = 0, + parameter bit DEBUG_PRINT_TRACE = 1'b0 +) ( + 
input logic clk_i, + input logic rst_ni, + input logic flush_i, + /// IF towards external FPU + input logic[63:0] fpu_req_op1_i, + input logic[63:0] fpu_req_op2_i, + input collect_op_t fpu_req_type_i, + input logic fpu_req_valid_i, + output logic fpu_req_ready_o, + /// IF from external FPU + output logic[63:0] fpu_resp_data_o, + output logic fpu_resp_valid_o, + input logic fpu_resp_ready_i +); + + /* All local parameter */ + + // FPU Configuration + localparam fpnew_pkg::fpu_features_t FPUFeatures = '{ + Width: 64, + EnableVectors: 1'b1, + EnableNanBox: 1'b1, + FpFmtMask: {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1}, //{RVF, RVD, XF16, XF8, XF16ALT, XF8ALT}, + IntFmtMask: {1'b1, 1'b1, 1'b1, 1'b1} //{XFVEC && (XF8 || XF8ALT), XFVEC && (XF16 || XF16ALT), 1'b1, 1'b0} + }; + + // FPU Implementation copied from the generated code (quite messy) + localparam fpnew_pkg::fpu_implementation_t FPUImplementation [1] = '{ + '{ + PipeRegs: + '{'{2, 3, 1, 1, 1, 1}, // FMA Block + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{1, 1, 1, 1, 1, 1}, // NONCOMP + '{2, 2, 2, 2, 2, 2}, // CONV + '{3, 3, 3, 3, 3, 3} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED, fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, fpnew_pkg::PARALLEL, fpnew_pkg::PARALLEL, fpnew_pkg::PARALLEL, fpnew_pkg::PARALLEL, fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED, fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + } + }; + + /* All Typedef Vars */ + typedef struct packed { + logic [2:0][63:0] operands; + fpnew_pkg::roundmode_e rnd_mode; + fpnew_pkg::operation_e op; + logic op_mod, + 
fpnew_pkg::fp_format_e src_fmt; + fpnew_pkg::fp_format_e dst_fmt; + fpnew_pkg::int_format_e int_fmt; + logic vectorial_op; + } fpu_in_t; + + typedef struct packed { + logic [63:0] result; + logic [4:0] status; + } fpu_out_t; + + /* Variable declaration */ + fpu_in_t fpu_in; + fpu_out_t fpu_out; + + /* Module Declaration */ + + // Parse the FPU Request + always_comb begin + // Init default values + fpu_in = '0; + + // Set default Values + fpu_in.src_fmt = fpnew_pkg::FP64; + fpu_in.dst_fmt = fpnew_pkg::FP64; + fpu_in.int_fmt = fpnew_pkg::INT64; + fpu_in.vectorial_op = 1'b0; + fpu_in.op_mod = 1'b0; + fpu_in.rnd_mode = fpnew_pkg::RNE; + fpu_in.op = fpnew_pkg::ADD; + + // Define the operation we want to execute on the FPU + unique casez (fpu_req_type_i) + (floo_pkg::F_Add) : begin + fpu_in.op = fpnew_pkg::ADD; + fpu_in.operands[0] = '0; + fpu_in.operands[1] = fpu_req_op1_i; + fpu_in.operands[2] = fpu_req_op2_i; + end + (floo_pkg::F_Mul) : begin + fpu_in.op = fpnew_pkg::MUL; + fpu_in.operands[0] = fpu_req_op1_i; + fpu_in.operands[1] = fpu_req_op2_i; + fpu_in.operands[2] = '0; + end + (floo_pkg::F_Max) : begin + fpu_in.op = fpnew_pkg::MINMAX; + fpu_in.rnd_mode = fpnew_pkg::RNE; + fpu_in.operands[0] = fpu_req_op1_i; + fpu_in.operands[1] = fpu_req_op2_i; + fpu_in.operands[2] = '0; + end + (floo_pkg::F_Min) : begin + fpu_in.op = fpnew_pkg::MINMAX; + fpu_in.rnd_mode = fpnew_pkg::RTZ; + fpu_in.operands[0] = fpu_req_op1_i; + fpu_in.operands[1] = fpu_req_op2_i; + fpu_in.operands[2] = '0; + end + default : begin + fpu_in.op = fpnew_pkg::ADD; + fpu_in.operands[0] = '0; + fpu_in.operands[1] = '0; + fpu_in.operands[2] = '0; + end + endcase + end + + // Instanciate the FPU as single element + fpnew_top #( + // FPU configuration + .Features (FPUFeatures), + .Implementation (FPUImplementation[0]), + .TagType (logic), + .CompressedVecCmpResult (1), + .StochasticRndImplementation (fpnew_pkg::DEFAULT_RSR) + ) i_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .hart_id_i ('0), + .operands_i 
(fpu_in.operands), + .rnd_mode_i (fpu_in.rnd_mode), + .op_i (fpu_in.op), + .op_mod_i (fpu_in.op_mod), + .src_fmt_i (fpu_in.src_fmt), + .dst_fmt_i (fpu_in.dst_fmt), + .int_fmt_i (fpu_in.int_fmt), + .vectorial_op_i (fpu_in.vectorial_op), + .tag_i ('0), + .simd_mask_i ('1), + .in_valid_i (fpu_req_valid_i), + .in_ready_o (fpu_req_ready_o), + .flush_i (flush_i), + .result_o (fpu_out.result), + .status_o (fpu_out.status), + .tag_o (), + .out_valid_o (fpu_resp_valid_o), + .out_ready_i (fpu_resp_ready_i), + .busy_o () + ); + + // Provide the data to the output + assign fpu_resp_data_o = fpu_out.result; + + // Print the Status info + if(DEBUG_PRINT_TRACE) begin + int cnt_in; + int cnt_out; + initial begin + cnt_in = 0; + cnt_out = 0; + while(1) begin + @(posedge clk_i); + // Print the incoming operation + if((fpu_req_valid_i == 1'b1) && (fpu_req_ready_o == 1'b1)) begin + $display($time, " [FPU %1d - Itr %1d] > FPU Ops: [%f, %f] FPU Op: %s", ID, cnt_in, fpu_req_op1_i, fpu_req_op2_i, genOp(fpu_req_type_i)); + cnt_in = cnt_in + 1; + end + + // Print Result / Status of FPU + if((fpu_resp_valid_o == 1'b1) && (fpu_resp_ready_i == 1'b1)) begin + $display($time, " [FPU %1d - Itr %1d] > FPU Result: %f FPU Status: %s", ID, cnt_out, fpu_out.result, genBitRep(fpu_out.status)); + cnt_out = cnt_out + 1; + end + end + end + + // Helper Function to generate Bitstring + function string genBitRep (logic [4:0] in); + string retVal; + retVal = "B"; + for(int i = 0; i < 5; i++) begin + if(in[4-i] == 1'b1) begin + retVal = {retVal, "1"}; + end else begin + retVal = {retVal, "0"}; + end + end + return retVal; + endfunction + + function string genOp (reduction_op_t type_reduction); + string retVal; + retVal = ""; + unique casez (type_reduction) + (floo_pkg::F_Add) : begin + retVal = "FAdd"; + end + (floo_pkg::F_Mul) : begin + retVal = "FMul"; + end + (floo_pkg::F_Max) : begin + retVal = "FMax"; + end + (floo_pkg::F_Min) : begin + retVal = "FMin"; + end + endcase + return retVal; + endfunction + 
end + +endmodule + diff --git a/hw/test/floo_test_pkg.sv b/hw/test/floo_test_pkg.sv index 376e16eb..92a31f05 100644 --- a/hw/test/floo_test_pkg.sv +++ b/hw/test/floo_test_pkg.sv @@ -33,10 +33,7 @@ package floo_test_pkg; IdAddrOffset: 0, NumSamRules: 1, NumRoutes: 1, - EnMultiCast: 1'b0, - EnParallelReduction: 1'b0, - EnNarrowOffloadReduction: 1'b0, - EnWideOffloadReduction: 1'b0 + CollectiveCfg: floo_pkg::CollectiveDefaultCfg }; // Common chimney parameters From 305f6aec0785c33830ee117eed4bc02a2eeb97d2 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 21 Nov 2025 12:12:00 +0100 Subject: [PATCH 16/17] hw: Align reduction to VC support --- hw/floo_nw_chimney.sv | 3 +- hw/floo_nw_router.sv | 9 +- hw/floo_pkg.sv | 12 -- hw/floo_router.sv | 25 ++-- hw/synth/floo_synth_nw_2tiles.sv | 213 ++++++++++++++++++++++++++++++ hw/synth/floo_synth_nw_chimney.sv | 4 +- hw/synth/floo_synth_nw_router.sv | 56 ++++---- hw/synth/floo_synth_params_pkg.sv | 7 - 8 files changed, 258 insertions(+), 71 deletions(-) create mode 100644 hw/synth/floo_synth_nw_2tiles.sv diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index 70e03a05..e6af59a6 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -84,7 +84,6 @@ module floo_nw_chimney #( parameter type floo_rsp_t = logic, /// Floo `wide` link type parameter type floo_wide_t = logic, - parameter type floo_wide_in_t = logic, /// SRAM configuration type `tc_sram_impl` in RoB /// Only used if technology-dependent SRAM is used parameter type sram_cfg_t = logic, @@ -118,7 +117,7 @@ module floo_nw_chimney #( /// Input links from NoC input floo_req_t floo_req_i, input floo_rsp_t floo_rsp_i, - input floo_wide_in_t floo_wide_i + input floo_wide_t floo_wide_i ); import floo_pkg::*; diff --git a/hw/floo_nw_router.sv b/hw/floo_nw_router.sv index b7dfe620..024711cd 100644 --- a/hw/floo_nw_router.sv +++ b/hw/floo_nw_router.sv @@ -2,7 +2,7 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
// SPDX-License-Identifier: SHL-0.51 // -// Author: Tim Fischer +// Author: Lorenzo Leone `include "axi/typedef.svh" `include "floo_noc/typedef.svh" @@ -51,7 +51,6 @@ module floo_nw_router #( parameter type floo_rsp_t = logic, /// Floo `wide` link type parameter type floo_wide_t = logic, - parameter type floo_wide_out_t = logic, /// Possible operation for offloading (must match type in header) parameter type RdWideOperation_t = logic, parameter type RdNarrowOperation_t = logic, @@ -82,7 +81,7 @@ module floo_nw_router #( output floo_req_t [NumOutputs-1:0] floo_req_o, output floo_rsp_t [NumInputs-1:0] floo_rsp_o, input floo_wide_t [NumRoutes-1:0] floo_wide_i, - output floo_wide_out_t [NumRoutes-1:0] floo_wide_o, + output floo_wide_t [NumRoutes-1:0] floo_wide_o, /// Wide IF towards the offload logic output RdWideOperation_t offload_wide_req_op_o, output RdWideData_t offload_wide_req_operand1_o, @@ -125,10 +124,6 @@ module floo_nw_router #( axi_wide_in_id_t, axi_wide_data_t, axi_wide_strb_t, axi_wide_user_t) `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow, axi_wide, AxiCfgN, AxiCfgW, hdr_t) - // Local parameters to properly configure collective operation support - // hiding the complexity to the user - localparam int unsigned WideVirtChannel = (EnDecoupledRW) ? 
2 : 1; - localparam floo_pkg::collect_op_be_cfg_t CollectiveReqCfg = '{ EnMulticast : CollectiveOpCfg.EnNarrowMulticast, EnLSBAnd : CollectiveOpCfg.EnLSBAnd, diff --git a/hw/floo_pkg.sv b/hw/floo_pkg.sv index af5922fa..be1b2622 100644 --- a/hw/floo_pkg.sv +++ b/hw/floo_pkg.sv @@ -117,18 +117,6 @@ package floo_pkg; Phys = 2'd2 } wide_rw_decouple_e; - /// The types of collective communication - typedef enum logic [1:0] { - /// Normal communication - Unicast = 2'd0, - /// Multicast communication - Multicast = 2'd1, - /// Parallel reduction operations - ParallelReduction = 2'd2, - /// Offload Reduction - OffloadReduction = 2'd3 - } collect_comm_e; - /// TODO(lleone): delet this portion of code // /// The types of collective communication // typedef enum logic [1:0] { diff --git a/hw/floo_router.sv b/hw/floo_router.sv index e393842a..d7375ad6 100644 --- a/hw/floo_router.sv +++ b/hw/floo_router.sv @@ -38,13 +38,9 @@ module floo_router /// Disables Y->X connections in XYRouting parameter bit XYRouteOpt = 1'b1, /// Disables loopback connections - parameter bit NoLoopback = 1'b1, + parameter bit NoLoopback = 1'b1, /// Select VC implementation - parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, - /// Enable Multicast feature - parameter bit EnMultiCast = 1'b0, - /// Enable reduction feature - parameter bit EnReduction = 1'b0, + parameter floo_pkg::vc_impl_e VcImpl = floo_pkg::VcNaive, /// Various types parameter type addr_rule_t = logic, parameter type flit_t = logic, @@ -190,6 +186,7 @@ module floo_router end end + // Var for the "normal" dataflow without any reduction logic [NumInput-1:0][NumVirtChannels-1:0] cross_valid, cross_ready; @@ -288,7 +285,7 @@ module floo_router assign red_offload_route_selected[in] = red_route_selected[in][0]; assign red_offload_expected_in_route_loopback[in] = red_expected_in_route_loopback[in][0]; end - if (EnCollVirtChannel) begin + if (NumVirtChannels > 1) begin for (genvar in = 0; in < NumInput; in++) begin: gen_vc1_tied assign 
red_ready_in[in][1] = '0; // Tied to zero the ready from offload unit to VC1 end @@ -333,7 +330,7 @@ module floo_router end // Tie down all unused signals - if(EnCollVirtChannel) begin + if(NumVirtChannels > 1) begin for (genvar out = 0; out < NumOutput; out++) begin assign red_data_out[out][1] = '0; assign red_valid_out[out][1] = '0; @@ -495,7 +492,9 @@ module floo_router end end - // Arbitrate virtual channels onto the physical channel + // At the end point, we cannot make valid dependent on ready. + // However, this is the case in the `floo_vc_arbiter`. + // For this reason, there must be cuts at the input of the endpoint. floo_vc_arbiter #( .NumVirtChannels ( NumVirtChannels ), .flit_t ( flit_t ), @@ -504,11 +503,9 @@ module floo_router ) i_vc_arbiter ( .clk_i, .rst_ni, - .valid_i ( out_buffered_valid[out] ), .ready_o ( out_buffered_ready[out] ), .data_i ( out_buffered_data [out] ), - .ready_i ( ready_i [out] ), .valid_o ( valid_o [out] ), .data_o ( data_o [out] ), @@ -557,7 +554,7 @@ module floo_router // If you have offload reduction and more than one virtual channel, // the reduction traffic must arrive from Virtual Channel 0 - if (EnSequentialReduction && (NumVirtChannels > 1) && EnCollVirtChannel) begin: gen_vc_red + if (EnSequentialReduction && (NumVirtChannels > 1)) begin: gen_vc_red for (genvar in = 0; in < NumInput; in++) begin `ASSERT(CollOpReceivedOnWrongVirtChannel, !red_valid_in[in][1]) end @@ -571,7 +568,7 @@ module floo_router `ASSERT_INIT(SupportAXI, !EnSequentialReduction || RedCfg.RdSupportAxi) // We can not support Loopback when you have reduction and the NoLoopback option is disabled `ASSERT_INIT(SupportLoopback, !(EnSequentialReduction && RedCfg.RdSupportLoopback && NoLoopback)) - // We cannot support sequential reduction with multiple VC if EnCollVirtChannel is not set - `ASSERT_INIT(NoRedVcSupport, !(EnSequentialReduction && (NumVirtChannels > 1) && !EnCollVirtChannel)) + // We cannot support sequential reduction with multiple VC + // 
`ASSERT_INIT(NoRedVcSupport, !(EnSequentialReduction && (NumVirtChannels > 1))) endmodule diff --git a/hw/synth/floo_synth_nw_2tiles.sv b/hw/synth/floo_synth_nw_2tiles.sv new file mode 100644 index 00000000..2413325f --- /dev/null +++ b/hw/synth/floo_synth_nw_2tiles.sv @@ -0,0 +1,213 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Tim Fischer +// +// This wrapper wants to simulate a floorplan with 2 tiles. It basically connects the two routers +// west <-> east ports together. For this reason, at the interface there will be twice the number of +// ports compared to a single tile. +// +module floo_synth_nw_2tiles + import floo_pkg::*; + import floo_synth_params_pkg::*; + import floo_synth_nw_pkg::*; + import floo_synth_collective_pkg::*; +#( + parameter int unsigned NumPorts = int'(floo_pkg::NumDirections), + parameter int unsigned NumWideVirtChannel = 1, + parameter int unsigned NumWidePhysChannel = 1, + parameter int unsigned VcImpl = 32'd0 +) ( + input logic clk_i, + input logic rst_ni, + input logic test_enable_i, + + input id_t id_1_i, + input logic id_route_map_1_i, + + input id_t id_0_i, + input logic id_route_map_0_i, + + input floo_req_t [NumPorts-2:0] floo_req_1_i, + input floo_rsp_t [NumPorts-2:0] floo_rsp_1_i, + output floo_req_t [NumPorts-2:0] floo_req_1_o, + output floo_rsp_t [NumPorts-2:0] floo_rsp_1_o, + input floo_wide_t [NumPorts-2:0] floo_wide_1_i, + output floo_wide_t [NumPorts-2:0] floo_wide_1_o, + input floo_req_t [NumPorts-2:0] floo_req_0_i, + input floo_rsp_t [NumPorts-2:0] floo_rsp_0_i, + output floo_req_t [NumPorts-2:0] floo_req_0_o, + output floo_rsp_t [NumPorts-2:0] floo_rsp_0_o, + input floo_wide_t [NumPorts-2:0] floo_wide_0_i, + output floo_wide_t [NumPorts-2:0] floo_wide_0_o +); + +// Intermediate signals to connect the two virtual tiles +floo_req_t [NumPorts-1:0] floo_req_1_in; +floo_rsp_t [NumPorts-1:0]
floo_rsp_1_in; +floo_req_t [NumPorts-1:0] floo_req_1_out; +floo_rsp_t [NumPorts-1:0] floo_rsp_1_out; +floo_wide_t [NumPorts-1:0] floo_wide_1_in; +floo_wide_t [NumPorts-1:0] floo_wide_1_out; + +floo_req_t [NumPorts-1:0] floo_req_0_in; +floo_rsp_t [NumPorts-1:0] floo_rsp_0_in; +floo_req_t [NumPorts-1:0] floo_req_0_out; +floo_rsp_t [NumPorts-1:0] floo_rsp_0_out; +floo_wide_t [NumPorts-1:0] floo_wide_0_in; +floo_wide_t [NumPorts-1:0] floo_wide_0_out; + +localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::vc_impl_e'(VcImpl); + +// Tile 1 +for (genvar p = 0; p < NumPorts; p++) begin + if (p != West) begin + assign floo_req_1_in[p] = floo_req_1_i[tile1_idx_map(p)]; + assign floo_rsp_1_in[p] = floo_rsp_1_i[tile1_idx_map(p)]; + assign floo_rsp_1_o[tile1_idx_map(p)] = floo_rsp_1_out[p]; + assign floo_req_1_o[tile1_idx_map(p)] = floo_req_1_out[p]; + + assign floo_wide_1_in[p] = floo_wide_1_i[tile1_idx_map(p)]; + assign floo_wide_1_o[tile1_idx_map(p)] = floo_wide_1_out[p]; + end +end + +// Tile 0 +for (genvar p = 0; p < NumPorts; p++) begin + if (p != East) begin + assign floo_req_0_in[p] = floo_req_0_i[tile0_idx_map(p)]; + assign floo_rsp_0_in[p] = floo_rsp_0_i[tile0_idx_map(p)]; + assign floo_rsp_0_o[tile0_idx_map(p)] = floo_rsp_0_out[p]; + assign floo_req_0_o[tile0_idx_map(p)] = floo_req_0_out[p]; + + assign floo_wide_0_in[p] = floo_wide_0_i[tile0_idx_map(p)]; + assign floo_wide_0_o[tile0_idx_map(p)] = floo_wide_0_out[p]; + end +end + +assign floo_req_0_in[East] = floo_req_1_out[West]; +assign floo_rsp_1_in[West] = floo_rsp_0_out[East]; +assign floo_req_1_in[West] = floo_req_0_out[East]; +assign floo_rsp_0_in[East] = floo_rsp_1_out[West]; + +assign floo_wide_0_in[East] = floo_wide_1_out[West]; +assign floo_wide_1_in[West] = floo_wide_0_out[East]; + +floo_nw_router #( + .AxiCfgN ( AxiCfgN ), + .AxiCfgW ( AxiCfgW ), + .RouteAlgo ( RouteCfg.RouteAlgo ), + .NumRoutes ( NumPorts ), + .NumAddrRules ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + 
.XYRouteOpt ( 1'b0 ), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), + .VcImplementation (VcImplementation), + .id_t ( id_t ), + .hdr_t ( hdr_t ), + .floo_req_t ( floo_req_t ), + .floo_rsp_t ( floo_rsp_t ), + .floo_wide_t ( floo_wide_t ) +) i_floo_nw_router1 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_enable_i ( test_enable_i ), + .id_i ( id_1_i ), + .id_route_map_i ( id_route_map_1_i ), + .floo_req_i ( floo_req_1_in ), + .floo_rsp_i ( floo_rsp_1_in ), + .floo_req_o ( floo_req_1_out ), + .floo_rsp_o ( floo_rsp_1_out ), + .floo_wide_i ( floo_wide_1_in ), + .floo_wide_o ( floo_wide_1_out ), + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () +); + +floo_nw_router #( + .AxiCfgN ( AxiCfgN ), + .AxiCfgW ( AxiCfgW ), + .RouteAlgo ( RouteCfg.RouteAlgo ), + .NumRoutes ( NumPorts ), + .NumAddrRules ( 1 ), + .InFifoDepth ( InFifoDepth ), + .OutFifoDepth ( OutFifoDepth ), + .XYRouteOpt ( 1'b0 ), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), + .VcImplementation (VcImplementation), + .id_t ( id_t ), + .hdr_t ( hdr_t ), + .floo_req_t ( floo_req_t ), + .floo_rsp_t ( floo_rsp_t ), + .floo_wide_t ( floo_wide_t ) +) i_floo_nw_router0 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_enable_i ( test_enable_i ), + .id_i ( id_0_i ), + .id_route_map_i ( id_route_map_0_i ), + .floo_req_i ( floo_req_0_in ), + .floo_rsp_i ( floo_rsp_0_in ), + .floo_req_o ( 
floo_req_0_out ), + .floo_rsp_o ( floo_rsp_0_out ), + .floo_wide_i ( floo_wide_0_in ), + .floo_wide_o ( floo_wide_0_out ), + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () +); + +function automatic int tile0_idx_map(route_direction_e dir); + case (dir) + North: return 0; + // East: return 1; + South: return 1; + West: return 2; + Eject: return 3; + endcase +endfunction + +function automatic int tile1_idx_map(route_direction_e dir); + case (dir) + North: return 0; + East: return 1; + South: return 2; + // West: return 2; + Eject: return 3; + endcase +endfunction + +endmodule diff --git a/hw/synth/floo_synth_nw_chimney.sv b/hw/synth/floo_synth_nw_chimney.sv index c450ce17..8c24be4c 100644 --- a/hw/synth/floo_synth_nw_chimney.sv +++ b/hw/synth/floo_synth_nw_chimney.sv @@ -31,7 +31,7 @@ module floo_synth_nw_chimney input floo_req_t floo_req_i, input floo_rsp_t floo_rsp_i, output floo_wide_t floo_wide_o, - input floo_wide_double_t floo_wide_i + input floo_wide_t floo_wide_i ); localparam floo_pkg::route_cfg_t RouteCfgColl = (EnCollective) ? CollectRouteCfg : RouteCfg; @@ -44,6 +44,7 @@ localparam floo_pkg::route_cfg_t RouteCfgColl = (EnCollective) ? CollectRouteCfg .RouteCfg ( RouteCfgColl ), //TODO (lleone): change to enable multicast/collective .AtopSupport ( AtopSupport ), .EnDecoupledRW ( 1'b1 ), + .NumWidePhysChannels (1), .MaxAtomicTxns ( MaxAtomicTxns ), // SAM? 
.id_t ( id_t ), @@ -65,7 +66,6 @@ localparam floo_pkg::route_cfg_t RouteCfgColl = (EnCollective) ? CollectRouteCfg .floo_req_t ( floo_req_t ), .floo_rsp_t ( floo_rsp_t ), .floo_wide_t ( floo_wide_t ), - .floo_wide_in_t ( floo_wide_double_t ), .user_narrow_struct_t ( collective_narrow_user_t), .user_wide_struct_t ( collective_wide_user_t) ) i_floo_nw_chimney ( diff --git a/hw/synth/floo_synth_nw_router.sv b/hw/synth/floo_synth_nw_router.sv index 4d0b661e..ec858353 100644 --- a/hw/synth/floo_synth_nw_router.sv +++ b/hw/synth/floo_synth_nw_router.sv @@ -13,7 +13,9 @@ module floo_synth_nw_router parameter int unsigned NumPorts = int'(floo_pkg::NumDirections), parameter int unsigned EnCollective = 0, parameter int unsigned EnNarrOffload = 0, - parameter int unsigned EnWideOffload = 0 + parameter int unsigned EnWideOffload = 0, + parameter int unsigned NumWideVirtChannel = 1, + parameter int unsigned NumWidePhysChannel = 1 ) ( input logic clk_i, input logic rst_ni, @@ -27,27 +29,27 @@ module floo_synth_nw_router output floo_req_t [NumPorts-1:0] floo_req_o, output floo_rsp_t [NumPorts-1:0] floo_rsp_o, input floo_wide_t [NumPorts-1:0] floo_wide_i, - output floo_wide_double_t [NumPorts-1:0] floo_wide_o, - /// Wide IF towards the offload logic - output floo_pkg::collect_op_e offload_wide_req_op_o, - output RdDataWide_t offload_wide_req_operand1_o, - output RdDataWide_t offload_wide_req_operand2_o, - output logic offload_wide_req_valid_o, - input logic offload_wide_req_ready_i, - /// Wide IF from external FPU - input RdDataWide_t offload_wide_resp_result_i, - input logic offload_wide_resp_valid_i, - output logic offload_wide_resp_ready_o, - /// Narrow IF towards the offload logic - output floo_pkg::collect_op_e offload_narrow_req_op_o, - output RdDataNarrow_t offload_narrow_req_operand1_o, - output RdDataNarrow_t offload_narrow_req_operand2_o, - output logic offload_narrow_req_valid_o, - input logic offload_narrow_req_ready_i, - /// Narrow IF from external FPU - input 
RdDataNarrow_t offload_narrow_resp_result_i, - input logic offload_narrow_resp_valid_i, - output logic offload_narrow_resp_ready_o + output floo_wide_t [NumPorts-1:0] floo_wide_o + // /// Wide IF towards the offload logic + // output floo_pkg::collect_op_e offload_wide_req_op_o, + // output RdDataWide_t offload_wide_req_operand1_o, + // output RdDataWide_t offload_wide_req_operand2_o, + // output logic offload_wide_req_valid_o, + // input logic offload_wide_req_ready_i, + // /// Wide IF from external FPU + // input RdDataWide_t offload_wide_resp_result_i, + // input logic offload_wide_resp_valid_i, + // output logic offload_wide_resp_ready_o, + // /// Narrow IF towards the offload logic + // output floo_pkg::collect_op_e offload_narrow_req_op_o, + // output RdDataNarrow_t offload_narrow_req_operand1_o, + // output RdDataNarrow_t offload_narrow_req_operand2_o, + // output logic offload_narrow_req_valid_o, + // input logic offload_narrow_req_ready_i, + // /// Narrow IF from external FPU + // input RdDataNarrow_t offload_narrow_resp_result_i, + // input logic offload_narrow_resp_valid_i, + // output logic offload_narrow_resp_ready_o ); @@ -65,13 +67,13 @@ if (!EnCollective) begin .InFifoDepth ( InFifoDepth ), .OutFifoDepth ( OutFifoDepth ), .XYRouteOpt ( 1'b0 ), - .EnDecoupledRW ( 1'b1 ), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), .id_t ( id_t ), .hdr_t ( hdr_t ), .floo_req_t ( floo_req_t ), .floo_rsp_t ( floo_rsp_t ), - .floo_wide_t ( floo_wide_t ), - .floo_wide_out_t (floo_wide_double_t) + .floo_wide_t ( floo_wide_t ) ) i_floo_nw_router ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), @@ -113,13 +115,13 @@ end else begin .OutFifoDepth ( OutFifoDepth ), .XYRouteOpt ( 1'b0 ), .NoLoopback (1'b0), - .EnDecoupledRW (1'b1), + .NumWideVirtChannels (NumWideVirtChannel), + .NumWidePhysChannels (NumWidePhysChannel), .id_t ( id_t ), .hdr_t ( hdr_coll_t ), .floo_req_t ( floo_req_t ), .floo_rsp_t ( floo_rsp_t ), .floo_wide_t ( floo_wide_t ), 
- .floo_wide_out_t (floo_wide_double_t), .RdWideOperation_t (floo_pkg::collect_op_e), .RdNarrowOperation_t (floo_pkg::collect_op_e), .RdWideData_t (RdDataWide_t), diff --git a/hw/synth/floo_synth_params_pkg.sv b/hw/synth/floo_synth_params_pkg.sv index 66ad8789..b77f7afc 100644 --- a/hw/synth/floo_synth_params_pkg.sv +++ b/hw/synth/floo_synth_params_pkg.sv @@ -384,13 +384,6 @@ localparam reduction_cfg_t NarrowGenReductionCfg = '{ typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; typedef logic[AxiCfgN.DataWidth-1:0] RdDataNarrow_t; - // TODO(lleone): Each field must become [1:0] when testing VC - typedef struct packed { - logic [1:0] valid; - logic [1:0] ready; - floo_wide_chan_t [1:0] wide; - } floo_wide_double_t; - `FLOO_TYPEDEF_HDR_T(hdr_coll_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) // `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) From c75af692cec6b15ded4ac48e5b217e7e5b42a5a1 Mon Sep 17 00:00:00 2001 From: Kevin Greig Date: Sun, 1 Mar 2026 04:11:51 -0500 Subject: [PATCH 17/17] hw: Align chimney and cut modules to updated floo_pkg The floo_pkg was restructured with new types and nested configs, but the chimney, cut, and generator modules were not updated to match. Type/enum renames (floo_pkg restructuring): - ParallelReduction -> CollectB in both chimneys - RouteCfg.EnMultiCast -> derived localparam from CollectiveCfg.OpCfg.EnNarrowMulticast | EnWideMulticast - .hdr.mask -> .hdr.collective_mask, .hdr.commtype -> .hdr.collective_op Generator fix (floogen): - routing.py: emit CollectiveCfg: CollectiveDefaultCfg instead of the old flat fields (EnMultiCast, EnParallelReduction, etc.) 
Port/parameter fixes: - Guard route_table_i against NumRoutes=0 unsigned underflow - floo_id_translation: add missing NoIndices parameter to addr_decode Pipeline/protocol fixes: - Add spill registers on chimney eject path (both chimneys) to prevent valid glitches between back-to-back AXI bursts violating StableValidIn - floo_cut: fix ready wiring for pass-through case (NumVirtChannels == NumPhysChannels, NumCuts > 1) where inverted ready causes data loss under backpressure --- Bender.yml | 2 +- floogen/model/routing.py | 5 +- hw/floo_axi_chimney.sv | 164 ++++++++++++++--------- hw/floo_cut.sv | 39 ++++++ hw/floo_id_translation.sv | 1 + hw/floo_nw_chimney.sv | 265 +++++++++++++++++++++----------------- 6 files changed, 293 insertions(+), 183 deletions(-) diff --git a/Bender.yml b/Bender.yml index e8cabc75..49b796cf 100644 --- a/Bender.yml +++ b/Bender.yml @@ -48,7 +48,7 @@ sources: - hw/floo_output_arbiter.sv # Level 4 - hw/floo_nw_join.sv - #- hw/floo_axi_chimney.sv + - hw/floo_axi_chimney.sv - hw/floo_nw_chimney.sv - hw/floo_router.sv # Level 5 (Wrappers) diff --git a/floogen/model/routing.py b/floogen/model/routing.py index 304964e1..efc2bf98 100644 --- a/floogen/model/routing.py +++ b/floogen/model/routing.py @@ -732,9 +732,6 @@ def render_route_cfg(self, name) -> str: self.route_algo == RouteAlgo.ID and not self.use_id_table else 0, "NumSamRules": len(self.sam), "NumRoutes": self.num_endpoints if self.route_algo == RouteAlgo.SRC else 0, - "EnMultiCast": bool_to_sv(self.en_multicast), - "EnParallelReduction": bool_to_sv(self.en_parallel_reduction), - "EnNarrowOffloadReduction": bool_to_sv(self.en_narrow_offload_reduction), - "EnWideOffloadReduction": bool_to_sv(self.en_wide_offload_reduction) + "CollectiveCfg": "CollectiveDefaultCfg" } return sv_param_decl(name, sv_struct_render(fields), dtype="route_cfg_t") diff --git a/hw/floo_axi_chimney.sv b/hw/floo_axi_chimney.sv index 2a372a09..c1173ab1 100644 --- a/hw/floo_axi_chimney.sv +++ b/hw/floo_axi_chimney.sv @@ 
-88,7 +88,7 @@ module floo_axi_chimney #( /// Coordinates/ID of the current tile input id_t id_i, /// Routing table for the current tile - input route_t [RouteCfg.NumRoutes-1:0] route_table_i, + input route_t [(RouteCfg.NumRoutes > 0 ? RouteCfg.NumRoutes-1 : 0):0] route_table_i, /// Output links to NoC output floo_req_t floo_req_o, output floo_rsp_t floo_rsp_o, @@ -99,6 +99,10 @@ module floo_axi_chimney #( import floo_pkg::*; + // Collective communication configuration + localparam floo_pkg::collect_op_fe_cfg_t CollectOpCfg = RouteCfg.CollectiveCfg.OpCfg; + localparam bit EnMultiCast = CollectOpCfg.EnNarrowMulticast | CollectOpCfg.EnWideMulticast; + typedef logic [AxiCfg.AddrWidth-1:0] axi_addr_t; typedef logic [AxiCfg.InIdWidth-1:0] axi_in_id_t; typedef logic [AxiCfg.OutIdWidth-1:0] axi_out_id_t; @@ -197,7 +201,7 @@ module floo_axi_chimney #( `AXI_ASSIGN_RESP_STRUCT(axi_in_rsp_o, axi_rsp_out) // Extract the multicast mask bits from the AXI user bits - if (RouteCfg.EnMultiCast) begin : gen_mask + if (EnMultiCast) begin : gen_mask user_struct_t user; assign user = axi_in_req_i.aw.user; assign axi_req_in_mask = user.mcast_mask; @@ -231,7 +235,7 @@ module floo_axi_chimney #( .valid_o ( axi_ar_queue_valid_out ), .ready_i ( axi_ar_queue_ready_in ) ); - if (RouteCfg.EnMultiCast) begin : gen_mask_cuts + if (EnMultiCast) begin : gen_mask_cuts spill_register #( .T (logic [AxiCfg.UserWidth-1:0]) ) i_usermask_queue ( @@ -517,7 +521,7 @@ module floo_axi_chimney #( `FFL(axi_aw_id_q, id_out[AxiAw], axi_aw_queue_valid_out && axi_aw_queue_ready_in, '0) - if (RouteCfg.EnMultiCast) begin : gen_mcast + if (EnMultiCast) begin : gen_mcast localparam int unsigned AddrWidth = $bits(axi_addr_t); axi_addr_t [NumAxiChannels-1:0] x_addr_mask; axi_addr_t [NumAxiChannels-1:0] y_addr_mask; @@ -548,8 +552,8 @@ module floo_axi_chimney #( assign mcast_mask[AxiAw] = mask_id[AxiAw]; assign mcast_mask[AxiAr] = '0; assign mcast_mask[AxiW] = axi_aw_mask_q; - assign mcast_mask[AxiR] = 
ar_out_hdr_out.hdr.mask; - assign mcast_mask[AxiB] = aw_out_hdr_out.hdr.mask; + assign mcast_mask[AxiR] = ar_out_hdr_out.hdr.collective_mask; + assign mcast_mask[AxiB] = aw_out_hdr_out.hdr.collective_mask; `FFL(axi_aw_mask_q, mcast_mask[AxiAw], axi_aw_queue_valid_out && axi_aw_queue_ready_in, '0) @@ -563,73 +567,73 @@ module floo_axi_chimney #( always_comb begin floo_axi_aw = '0; - floo_axi_aw.hdr.rob_req = aw_rob_req_out; - floo_axi_aw.hdr.rob_idx = aw_rob_idx_out; - floo_axi_aw.hdr.dst_id = dst_id[AxiAw]; - floo_axi_aw.hdr.mask = mcast_mask[AxiAw]; - floo_axi_aw.hdr.src_id = id_i; - floo_axi_aw.hdr.last = 1'b0; - floo_axi_aw.hdr.axi_ch = AxiAw; - floo_axi_aw.hdr.atop = axi_aw_queue.atop != axi_pkg::ATOP_NONE; - floo_axi_aw.payload = axi_aw_queue; - floo_axi_aw.hdr.commtype = (mcast_mask[AxiAw] != '0)? Multicast : Unicast; + floo_axi_aw.hdr.rob_req = aw_rob_req_out; + floo_axi_aw.hdr.rob_idx = aw_rob_idx_out; + floo_axi_aw.hdr.dst_id = dst_id[AxiAw]; + floo_axi_aw.hdr.collective_mask = mcast_mask[AxiAw]; + floo_axi_aw.hdr.src_id = id_i; + floo_axi_aw.hdr.last = 1'b0; + floo_axi_aw.hdr.axi_ch = AxiAw; + floo_axi_aw.hdr.atop = axi_aw_queue.atop != axi_pkg::ATOP_NONE; + floo_axi_aw.payload = axi_aw_queue; + floo_axi_aw.hdr.collective_op = (mcast_mask[AxiAw] != '0)? Multicast : Unicast; end always_comb begin floo_axi_w = '0; - floo_axi_w.hdr.rob_req = aw_rob_req_out; - floo_axi_w.hdr.rob_idx = aw_rob_idx_out; - floo_axi_w.hdr.dst_id = dst_id[AxiW]; - floo_axi_w.hdr.mask = mcast_mask[AxiW]; - floo_axi_w.hdr.src_id = id_i; - floo_axi_w.hdr.last = axi_req_in.w.last; - floo_axi_w.hdr.axi_ch = AxiW; - floo_axi_w.payload = axi_req_in.w; - floo_axi_w.hdr.commtype = (mcast_mask[AxiW] != '0)? 
Multicast : Unicast; + floo_axi_w.hdr.rob_req = aw_rob_req_out; + floo_axi_w.hdr.rob_idx = aw_rob_idx_out; + floo_axi_w.hdr.dst_id = dst_id[AxiW]; + floo_axi_w.hdr.collective_mask = mcast_mask[AxiW]; + floo_axi_w.hdr.src_id = id_i; + floo_axi_w.hdr.last = axi_req_in.w.last; + floo_axi_w.hdr.axi_ch = AxiW; + floo_axi_w.payload = axi_req_in.w; + floo_axi_w.hdr.collective_op = (mcast_mask[AxiW] != '0)? Multicast : Unicast; end always_comb begin floo_axi_ar = '0; - floo_axi_ar.hdr.rob_req = ar_rob_req_out; - floo_axi_ar.hdr.rob_idx = ar_rob_idx_out; - floo_axi_ar.hdr.dst_id = dst_id[AxiAr]; - floo_axi_ar.hdr.mask = mcast_mask[AxiAr]; - floo_axi_ar.hdr.src_id = id_i; - floo_axi_ar.hdr.last = 1'b1; - floo_axi_ar.hdr.axi_ch = AxiAr; - floo_axi_ar.payload = axi_ar_queue; - floo_axi_ar.hdr.commtype = '0; + floo_axi_ar.hdr.rob_req = ar_rob_req_out; + floo_axi_ar.hdr.rob_idx = ar_rob_idx_out; + floo_axi_ar.hdr.dst_id = dst_id[AxiAr]; + floo_axi_ar.hdr.collective_mask = mcast_mask[AxiAr]; + floo_axi_ar.hdr.src_id = id_i; + floo_axi_ar.hdr.last = 1'b1; + floo_axi_ar.hdr.axi_ch = AxiAr; + floo_axi_ar.payload = axi_ar_queue; + floo_axi_ar.hdr.collective_op = '0; end always_comb begin floo_axi_b = '0; - floo_axi_b.hdr.rob_req = aw_out_hdr_out.hdr.rob_req; - floo_axi_b.hdr.rob_idx = aw_out_hdr_out.hdr.rob_idx; - floo_axi_b.hdr.dst_id = dst_id[AxiB]; - floo_axi_b.hdr.mask = mcast_mask[AxiB]; - floo_axi_b.hdr.src_id = id_i; - floo_axi_b.hdr.last = 1'b1; - floo_axi_b.hdr.axi_ch = AxiB; - floo_axi_b.hdr.atop = aw_out_hdr_out.hdr.atop; - floo_axi_b.payload = meta_buf_rsp_out.b; - floo_axi_b.payload.id = aw_out_hdr_out.id; - floo_axi_b.hdr.commtype = (aw_out_hdr_out.hdr.commtype == Multicast)? 
- ParallelReduction : Unicast; + floo_axi_b.hdr.rob_req = aw_out_hdr_out.hdr.rob_req; + floo_axi_b.hdr.rob_idx = aw_out_hdr_out.hdr.rob_idx; + floo_axi_b.hdr.dst_id = dst_id[AxiB]; + floo_axi_b.hdr.collective_mask = mcast_mask[AxiB]; + floo_axi_b.hdr.src_id = id_i; + floo_axi_b.hdr.last = 1'b1; + floo_axi_b.hdr.axi_ch = AxiB; + floo_axi_b.hdr.atop = aw_out_hdr_out.hdr.atop; + floo_axi_b.payload = meta_buf_rsp_out.b; + floo_axi_b.payload.id = aw_out_hdr_out.id; + floo_axi_b.hdr.collective_op = (aw_out_hdr_out.hdr.collective_op == Multicast)? + CollectB : Unicast; end always_comb begin floo_axi_r = '0; - floo_axi_r.hdr.rob_req = ar_out_hdr_out.hdr.rob_req; - floo_axi_r.hdr.rob_idx = ar_out_hdr_out.hdr.rob_idx; - floo_axi_r.hdr.dst_id = dst_id[AxiR]; - floo_axi_r.hdr.mask = mcast_mask[AxiR]; - floo_axi_r.hdr.src_id = id_i; - floo_axi_r.hdr.last = 1'b1; // There is no reason to do wormhole routing for R bursts - floo_axi_r.hdr.axi_ch = AxiR; - floo_axi_r.hdr.atop = ar_out_hdr_out.hdr.atop; - floo_axi_r.payload = meta_buf_rsp_out.r; - floo_axi_r.payload.id = ar_out_hdr_out.id; - floo_axi_r.hdr.commtype = '0; + floo_axi_r.hdr.rob_req = ar_out_hdr_out.hdr.rob_req; + floo_axi_r.hdr.rob_idx = ar_out_hdr_out.hdr.rob_idx; + floo_axi_r.hdr.dst_id = dst_id[AxiR]; + floo_axi_r.hdr.collective_mask = mcast_mask[AxiR]; + floo_axi_r.hdr.src_id = id_i; + floo_axi_r.hdr.last = 1'b1; // No reason to do wormhole routing for R bursts + floo_axi_r.hdr.axi_ch = AxiR; + floo_axi_r.hdr.atop = ar_out_hdr_out.hdr.atop; + floo_axi_r.payload = meta_buf_rsp_out.r; + floo_axi_r.payload.id = ar_out_hdr_out.id; + floo_axi_r.hdr.collective_op = '0; end always_comb begin @@ -661,6 +665,9 @@ module floo_axi_chimney #( // FLIT ARBITRATION // /////////////////////// + floo_req_generic_flit_t floo_req_arb_data; + logic floo_req_arb_valid, floo_req_arb_ready; + floo_wormhole_arbiter #( .NumRoutes ( 2 ), .flit_t ( floo_req_generic_flit_t ) @@ -670,11 +677,28 @@ module floo_axi_chimney #( .valid_i ( 
floo_req_arb_req_in ), .data_i ( floo_req_arb_in ), .ready_o ( floo_req_arb_gnt_out ), - .data_o ( floo_req_o.req ), - .ready_i ( floo_req_i.ready ), - .valid_o ( floo_req_o.valid ) + .data_o ( floo_req_arb_data ), + .ready_i ( floo_req_arb_ready ), + .valid_o ( floo_req_arb_valid ) + ); + + spill_register #( + .T ( floo_req_generic_flit_t ), + .Bypass( 1'b0 ) + ) i_req_out_cut ( + .clk_i, + .rst_ni, + .valid_i ( floo_req_arb_valid ), + .ready_o ( floo_req_arb_ready ), + .data_i ( floo_req_arb_data ), + .valid_o ( floo_req_o.valid ), + .ready_i ( floo_req_i.ready ), + .data_o ( floo_req_o.req ) ); + floo_rsp_generic_flit_t floo_rsp_arb_data; + logic floo_rsp_arb_valid, floo_rsp_arb_ready; + floo_wormhole_arbiter #( .NumRoutes ( 2 ), .flit_t ( floo_rsp_generic_flit_t ) @@ -684,9 +708,23 @@ module floo_axi_chimney #( .valid_i ( floo_rsp_arb_req_in ), .data_i ( floo_rsp_arb_in ), .ready_o ( floo_rsp_arb_gnt_out ), - .data_o ( floo_rsp_o.rsp ), - .ready_i ( floo_rsp_i.ready ), - .valid_o ( floo_rsp_o.valid ) + .data_o ( floo_rsp_arb_data ), + .ready_i ( floo_rsp_arb_ready ), + .valid_o ( floo_rsp_arb_valid ) + ); + + spill_register #( + .T ( floo_rsp_generic_flit_t ), + .Bypass( 1'b0 ) + ) i_rsp_out_cut ( + .clk_i, + .rst_ni, + .valid_i ( floo_rsp_arb_valid ), + .ready_o ( floo_rsp_arb_ready ), + .data_i ( floo_rsp_arb_data ), + .valid_o ( floo_rsp_o.valid ), + .ready_i ( floo_rsp_i.ready ), + .data_o ( floo_rsp_o.rsp ) ); //////////////////// diff --git a/hw/floo_cut.sv b/hw/floo_cut.sv index 4b0f3022..c2a95921 100644 --- a/hw/floo_cut.sv +++ b/hw/floo_cut.sv @@ -29,6 +29,45 @@ module floo_cut #( assign valid_o = valid_i; assign ready_o = ready_i; assign data_o = data_i; + end else if (NumVirtChannels == NumPhysChannels) begin : gen_passthrough_cuts + // Standard forward pipeline for pass-through case (no vc arbitration needed). 
+ // The original gen_floo_cuts has inverted ready wiring when NumCuts > 1 and + // the vc_arbiter is a pass-through: stage c=0 gets downstream ready directly + // while stage c=1 gets an intermediate ready, causing data loss and valid + // instability on cardinal direction inputs. + + flit_t [NumChannels-1:0][NumCuts:0] data; + logic [NumChannels-1:0][NumCuts:0][NumVirtChannels-1:0] valid, ready; + + for (genvar n = 0; n < NumChannels; n++) begin : gen_channel + // Input at index 0 + assign data[n][0] = data_i[n]; + assign valid[n][0] = valid_i[n]; + // Output at index NumCuts + assign data_o[n] = data[n][NumCuts]; + assign valid_o[n] = valid[n][NumCuts]; + // Ready flows backward: downstream at NumCuts, upstream at 0 + assign ready[n][NumCuts] = ready_i[n]; + assign ready_o[n] = ready[n][0]; + + for (genvar c = 0; c < NumCuts; c++) begin : gen_cut + for (genvar v = 0; v < NumVirtChannels; v++) begin : gen_virt + spill_register #( + .T ( flit_t ), + .Bypass ( 1'b0 ) + ) i_floo_spill_reg ( + .clk_i, + .rst_ni, + .valid_i ( valid[n][c][v] ), + .ready_o ( ready[n][c][v] ), + .data_i ( data[n][c] ), + .valid_o ( valid[n][c+1][v] ), + .ready_i ( ready[n][c+1][v] ), + .data_o ( data[n][c+1] ) + ); + end + end + end end else begin : gen_floo_cuts flit_t [NumChannels-1:0][NumCuts:0] data; diff --git a/hw/floo_id_translation.sv b/hw/floo_id_translation.sv index 629144ef..0933a69d 100644 --- a/hw/floo_id_translation.sv +++ b/hw/floo_id_translation.sv @@ -44,6 +44,7 @@ module floo_id_translation #( addr_t x_addr_mask, y_addr_mask; addr_decode #( + .NoIndices ( 2**$bits(sam_idx_t) ), .NoRules ( RouteCfg.NumSamRules ), .addr_t ( addr_t ), .rule_t ( addr_rule_t ), diff --git a/hw/floo_nw_chimney.sv b/hw/floo_nw_chimney.sv index e6af59a6..c4c8890c 100644 --- a/hw/floo_nw_chimney.sv +++ b/hw/floo_nw_chimney.sv @@ -109,7 +109,7 @@ module floo_nw_chimney #( /// Coordinates/ID of the current tile input id_t id_i, /// Routing table for the current tile - input route_t 
[RouteCfg.NumRoutes-1:0] route_table_i, + input route_t [(RouteCfg.NumRoutes > 0 ? RouteCfg.NumRoutes-1 : 0):0] route_table_i, /// Output links to NoC output floo_req_t floo_req_o, output floo_rsp_t floo_rsp_o, @@ -155,6 +155,7 @@ module floo_nw_chimney #( localparam int unsigned NumWidePhysChannels = (WideRwDecouple == floo_pkg::Phys) ? 2 : 1; // Collective communication configuration localparam floo_pkg::collect_op_fe_cfg_t CollectOpCfg = RouteCfg.CollectiveCfg.OpCfg; + localparam bit EnMultiCast = CollectOpCfg.EnNarrowMulticast | CollectOpCfg.EnWideMulticast; // Duplicate AXI port signals to degenerate ports // in case they are not used @@ -319,7 +320,7 @@ module floo_nw_chimney #( `AXI_ASSIGN_RESP_STRUCT(axi_narrow_in_rsp_o, axi_narrow_rsp_out) // Extract the multicast mask bits from the AXI user bits - if (RouteCfg.EnMultiCast) begin : gen_mask + if (EnMultiCast) begin : gen_mask user_struct_t user; assign user = axi_narrow_in_req_i.aw.user; // TODO(lleone): Check subfield name is `mcast_mask` @@ -355,7 +356,7 @@ module floo_nw_chimney #( .ready_i ( axi_narrow_ar_queue_ready_in ) ); - if (RouteCfg.EnMultiCast) begin : gen_mask_cuts + if (EnMultiCast) begin : gen_mask_cuts spill_register #( .T (user_mask_t) ) i_narrow_usermask_queue ( @@ -410,7 +411,7 @@ module floo_nw_chimney #( `AXI_ASSIGN_REQ_STRUCT(axi_wide_req_in, axi_wide_in_req_i) `AXI_ASSIGN_RESP_STRUCT(axi_wide_in_rsp_o, axi_wide_rsp_out) - if (RouteCfg.EnMultiCast) begin : gen_mask + if (EnMultiCast) begin : gen_mask assign axi_wide_req_in_mask = axi_wide_in_req_i.aw.user; end else begin : gen_no_mask assign axi_wide_req_in_mask = '0; @@ -443,7 +444,7 @@ module floo_nw_chimney #( .ready_i ( axi_wide_ar_queue_ready_in ) ); - if (RouteCfg.EnMultiCast) begin : gen_mask_cuts + if (EnMultiCast) begin : gen_mask_cuts spill_register #( .T (user_mask_t) ) i_wide_usermask_queue ( @@ -926,7 +927,7 @@ module floo_nw_chimney #( `FFL(wide_aw_id_q, id_out[WideAw], axi_wide_aw_queue_valid_out && 
axi_wide_aw_queue_ready_in, '0) - if (RouteCfg.EnMultiCast) begin : gen_mcast + if (EnMultiCast) begin : gen_mcast localparam int unsigned AddrWidth = $bits(axi_addr_t); axi_addr_t [NumNWAxiChannels-1:0] x_addr_mask; axi_addr_t [NumNWAxiChannels-1:0] y_addr_mask; @@ -961,10 +962,10 @@ module floo_nw_chimney #( assign mcast_mask[NarrowW] = narrow_aw_mask_q; assign mcast_mask[WideW] = wide_aw_mask_q; - assign mcast_mask[NarrowR] = narrow_ar_buf_hdr_out.hdr.mask; - assign mcast_mask[NarrowB] = narrow_aw_buf_hdr_out.hdr.mask; - assign mcast_mask[WideR] = wide_ar_buf_hdr_out.hdr.mask; - assign mcast_mask[WideB] = wide_aw_buf_hdr_out.hdr.mask; + assign mcast_mask[NarrowR] = narrow_ar_buf_hdr_out.hdr.collective_mask; + assign mcast_mask[NarrowB] = narrow_aw_buf_hdr_out.hdr.collective_mask; + assign mcast_mask[WideR] = wide_ar_buf_hdr_out.hdr.collective_mask; + assign mcast_mask[WideB] = wide_aw_buf_hdr_out.hdr.collective_mask; `FFL(narrow_aw_mask_q, mcast_mask[NarrowAw], axi_narrow_aw_queue_valid_out && axi_narrow_aw_queue_ready_in, '0) @@ -981,141 +982,141 @@ module floo_nw_chimney #( always_comb begin floo_narrow_aw = '0; - floo_narrow_aw.hdr.rob_req = narrow_aw_rob_req_out; - floo_narrow_aw.hdr.rob_idx = rob_idx_t'(narrow_aw_rob_idx_out); - floo_narrow_aw.hdr.dst_id = dst_id[NarrowAw]; - floo_narrow_aw.hdr.mask = mcast_mask[NarrowAw]; - floo_narrow_aw.hdr.src_id = id_i; - floo_narrow_aw.hdr.last = 1'b0; // AW and W need to be sent together - floo_narrow_aw.hdr.axi_ch = NarrowAw; - floo_narrow_aw.hdr.atop = axi_narrow_aw_queue.atop != axi_pkg::ATOP_NONE; - floo_narrow_aw.payload = axi_narrow_aw_queue; - floo_narrow_aw.hdr.commtype = (mcast_mask[NarrowAw] != '0)? 
Multicast : Unicast; + floo_narrow_aw.hdr.rob_req = narrow_aw_rob_req_out; + floo_narrow_aw.hdr.rob_idx = rob_idx_t'(narrow_aw_rob_idx_out); + floo_narrow_aw.hdr.dst_id = dst_id[NarrowAw]; + floo_narrow_aw.hdr.collective_mask = mcast_mask[NarrowAw]; + floo_narrow_aw.hdr.src_id = id_i; + floo_narrow_aw.hdr.last = 1'b0; // AW and W need to be sent together + floo_narrow_aw.hdr.axi_ch = NarrowAw; + floo_narrow_aw.hdr.atop = axi_narrow_aw_queue.atop != axi_pkg::ATOP_NONE; + floo_narrow_aw.payload = axi_narrow_aw_queue; + floo_narrow_aw.hdr.collective_op = (mcast_mask[NarrowAw] != '0)? Multicast : Unicast; end always_comb begin floo_narrow_w = '0; - floo_narrow_w.hdr.rob_req = narrow_aw_rob_req_out; - floo_narrow_w.hdr.rob_idx = rob_idx_t'(narrow_aw_rob_idx_out); - floo_narrow_w.hdr.dst_id = dst_id[NarrowW]; - floo_narrow_w.hdr.mask = mcast_mask[NarrowW]; - floo_narrow_w.hdr.src_id = id_i; - floo_narrow_w.hdr.last = axi_narrow_req_in.w.last; - floo_narrow_w.hdr.axi_ch = NarrowW; - floo_narrow_w.payload = axi_narrow_req_in.w; - floo_narrow_w.hdr.commtype = (mcast_mask[NarrowW] != '0)? Multicast : Unicast; + floo_narrow_w.hdr.rob_req = narrow_aw_rob_req_out; + floo_narrow_w.hdr.rob_idx = rob_idx_t'(narrow_aw_rob_idx_out); + floo_narrow_w.hdr.dst_id = dst_id[NarrowW]; + floo_narrow_w.hdr.collective_mask = mcast_mask[NarrowW]; + floo_narrow_w.hdr.src_id = id_i; + floo_narrow_w.hdr.last = axi_narrow_req_in.w.last; + floo_narrow_w.hdr.axi_ch = NarrowW; + floo_narrow_w.payload = axi_narrow_req_in.w; + floo_narrow_w.hdr.collective_op = (mcast_mask[NarrowW] != '0)? 
Multicast : Unicast; end always_comb begin floo_narrow_ar = '0; - floo_narrow_ar.hdr.rob_req = narrow_ar_rob_req_out; - floo_narrow_ar.hdr.rob_idx = rob_idx_t'(narrow_ar_rob_idx_out); - floo_narrow_ar.hdr.dst_id = dst_id[NarrowAr]; - floo_narrow_ar.hdr.mask = mcast_mask[NarrowAr]; - floo_narrow_ar.hdr.src_id = id_i; - floo_narrow_ar.hdr.last = 1'b1; - floo_narrow_ar.hdr.axi_ch = NarrowAr; - floo_narrow_ar.payload = axi_narrow_ar_queue; - floo_narrow_ar.hdr.commtype = '0; + floo_narrow_ar.hdr.rob_req = narrow_ar_rob_req_out; + floo_narrow_ar.hdr.rob_idx = rob_idx_t'(narrow_ar_rob_idx_out); + floo_narrow_ar.hdr.dst_id = dst_id[NarrowAr]; + floo_narrow_ar.hdr.collective_mask = mcast_mask[NarrowAr]; + floo_narrow_ar.hdr.src_id = id_i; + floo_narrow_ar.hdr.last = 1'b1; + floo_narrow_ar.hdr.axi_ch = NarrowAr; + floo_narrow_ar.payload = axi_narrow_ar_queue; + floo_narrow_ar.hdr.collective_op = '0; end always_comb begin floo_narrow_b = '0; - floo_narrow_b.hdr.rob_req = narrow_aw_buf_hdr_out.hdr.rob_req; - floo_narrow_b.hdr.rob_idx = rob_idx_t'(narrow_aw_buf_hdr_out.hdr.rob_idx); - floo_narrow_b.hdr.dst_id = dst_id[NarrowB]; - floo_narrow_b.hdr.mask = mcast_mask[NarrowB]; - floo_narrow_b.hdr.src_id = id_i; - floo_narrow_b.hdr.last = 1'b1; - floo_narrow_b.hdr.axi_ch = NarrowB; - floo_narrow_b.hdr.atop = narrow_aw_buf_hdr_out.hdr.atop; - floo_narrow_b.payload = axi_narrow_meta_buf_rsp_out.b; - floo_narrow_b.payload.id = narrow_aw_buf_hdr_out.id; - floo_narrow_b.hdr.commtype = (narrow_aw_buf_hdr_out.hdr.commtype == Multicast)? 
- ParallelReduction : Unicast; + floo_narrow_b.hdr.rob_req = narrow_aw_buf_hdr_out.hdr.rob_req; + floo_narrow_b.hdr.rob_idx = rob_idx_t'(narrow_aw_buf_hdr_out.hdr.rob_idx); + floo_narrow_b.hdr.dst_id = dst_id[NarrowB]; + floo_narrow_b.hdr.collective_mask = mcast_mask[NarrowB]; + floo_narrow_b.hdr.src_id = id_i; + floo_narrow_b.hdr.last = 1'b1; + floo_narrow_b.hdr.axi_ch = NarrowB; + floo_narrow_b.hdr.atop = narrow_aw_buf_hdr_out.hdr.atop; + floo_narrow_b.payload = axi_narrow_meta_buf_rsp_out.b; + floo_narrow_b.payload.id = narrow_aw_buf_hdr_out.id; + floo_narrow_b.hdr.collective_op = (narrow_aw_buf_hdr_out.hdr.collective_op == Multicast)? + CollectB : Unicast; end always_comb begin floo_narrow_r = '0; - floo_narrow_r.hdr.rob_req = narrow_ar_buf_hdr_out.hdr.rob_req; - floo_narrow_r.hdr.rob_idx = rob_idx_t'(narrow_ar_buf_hdr_out.hdr.rob_idx); - floo_narrow_r.hdr.dst_id = dst_id[NarrowR]; - floo_narrow_r.hdr.mask = mcast_mask[NarrowR]; - floo_narrow_r.hdr.src_id = id_i; - floo_narrow_r.hdr.axi_ch = NarrowR; - floo_narrow_r.hdr.last = 1'b1; // There is no reason to do wormhole routing for R bursts - floo_narrow_r.hdr.atop = narrow_ar_buf_hdr_out.hdr.atop; - floo_narrow_r.payload = axi_narrow_meta_buf_rsp_out.r; - floo_narrow_r.payload.id = narrow_ar_buf_hdr_out.id; - floo_narrow_r.hdr.commtype = '0; + floo_narrow_r.hdr.rob_req = narrow_ar_buf_hdr_out.hdr.rob_req; + floo_narrow_r.hdr.rob_idx = rob_idx_t'(narrow_ar_buf_hdr_out.hdr.rob_idx); + floo_narrow_r.hdr.dst_id = dst_id[NarrowR]; + floo_narrow_r.hdr.collective_mask = mcast_mask[NarrowR]; + floo_narrow_r.hdr.src_id = id_i; + floo_narrow_r.hdr.axi_ch = NarrowR; + floo_narrow_r.hdr.last = 1'b1; // No reason to do wormhole routing for R bursts + floo_narrow_r.hdr.atop = narrow_ar_buf_hdr_out.hdr.atop; + floo_narrow_r.payload = axi_narrow_meta_buf_rsp_out.r; + floo_narrow_r.payload.id = narrow_ar_buf_hdr_out.id; + floo_narrow_r.hdr.collective_op = '0; end always_comb begin floo_wide_aw = '0; - floo_wide_aw.hdr.rob_req = 
wide_aw_rob_req_out; - floo_wide_aw.hdr.rob_idx = rob_idx_t'(wide_aw_rob_idx_out); - floo_wide_aw.hdr.dst_id = dst_id[WideAw]; - floo_wide_aw.hdr.mask = mcast_mask[WideAw]; - floo_wide_aw.hdr.src_id = id_i; - floo_wide_aw.hdr.last = 1'b0; // AW and W need to be sent together - floo_wide_aw.hdr.axi_ch = WideAw; - floo_wide_aw.payload = axi_wide_aw_queue; - floo_wide_aw.hdr.commtype = (mcast_mask[WideAw] != '0)? Multicast : Unicast; + floo_wide_aw.hdr.rob_req = wide_aw_rob_req_out; + floo_wide_aw.hdr.rob_idx = rob_idx_t'(wide_aw_rob_idx_out); + floo_wide_aw.hdr.dst_id = dst_id[WideAw]; + floo_wide_aw.hdr.collective_mask = mcast_mask[WideAw]; + floo_wide_aw.hdr.src_id = id_i; + floo_wide_aw.hdr.last = 1'b0; // AW and W need to be sent together + floo_wide_aw.hdr.axi_ch = WideAw; + floo_wide_aw.payload = axi_wide_aw_queue; + floo_wide_aw.hdr.collective_op = (mcast_mask[WideAw] != '0)? Multicast : Unicast; end always_comb begin floo_wide_w = '0; - floo_wide_w.hdr.rob_req = wide_aw_rob_req_out; - floo_wide_w.hdr.rob_idx = rob_idx_t'(wide_aw_rob_idx_out); - floo_wide_w.hdr.dst_id = dst_id[WideW]; - floo_wide_w.hdr.mask = mcast_mask[WideW]; - floo_wide_w.hdr.src_id = id_i; - floo_wide_w.hdr.last = axi_wide_req_in.w.last; - floo_wide_w.hdr.axi_ch = WideW; - floo_wide_w.payload = axi_wide_req_in.w; - floo_wide_w.hdr.commtype = (mcast_mask[WideW] != '0)? Multicast : Unicast; + floo_wide_w.hdr.rob_req = wide_aw_rob_req_out; + floo_wide_w.hdr.rob_idx = rob_idx_t'(wide_aw_rob_idx_out); + floo_wide_w.hdr.dst_id = dst_id[WideW]; + floo_wide_w.hdr.collective_mask = mcast_mask[WideW]; + floo_wide_w.hdr.src_id = id_i; + floo_wide_w.hdr.last = axi_wide_req_in.w.last; + floo_wide_w.hdr.axi_ch = WideW; + floo_wide_w.payload = axi_wide_req_in.w; + floo_wide_w.hdr.collective_op = (mcast_mask[WideW] != '0)? 
Multicast : Unicast; end always_comb begin floo_wide_ar = '0; - floo_wide_ar.hdr.rob_req = wide_ar_rob_req_out; - floo_wide_ar.hdr.rob_idx = rob_idx_t'(wide_ar_rob_idx_out); - floo_wide_ar.hdr.dst_id = dst_id[WideAr]; - floo_wide_ar.hdr.mask = mcast_mask[WideAr]; - floo_wide_ar.hdr.src_id = id_i; - floo_wide_ar.hdr.last = 1'b1; - floo_wide_ar.hdr.axi_ch = WideAr; - floo_wide_ar.payload = axi_wide_ar_queue; - floo_wide_ar.hdr.commtype = '0; + floo_wide_ar.hdr.rob_req = wide_ar_rob_req_out; + floo_wide_ar.hdr.rob_idx = rob_idx_t'(wide_ar_rob_idx_out); + floo_wide_ar.hdr.dst_id = dst_id[WideAr]; + floo_wide_ar.hdr.collective_mask = mcast_mask[WideAr]; + floo_wide_ar.hdr.src_id = id_i; + floo_wide_ar.hdr.last = 1'b1; + floo_wide_ar.hdr.axi_ch = WideAr; + floo_wide_ar.payload = axi_wide_ar_queue; + floo_wide_ar.hdr.collective_op = '0; end always_comb begin floo_wide_b = '0; - floo_wide_b.hdr.rob_req = wide_aw_buf_hdr_out.hdr.rob_req; - floo_wide_b.hdr.rob_idx = rob_idx_t'(wide_aw_buf_hdr_out.hdr.rob_idx); - floo_wide_b.hdr.dst_id = dst_id[WideB]; - floo_wide_b.hdr.mask = mcast_mask[WideB]; - floo_wide_b.hdr.src_id = id_i; - floo_wide_b.hdr.last = 1'b1; - floo_wide_b.hdr.axi_ch = WideB; - floo_wide_b.payload = axi_wide_meta_buf_rsp_out.b; - floo_wide_b.payload.id = wide_aw_buf_hdr_out.id; - floo_wide_b.hdr.commtype = (wide_aw_buf_hdr_out.hdr.commtype == Multicast)? - ParallelReduction : Unicast; + floo_wide_b.hdr.rob_req = wide_aw_buf_hdr_out.hdr.rob_req; + floo_wide_b.hdr.rob_idx = rob_idx_t'(wide_aw_buf_hdr_out.hdr.rob_idx); + floo_wide_b.hdr.dst_id = dst_id[WideB]; + floo_wide_b.hdr.collective_mask = mcast_mask[WideB]; + floo_wide_b.hdr.src_id = id_i; + floo_wide_b.hdr.last = 1'b1; + floo_wide_b.hdr.axi_ch = WideB; + floo_wide_b.payload = axi_wide_meta_buf_rsp_out.b; + floo_wide_b.payload.id = wide_aw_buf_hdr_out.id; + floo_wide_b.hdr.collective_op = (wide_aw_buf_hdr_out.hdr.collective_op == Multicast)? 
+ CollectB : Unicast; end always_comb begin floo_wide_r = '0; - floo_wide_r.hdr.rob_req = wide_ar_buf_hdr_out.hdr.rob_req; - floo_wide_r.hdr.rob_idx = rob_idx_t'(wide_ar_buf_hdr_out.hdr.rob_idx); - floo_wide_r.hdr.dst_id = dst_id[WideR]; - floo_wide_r.hdr.mask = mcast_mask[WideR]; - floo_wide_r.hdr.src_id = id_i; - floo_wide_r.hdr.axi_ch = WideR; - floo_wide_r.hdr.last = 1'b1; // There is no reason to do wormhole routing for R bursts - floo_wide_r.payload = axi_wide_meta_buf_rsp_out.r; - floo_wide_r.payload.id = wide_ar_buf_hdr_out.id; - floo_wide_r.hdr.commtype = '0; + floo_wide_r.hdr.rob_req = wide_ar_buf_hdr_out.hdr.rob_req; + floo_wide_r.hdr.rob_idx = rob_idx_t'(wide_ar_buf_hdr_out.hdr.rob_idx); + floo_wide_r.hdr.dst_id = dst_id[WideR]; + floo_wide_r.hdr.collective_mask = mcast_mask[WideR]; + floo_wide_r.hdr.src_id = id_i; + floo_wide_r.hdr.axi_ch = WideR; + floo_wide_r.hdr.last = 1'b1; // No reason to do wormhole routing for R bursts + floo_wide_r.payload = axi_wide_meta_buf_rsp_out.r; + floo_wide_r.payload.id = wide_ar_buf_hdr_out.id; + floo_wide_r.hdr.collective_op = '0; end always_comb begin @@ -1186,6 +1187,9 @@ module floo_nw_chimney #( // FLIT ARBITRATION // /////////////////////// + floo_req_generic_flit_t floo_req_arb_data; + logic floo_req_arb_valid, floo_req_arb_ready; + floo_wormhole_arbiter #( .NumRoutes ( 4 ), .flit_t ( floo_req_generic_flit_t ) @@ -1195,11 +1199,28 @@ module floo_nw_chimney #( .valid_i ( floo_req_arb_req_in ), .data_i ( floo_req_arb_in ), .ready_o ( floo_req_arb_gnt_out ), - .data_o ( floo_req_o.req ), - .ready_i ( floo_req_i.ready ), - .valid_o ( floo_req_o.valid ) + .data_o ( floo_req_arb_data ), + .ready_i ( floo_req_arb_ready ), + .valid_o ( floo_req_arb_valid ) + ); + + spill_register #( + .T ( floo_req_generic_flit_t ), + .Bypass( 1'b0 ) + ) i_req_out_cut ( + .clk_i, + .rst_ni, + .valid_i ( floo_req_arb_valid ), + .ready_o ( floo_req_arb_ready ), + .data_i ( floo_req_arb_data ), + .valid_o ( floo_req_o.valid ), + .ready_i ( 
floo_req_i.ready ), + .data_o ( floo_req_o.req ) ); + floo_rsp_generic_flit_t floo_rsp_arb_data; + logic floo_rsp_arb_valid, floo_rsp_arb_ready; + floo_wormhole_arbiter #( .NumRoutes ( 3 ), .flit_t ( floo_rsp_generic_flit_t ) @@ -1209,9 +1230,23 @@ module floo_nw_chimney #( .valid_i ( floo_rsp_arb_req_in ), .data_i ( floo_rsp_arb_in ), .ready_o ( floo_rsp_arb_gnt_out ), - .data_o ( floo_rsp_o.rsp ), - .ready_i ( floo_rsp_i.ready ), - .valid_o ( floo_rsp_o.valid ) + .data_o ( floo_rsp_arb_data ), + .ready_i ( floo_rsp_arb_ready ), + .valid_o ( floo_rsp_arb_valid ) + ); + + spill_register #( + .T ( floo_rsp_generic_flit_t ), + .Bypass( 1'b0 ) + ) i_rsp_out_cut ( + .clk_i, + .rst_ni, + .valid_i ( floo_rsp_arb_valid ), + .ready_o ( floo_rsp_arb_ready ), + .data_i ( floo_rsp_arb_data ), + .valid_o ( floo_rsp_o.valid ), + .ready_i ( floo_rsp_i.ready ), + .data_o ( floo_rsp_o.rsp ) ); // Credit is never used for narrow req/rsp if (VcImpl == floo_pkg::VcCredit) begin : gen_credit_tie