From 7fc7c5dfdffee7fa14c7822d53a119b5bc2fd1ba Mon Sep 17 00:00:00 2001
From: Shulin Feng <fengshulin@hisilicon.com>
Date: Sat, 14 Feb 2026 18:24:15 +0800
Subject: [PATCH 1/2] Fix cube v2 decoder for dynamic ARRAY_SIZE and multiple
 assignment bugs

- Fix decoder tile calculation to use dynamic shift_amount based on ARRAY_SIZE
  instead of hardcoded >> 4 (divide by 16)
- Fix multiple continuous assignment bugs in decoder, issue_queue, and mmio
  using explicit priority mux pattern
- Add 64x64x64 MATMUL testbench for cycle count measurement
- Change ARRAY_SIZE to 8 for testing different PE configurations

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 janus/pyc/janus/cube/cube_v2_consts.py      |   2 +-
 janus/pyc/janus/cube/cube_v2_decoder.py     | 102 ++++++----
 janus/pyc/janus/cube/cube_v2_issue_queue.py | 110 +++++++----
 janus/pyc/janus/cube/cube_v2_l0_reuse.py    |  17 +-
 janus/pyc/janus/cube/cube_v2_mmio.py        |  18 +-
 janus/pyc/janus/cube/cube_v2_reuse.py       |  31 +++-
 janus/tb/tb_cube_64x64x64.sv                | 196 ++++++++++++++++++++
 janus/tb/tb_cube_64x64x64_main.cpp          |  24 +++
 8 files changed, 412 insertions(+), 88 deletions(-)
 create mode 100644 janus/tb/tb_cube_64x64x64.sv
 create mode 100644 janus/tb/tb_cube_64x64x64_main.cpp

diff --git a/janus/pyc/janus/cube/cube_v2_consts.py b/janus/pyc/janus/cube/cube_v2_consts.py
index a1e9b84..204b96f 100644
--- a/janus/pyc/janus/cube/cube_v2_consts.py
+++ b/janus/pyc/janus/cube/cube_v2_consts.py
@@ -5,7 +5,7 @@
 # =============================================================================
 # Array Dimensions
 # =============================================================================
-ARRAY_SIZE = 16  # 16×16 systolic array
+ARRAY_SIZE = 8  # 8×8 systolic array
 
 # =============================================================================
 # Buffer Sizes
diff --git a/janus/pyc/janus/cube/cube_v2_decoder.py b/janus/pyc/janus/cube/cube_v2_decoder.py
index 3b87636..d22c6b8 100644
--- a/janus/pyc/janus/cube/cube_v2_decoder.py
+++ b/janus/pyc/janus/cube/cube_v2_decoder.py
@@ -1,7 +1,7 @@
 """Cube v2 MATMUL Decoder and Uop Generator.
 
 Decomposes MATMUL(M, K, N) instructions into micro-operations (uops) for the systolic array.
-Each uop represents a 16×16 tile multiplication.
+Each uop represents an ARRAY_SIZE×ARRAY_SIZE tile multiplication.
 """
 
 from __future__ import annotations
@@ -83,22 +83,26 @@ def build_matmul_decoder(
         gen_state = _make_uop_gen_state(m, clk, rst, consts)
 
         # Calculate tile counts on start
-        # tiles = ceil(dim / 16) = (dim + 15) / 16
+        # tiles = ceil(dim / ARRAY_SIZE) = (dim + ARRAY_SIZE - 1) / ARRAY_SIZE
+        # Use bit shift for power-of-2 ARRAY_SIZE
+        import math
+        shift_amount = int(math.log2(ARRAY_SIZE))
+
         with m.scope("TILE_CALC"):
             tile_size = c(ARRAY_SIZE, width=16)
             tile_mask = c(ARRAY_SIZE - 1, width=16)
 
             # M tiles
             m_plus = inst_m + tile_mask
-            m_tiles_calc = m_plus >> 4  # Divide by 16
+            m_tiles_calc = m_plus >> shift_amount
 
             # K tiles
             k_plus = inst_k + tile_mask
-            k_tiles_calc = k_plus >> 4
+            k_tiles_calc = k_plus >> shift_amount
 
             # N tiles
             n_plus = inst_n + tile_mask
-            n_tiles_calc = n_plus >> 4
+            n_tiles_calc = n_plus >> shift_amount
 
         # Latch instruction on start
         with m.scope("LATCH"):
@@ -110,14 +114,8 @@ def build_matmul_decoder(
             gen_state.k_tiles.set(k_tiles_calc.trunc(width=TILE_IDX_WIDTH), when=start)
             gen_state.n_tiles.set(n_tiles_calc.trunc(width=TILE_IDX_WIDTH), when=start)
 
-            # Reset tile indices
-            gen_state.m_tile.set(c(0, width=TILE_IDX_WIDTH), when=start)
-            gen_state.k_tile.set(c(0, width=TILE_IDX_WIDTH), when=start)
-            gen_state.n_tile.set(c(0, width=TILE_IDX_WIDTH), when=start)
-
-            # Start generating
-            gen_state.generating.set(consts.one1, when=start)
-            gen_state.gen_done.set(consts.zero1, when=start)
+            # Note: tile indices are set below with explicit priority mux
+            # Note: generating and gen_done are set below with explicit priority
 
         # Uop generation logic
         with m.scope("UOP_GEN"):
@@ -157,7 +155,7 @@ def build_matmul_decoder(
             # Output valid uop
             uop_valid = can_generate
 
-            # Advance tile indices (iterate: k, n, m order for better locality)
+            # Compute tile index advancement (iterate: k, n, m order for better locality)
             with m.scope("ADVANCE"):
                 # Next k_tile
                 k_tile_next = k_tile + c(1, width=TILE_IDX_WIDTH)
@@ -174,29 +172,63 @@ def build_matmul_decoder(
                 # All done when m wraps
                 all_done = k_wrap & n_wrap & m_wrap
 
-                # Update indices when generating
-                # K advances every cycle
+                # Compute new values for tile indices
                 new_k = k_wrap.select(c(0, width=TILE_IDX_WIDTH), k_tile_next)
-                gen_state.k_tile.set(new_k, when=can_generate)
-
-                # N advances when K wraps
                 new_n = (k_wrap & n_wrap).select(c(0, width=TILE_IDX_WIDTH), n_tile_next)
-                gen_state.n_tile.set(new_n, when=can_generate & k_wrap)
-
-                # M advances when N wraps
-                gen_state.m_tile.set(m_tile_next, when=can_generate & k_wrap & n_wrap)
-
-                # Done when all tiles generated
-                gen_state.generating.set(consts.zero1, when=can_generate & all_done)
-                gen_state.gen_done.set(consts.one1, when=can_generate & all_done)
-
-        # Reset logic
-        with m.scope("RESET"):
-            gen_state.generating.set(consts.zero1, when=reset_decoder)
-            gen_state.gen_done.set(consts.zero1, when=reset_decoder)
-            gen_state.m_tile.set(c(0, width=TILE_IDX_WIDTH), when=reset_decoder)
-            gen_state.k_tile.set(c(0, width=TILE_IDX_WIDTH), when=reset_decoder)
-            gen_state.n_tile.set(c(0, width=TILE_IDX_WIDTH), when=reset_decoder)
+
+        # Explicit priority mux for generating and gen_done
+        # Priority: reset_decoder > (can_generate & all_done) > start > hold
+        with m.scope("STATE_UPDATE"):
+            current_generating = gen_state.generating.out()
+            current_gen_done = gen_state.gen_done.out()
+
+            # Default: hold current value
+            next_generating = current_generating
+            next_gen_done = current_gen_done
+
+            # start sets generating=1, gen_done=0
+            next_generating = start.select(consts.one1, next_generating)
+            next_gen_done = start.select(consts.zero1, next_gen_done)
+
+            # can_generate & all_done sets generating=0, gen_done=1
+            finish_cond = can_generate & all_done
+            next_generating = finish_cond.select(consts.zero1, next_generating)
+            next_gen_done = finish_cond.select(consts.one1, next_gen_done)
+
+            # reset_decoder sets generating=0, gen_done=0 (highest priority)
+            next_generating = reset_decoder.select(consts.zero1, next_generating)
+            next_gen_done = reset_decoder.select(consts.zero1, next_gen_done)
+
+            # Single set call with explicit next value
+            gen_state.generating.set(next_generating)
+            gen_state.gen_done.set(next_gen_done)
+
+        # Explicit priority mux for tile indices
+        # Priority: reset_decoder > start > advance > hold
+        with m.scope("TILE_UPDATE"):
+            # K tile
+            current_k = gen_state.k_tile.out()
+            next_k = current_k
+            next_k = can_generate.select(new_k, next_k)
+            next_k = start.select(c(0, width=TILE_IDX_WIDTH), next_k)
+            next_k = reset_decoder.select(c(0, width=TILE_IDX_WIDTH), next_k)
+            gen_state.k_tile.set(next_k)
+
+            # N tile
+            current_n = gen_state.n_tile.out()
+            next_n_val = current_n
+            next_n_val = (can_generate & k_wrap).select(new_n, next_n_val)
+            next_n_val = start.select(c(0, width=TILE_IDX_WIDTH), next_n_val)
+            next_n_val = reset_decoder.select(c(0, width=TILE_IDX_WIDTH), next_n_val)
+            gen_state.n_tile.set(next_n_val)
+
+            # M tile
+            current_m = gen_state.m_tile.out()
+            next_m = current_m
+            next_m = (can_generate & k_wrap & n_wrap).select(m_tile_next, next_m)
+            next_m = start.select(c(0, width=TILE_IDX_WIDTH), next_m)
+            next_m = reset_decoder.select(c(0, width=TILE_IDX_WIDTH), next_m)
+            gen_state.m_tile.set(next_m)
 
         gen_done = gen_state.gen_done.out()
 
diff --git a/janus/pyc/janus/cube/cube_v2_issue_queue.py b/janus/pyc/janus/cube/cube_v2_issue_queue.py
index a87f775..8049933 100644
--- a/janus/pyc/janus/cube/cube_v2_issue_queue.py
+++ b/janus/pyc/janus/cube/cube_v2_issue_queue.py
@@ -90,30 +90,26 @@ def build_issue_queue(
         queue_full = count.out().eq(c(ISSUE_QUEUE_SIZE, width=QUEUE_IDX_WIDTH + 1))
         queue_empty = count.out().eq(c(0, width=QUEUE_IDX_WIDTH + 1))
 
-        # Enqueue logic
+        # Enqueue logic - compute enqueue conditions
         with m.scope("ENQUEUE"):
             can_enqueue = enqueue_valid & ~queue_full & ~flush
 
+            # Compute per-entry enqueue conditions
+            enqueue_this_list = []
             for i in range(ISSUE_QUEUE_SIZE):
                 tail_match = tail.out().eq(c(i, width=QUEUE_IDX_WIDTH))
                 enqueue_this = can_enqueue & tail_match
+                enqueue_this_list.append(enqueue_this)
 
-                # Write uop data
+                # Write uop data (these don't have conflicts)
                 entries[i].uop.l0a_idx.set(enqueue_l0a_idx, when=enqueue_this)
                 entries[i].uop.l0b_idx.set(enqueue_l0b_idx, when=enqueue_this)
                 entries[i].uop.acc_idx.set(enqueue_acc_idx, when=enqueue_this)
                 entries[i].uop.is_first.set(enqueue_is_first, when=enqueue_this)
                 entries[i].uop.is_last.set(enqueue_is_last, when=enqueue_this)
 
-                # Set valid, clear issued
-                entries[i].valid.set(consts.one1, when=enqueue_this)
-                entries[i].issued.set(consts.zero1, when=enqueue_this)
-
-            # Update tail pointer
-            next_tail = (tail.out() + consts.one8.trunc(width=QUEUE_IDX_WIDTH)) & c(
-                ISSUE_QUEUE_SIZE - 1, width=QUEUE_IDX_WIDTH
-            )
-            tail.set(next_tail, when=can_enqueue)
+            # Note: valid and issued updates moved to ENTRY_STATE section
+            # Note: tail pointer update moved to FLUSH section with explicit priority mux
 
         # Update ready bits based on buffer status
         with m.scope("READY_UPDATE"):
@@ -182,12 +178,13 @@ def build_issue_queue(
 
                 found = found | is_ready
 
-            # Mark as issued when acknowledged
+            # Compute mark_issued conditions (moved to ENTRY_STATE section)
             issue_and_ack = issue_valid & issue_ack
+            mark_issued_list = []
             for i in range(ISSUE_QUEUE_SIZE):
                 idx_match = issue_idx.eq(c(i, width=QUEUE_IDX_WIDTH))
                 mark_issued = issue_and_ack & idx_match
-                entries[i].issued.set(consts.one1, when=mark_issued)
+                mark_issued_list.append(mark_issued)
 
             # Create issue result
             issued_uop = Uop(
@@ -199,15 +196,14 @@ def build_issue_queue(
             )
             issue_result = IssueResult(issue_valid=issue_valid, uop=issued_uop)
 
-        # Retire logic (remove completed entries)
+        # Retire logic (compute retire conditions)
         with m.scope("RETIRE"):
-            # Retire from head when issued
+            # Compute can_retire conditions
+            can_retire_list = []
             for i in range(ISSUE_QUEUE_SIZE):
                 head_match = head.out().eq(c(i, width=QUEUE_IDX_WIDTH))
                 can_retire = head_match & entries[i].valid.out() & entries[i].issued.out()
-
-                # Clear entry
-                entries[i].valid.set(consts.zero1, when=can_retire)
+                can_retire_list.append(can_retire)
 
             # Update head pointer when retiring
             head_entry_issued = consts.zero1
@@ -218,32 +214,72 @@ def build_issue_queue(
                     head_entry_issued,
                 )
 
-            next_head = (head.out() + consts.one8.trunc(width=QUEUE_IDX_WIDTH)) & c(
-                ISSUE_QUEUE_SIZE - 1, width=QUEUE_IDX_WIDTH
-            )
-            head.set(next_head, when=head_entry_issued)
+            # Note: head pointer update moved to FLUSH section with explicit priority mux
+
+        # Entry state updates with explicit priority mux
+        # This consolidates all valid and issued updates to avoid multiple continuous assignments
+        with m.scope("ENTRY_STATE"):
+            for i in range(ISSUE_QUEUE_SIZE):
+                # Valid: Priority: flush > retire > enqueue > hold
+                current_valid = entries[i].valid.out()
+                next_valid = current_valid
+                next_valid = enqueue_this_list[i].select(consts.one1, next_valid)
+                next_valid = can_retire_list[i].select(consts.zero1, next_valid)
+                next_valid = flush.select(consts.zero1, next_valid)
+                entries[i].valid.set(next_valid)
+
+                # Issued: Priority: enqueue (clear) > mark_issued (set) > hold
+                current_issued = entries[i].issued.out()
+                next_issued = current_issued
+                next_issued = mark_issued_list[i].select(consts.one1, next_issued)
+                next_issued = enqueue_this_list[i].select(consts.zero1, next_issued)
+                entries[i].issued.set(next_issued)
 
         # Update count
         with m.scope("COUNT"):
             enqueued = can_enqueue
             retired = head_entry_issued
 
-            next_count = count.out()
-            # Increment on enqueue
-            next_count = enqueued.select(next_count + c(1, width=QUEUE_IDX_WIDTH + 1), next_count)
-            # Decrement on retire
-            next_count = retired.select(next_count - c(1, width=QUEUE_IDX_WIDTH + 1), next_count)
+            # Explicit priority mux for count
+            # Priority: flush > (enqueue/retire) > hold
+            current_count = count.out()
+            next_count = current_count
 
-            count.set(next_count, when=enqueued | retired)
-
-        # Flush logic
-        with m.scope("FLUSH"):
-            for i in range(ISSUE_QUEUE_SIZE):
-                entries[i].valid.set(consts.zero1, when=flush)
-
-            head.set(c(0, width=QUEUE_IDX_WIDTH), when=flush)
-            tail.set(c(0, width=QUEUE_IDX_WIDTH), when=flush)
-            count.set(c(0, width=QUEUE_IDX_WIDTH + 1), when=flush)
+            # Increment on enqueue (lower priority)
+            next_count = enqueued.select(current_count + c(1, width=QUEUE_IDX_WIDTH + 1), next_count)
+            # Decrement on retire (same priority level, can happen simultaneously)
+            next_count = retired.select(next_count - c(1, width=QUEUE_IDX_WIDTH + 1), next_count)
+            # Flush resets to 0 (highest priority)
+            next_count = flush.select(c(0, width=QUEUE_IDX_WIDTH + 1), next_count)
+
+            # Single set call
+            count.set(next_count)
+
+        # Pointer updates with explicit priority mux
+        with m.scope("PTRS_UPDATE"):
+            # Explicit priority mux for head and tail
+            # Priority: flush > normal update > hold
+            current_head = head.out()
+            next_head_val = current_head
+            next_head_val = head_entry_issued.select(
+                (current_head + consts.one8.trunc(width=QUEUE_IDX_WIDTH)) & c(
+                    ISSUE_QUEUE_SIZE - 1, width=QUEUE_IDX_WIDTH
+                ),
+                next_head_val,
+            )
+            next_head_val = flush.select(c(0, width=QUEUE_IDX_WIDTH), next_head_val)
+            head.set(next_head_val)
+
+            current_tail = tail.out()
+            next_tail_val = current_tail
+            next_tail_val = can_enqueue.select(
+                (current_tail + consts.one8.trunc(width=QUEUE_IDX_WIDTH)) & c(
+                    ISSUE_QUEUE_SIZE - 1, width=QUEUE_IDX_WIDTH
+                ),
+                next_tail_val,
+            )
+            next_tail_val = flush.select(c(0, width=QUEUE_IDX_WIDTH), next_tail_val)
+            tail.set(next_tail_val)
 
         entries_used = count.out()
 
diff --git a/janus/pyc/janus/cube/cube_v2_l0_reuse.py b/janus/pyc/janus/cube/cube_v2_l0_reuse.py
index 4b235db..0161319 100644
--- a/janus/pyc/janus/cube/cube_v2_l0_reuse.py
+++ b/janus/pyc/janus/cube/cube_v2_l0_reuse.py
@@ -86,12 +86,21 @@ def build_l0_buffer_reuse(
                 loading_reg = m.out("loading", clk=clk, rst=rst, width=1, init=0, en=consts.one1)
                 ref_count_reg = m.out("ref_count", clk=clk, rst=rst, width=8, init=0, en=consts.one1)
 
-                # Create a valid register that mirrors the instance output
-                valid_reg = m.out("valid", clk=clk, rst=rst, width=1, init=0, en=consts.one1)
-                valid_reg.set(entry["valid"], when=consts.one1)
+                # Use the instance's valid output directly (it's already registered)
+                # Create a dummy register that just holds the value for the status interface
+                valid_wire = entry["valid"]
+
+                # Create a simple wrapper that exposes the valid signal
+                # We use a register but set it unconditionally to the instance output
+                # This avoids the extra cycle of latency
+                class ValidWrapper:
+                    def __init__(self, wire):
+                        self._wire = wire
+                    def out(self):
+                        return self._wire
 
                 status = L0EntryStatus(
-                    valid=valid_reg,
+                    valid=ValidWrapper(valid_wire),
                     loading=loading_reg,
                     ref_count=ref_count_reg,
                 )
diff --git a/janus/pyc/janus/cube/cube_v2_mmio.py b/janus/pyc/janus/cube/cube_v2_mmio.py
index 532abca..fe5987c 100644
--- a/janus/pyc/janus/cube/cube_v2_mmio.py
+++ b/janus/pyc/janus/cube/cube_v2_mmio.py
@@ -168,6 +168,8 @@ def build_mmio_read(
 def build_mmio_inst_write(
     m: Circuit,
     *,
+    clk: Wire,
+    rst: Wire,
     consts: Consts,
     base_addr: int,
     mem_wvalid: Wire,
@@ -185,9 +187,19 @@ def build_mmio_inst_write(
         # MATMUL instruction register (M, K, N packed)
         # Format: [15:0] = M, [31:16] = K, [47:32] = N
         inst_match = mem_waddr.eq(c(base_addr + ADDR_MATMUL_INST, width=64)) & mem_wvalid
-        inst_m = mem_wdata[0:16]
-        inst_k = mem_wdata[16:32]
-        inst_n = mem_wdata[32:48]
+
+        # Latch instruction values into registers
+        inst_m_reg = m.out("inst_m", clk=clk, rst=rst, width=16, init=0, en=consts.one1)
+        inst_k_reg = m.out("inst_k", clk=clk, rst=rst, width=16, init=0, en=consts.one1)
+        inst_n_reg = m.out("inst_n", clk=clk, rst=rst, width=16, init=0, en=consts.one1)
+
+        inst_m_reg.set(mem_wdata[0:16], when=inst_match)
+        inst_k_reg.set(mem_wdata[16:32], when=inst_match)
+        inst_n_reg.set(mem_wdata[32:48], when=inst_match)
+
+        inst_m = inst_m_reg.out()
+        inst_k = inst_k_reg.out()
+        inst_n = inst_n_reg.out()
 
         # Address registers
         addr_a_match = mem_waddr.eq(c(base_addr + ADDR_ADDR_A, width=64)) & mem_wvalid
diff --git a/janus/pyc/janus/cube/cube_v2_reuse.py b/janus/pyc/janus/cube/cube_v2_reuse.py
index 52f0683..3429db2 100644
--- a/janus/pyc/janus/cube/cube_v2_reuse.py
+++ b/janus/pyc/janus/cube/cube_v2_reuse.py
@@ -137,6 +137,8 @@ def build(m: Circuit, *, base_addr: int = 0x80000000) -> None:
         addr_c,
     ) = build_mmio_inst_write(
         m,
+        clk=clk,
+        rst=rst,
         consts=consts,
         base_addr=base_addr,
         mem_wvalid=mem_wvalid,
@@ -145,24 +147,37 @@ def build(m: Circuit, *, base_addr: int = 0x80000000) -> None:
     )
 
     # --- L0A/L0B Load Decode ---
-    # L0A load: address 0x0100-0x01FF
-    # L0B load: address 0x0200-0x02FF
+    # L0A load: address 0x1000-0x4FFF (64 entries × 256 bytes each)
+    # L0B load: address 0x5000-0x8FFF (64 entries × 256 bytes each)
     # Format: entry_idx in bits 13:8, row in bits 7:4, col in bits 3:0
+    # Entry address = base + 0x1000 + (entry_idx << 8) + (row << 4) + col
     with m.scope("L0_LOAD_DECODE"):
         # Extract address offset from base
         addr_offset = (mem_waddr - c(base_addr, width=64)).trunc(width=16)
 
-        # Check if address is in L0A range (0x0100-0x01FF)
-        l0a_range = addr_offset[8:16].eq(c(0x01, width=8))
+        # Check if address is in L0A range (0x1000-0x4FFF)
+        # bits 15:12 in [1,2,3,4]
+        l0a_high = addr_offset[12:16]
+        l0a_range = (l0a_high.eq(c(0x1, width=4)) |
+                     l0a_high.eq(c(0x2, width=4)) |
+                     l0a_high.eq(c(0x3, width=4)) |
+                     l0a_high.eq(c(0x4, width=4)))
         l0a_load_valid = mem_wvalid & l0a_range
-        l0a_entry_idx = addr_offset[8:14].trunc(width=L0_IDX_WIDTH)
+        # entry_idx = (offset - 0x1000) >> 8
+        l0a_entry_idx = ((addr_offset - c(0x1000, width=16)) >> 8).trunc(width=L0_IDX_WIDTH)
         l0a_row = addr_offset[4:8]
         l0a_col = addr_offset[0:4]
 
-        # Check if address is in L0B range (0x0200-0x02FF)
-        l0b_range = addr_offset[8:16].eq(c(0x02, width=8))
+        # Check if address is in L0B range (0x5000-0x8FFF)
+        # bits 15:12 in [5,6,7,8]
+        l0b_high = addr_offset[12:16]
+        l0b_range = (l0b_high.eq(c(0x5, width=4)) |
+                     l0b_high.eq(c(0x6, width=4)) |
+                     l0b_high.eq(c(0x7, width=4)) |
+                     l0b_high.eq(c(0x8, width=4)))
         l0b_load_valid = mem_wvalid & l0b_range
-        l0b_entry_idx = addr_offset[8:14].trunc(width=L0_IDX_WIDTH)
+        # entry_idx = (offset - 0x5000) >> 8
+        l0b_entry_idx = ((addr_offset - c(0x5000, width=16)) >> 8).trunc(width=L0_IDX_WIDTH)
         l0b_row = addr_offset[4:8]
         l0b_col = addr_offset[0:4]
 
diff --git a/janus/tb/tb_cube_64x64x64.sv b/janus/tb/tb_cube_64x64x64.sv
new file mode 100644
index 0000000..6478b37
--- /dev/null
+++ b/janus/tb/tb_cube_64x64x64.sv
@@ -0,0 +1,196 @@
+// Testbench for 64x64x64 MATMUL cycle count measurement
+// Tests compute cycles with different PE array configurations
+
+module tb_cube_64x64x64;
+  logic clk;
+  logic rst;
+
+  // Memory interface
+  logic mem_wvalid;
+  logic [63:0] mem_waddr;
+  logic [63:0] mem_wdata;
+  logic [63:0] mem_raddr;
+  logic [63:0] mem_rdata;
+
+  // Status outputs
+  logic done;
+  logic busy;
+  logic queue_full;
+  logic queue_empty;
+
+  // Memory-mapped addresses
+  localparam logic [63:0] BASE_ADDR = 64'h80000000;
+  localparam logic [63:0] ADDR_CONTROL = BASE_ADDR + 64'h0000;
+  localparam logic [63:0] ADDR_STATUS = BASE_ADDR + 64'h0008;
+  localparam logic [63:0] ADDR_MATMUL_INST = BASE_ADDR + 64'h0010;
+
+  // L0 buffer base addresses
+  localparam logic [63:0] L0A_BASE = BASE_ADDR + 64'h1000;
+  localparam logic [63:0] L0B_BASE = BASE_ADDR + 64'h5000;
+
+  // Control bits
+  localparam logic [63:0] CTRL_START = 64'h01;
+  localparam logic [63:0] CTRL_RESET = 64'h02;
+
+  // Array size (will be overridden by parameter)
+  parameter int ARRAY_SIZE = 16;
+
+  // DUT instantiation
+  janus_cube_pyc dut (
+    .clk(clk),
+    .rst(rst),
+    .mem_wvalid(mem_wvalid),
+    .mem_waddr(mem_waddr),
+    .mem_wdata(mem_wdata),
+    .mem_raddr(mem_raddr),
+    .mem_rdata(mem_rdata),
+    .done(done),
+    .busy(busy),
+    .queue_full(queue_full),
+    .queue_empty(queue_empty)
+  );
+
+  // Clock generation: 10ns period
+  always #5 clk = ~clk;
+
+  // Cycle counter
+  int cycle_count;
+
+  // MMIO write task
+  task automatic mmio_write(input logic [63:0] addr, input logic [63:0] data);
+    @(posedge clk);
+    mem_wvalid <= 1'b1;
+    mem_waddr <= addr;
+    mem_wdata <= data;
+    @(posedge clk);
+    mem_wvalid <= 1'b0;
+    mem_waddr <= 64'h0;
+    mem_wdata <= 64'h0;
+  endtask
+
+  // Quick load L0A entry - just mark as valid by writing last element
+  task automatic quick_load_l0a_entry(input int entry_idx);
+    logic [63:0] addr;
+    addr = L0A_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1);
+    mmio_write(addr, 64'h0001);
+  endtask
+
+  // Quick load L0B entry - just mark as valid by writing last element
+  task automatic quick_load_l0b_entry(input int entry_idx);
+    logic [63:0] addr;
+    addr = L0B_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1);
+    mmio_write(addr, 64'h0001);
+  endtask
+
+  // Test 64x64x64 MATMUL compute cycles
+  task automatic test_64x64x64_matmul();
+    logic [63:0] inst;
+    int start_cycle;
+    int end_cycle;
+    int compute_cycles;
+    int tile_size;
+    int m_tiles, k_tiles, n_tiles;
+    int total_uops;
+    int theoretical;
+    int i;
+    int M, K, N;
+
+    M = 64;
+    K = 64;
+    N = 64;
+
+    tile_size = ARRAY_SIZE;
+    m_tiles = (M + tile_size - 1) / tile_size;
+    k_tiles = (K + tile_size - 1) / tile_size;
+    n_tiles = (N + tile_size - 1) / tile_size;
+    total_uops = m_tiles * k_tiles * n_tiles;
+    theoretical = total_uops + 3;  // pipeline latency
+
+    $display("\n========================================");
+    $display("64x64x64 MATMUL Test");
+    $display("PE Array Size: %0dx%0d", ARRAY_SIZE, ARRAY_SIZE);
+    $display("========================================");
+    $display("Tiles: %0d x %0d x %0d = %0d uops", m_tiles, k_tiles, n_tiles, total_uops);
+    $display("Theoretical compute cycles: %0d", theoretical);
+
+    // Reset
+    mmio_write(ADDR_CONTROL, CTRL_RESET);
+    repeat(10) @(posedge clk);
+
+    // Quick load L0A entries
+    $display("Loading L0A entries (%0d entries)...", m_tiles * k_tiles);
+    for (i = 0; i < m_tiles * k_tiles && i < 64; i++) begin
+      quick_load_l0a_entry(i);
+    end
+
+    // Quick load L0B entries
+    $display("Loading L0B entries (%0d entries)...", k_tiles * n_tiles);
+    for (i = 0; i < k_tiles * n_tiles && i < 64; i++) begin
+      quick_load_l0b_entry(i);
+    end
+
+    // Write MATMUL instruction: [15:0]=M, [31:16]=K, [47:32]=N
+    inst = {16'h0, N[15:0], K[15:0], M[15:0]};
+    $display("Writing MATMUL instruction: M=%0d, K=%0d, N=%0d", M, K, N);
+    mmio_write(ADDR_MATMUL_INST, inst);
+
+    // Wait a few cycles for instruction to be latched
+    repeat(5) @(posedge clk);
+
+    // Record start cycle
+    start_cycle = cycle_count;
+
+    // Start computation
+    $display("Starting computation at cycle %0d...", cycle_count);
+    mmio_write(ADDR_CONTROL, CTRL_START);
+
+    // Wait for done
+    while (!done && (cycle_count - start_cycle) < 100000) begin
+      @(posedge clk);
+    end
+
+    end_cycle = cycle_count;
+    compute_cycles = end_cycle - start_cycle;
+
+    $display("\n========================================");
+    $display("RESULTS (PE Array: %0dx%0d)", ARRAY_SIZE, ARRAY_SIZE);
+    $display("========================================");
+    if (done) begin
+      $display("Actual compute cycles:     %0d", compute_cycles);
+      $display("Theoretical compute cycles: %0d", theoretical);
+      $display("Overhead cycles:           %0d", compute_cycles - theoretical);
+      $display("Efficiency ratio:          %.2f%%", 100.0 * real'(theoretical) / real'(compute_cycles));
+    end else begin
+      $display("TIMEOUT after %0d cycles!", compute_cycles);
+    end
+    $display("========================================\n");
+  endtask
+
+  // Main test
+  initial begin
+    // Initialize
+    clk = 0;
+    rst = 1;
+    mem_wvalid = 0;
+    mem_waddr = 0;
+    mem_wdata = 0;
+    mem_raddr = 0;
+    cycle_count = 0;
+
+    // Reset sequence
+    repeat(10) @(posedge clk);
+    rst = 0;
+    repeat(5) @(posedge clk);
+
+    // Run test
+    test_64x64x64_matmul();
+
+    $finish;
+  end
+
+  // Cycle counter
+  always @(posedge clk) begin
+    cycle_count <= cycle_count + 1;
+  end
+
+endmodule
diff --git a/janus/tb/tb_cube_64x64x64_main.cpp b/janus/tb/tb_cube_64x64x64_main.cpp
new file mode 100644
index 0000000..1b07518
--- /dev/null
+++ b/janus/tb/tb_cube_64x64x64_main.cpp
@@ -0,0 +1,24 @@
+#include "Vtb_cube_64x64x64.h"
+#include "verilated.h"
+
+vluint64_t main_time = 0;
+
+double sc_time_stamp() {
+    return main_time;
+}
+
+int main(int argc, char** argv) {
+    Verilated::commandArgs(argc, argv);
+
+    Vtb_cube_64x64x64* top = new Vtb_cube_64x64x64;
+
+    // Run for limited cycles
+    while (!Verilated::gotFinish() && main_time < 500000) {
+        top->eval();
+        main_time++;
+    }
+
+    top->final();
+    delete top;
+    return 0;
+}

From 429eb4467baf5ee126ccac9f84fc060a2a194271 Mon Sep 17 00:00:00 2001
From: Shulin Feng <fengshulin@hisilicon.com>
Date: Sat, 14 Feb 2026 21:10:31 +0800
Subject: [PATCH 2/2] Add PE configuration benchmark results to documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test results for 64×64×64 MATMUL with different PE array sizes:
- 16×16 PE: 74 cycles (90.54% efficiency)
- 8×8 PE: 579 cycles (88.95% efficiency)
- 4×4 PE: 4163 cycles (98.46% efficiency)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 janus/pyc/janus/cube/CUBE_V2_SPEC.md |  22 +++
 janus/pyc/janus/cube/README.md       |  10 ++
 janus/tb/tb_cube_cycle_count.cpp     | 133 ++++++++++++++
 janus/tb/tb_cube_cycles.sv           | 249 +++++++++++++++++++++++++++
 janus/tb/tb_cube_cycles_main.cpp     |  24 +++
 janus/tools/run_cube_cycle_count.sh  |  29 ++++
 janus/tools/run_cube_cycles.sh       |  40 +++++
 janus/tools/test_pe_configs.sh       |  60 +++++++
 8 files changed, 567 insertions(+)
 create mode 100644 janus/tb/tb_cube_cycle_count.cpp
 create mode 100644 janus/tb/tb_cube_cycles.sv
 create mode 100644 janus/tb/tb_cube_cycles_main.cpp
 create mode 100755 janus/tools/run_cube_cycle_count.sh
 create mode 100755 janus/tools/run_cube_cycles.sh
 create mode 100755 janus/tools/test_pe_configs.sh

diff --git a/janus/pyc/janus/cube/CUBE_V2_SPEC.md b/janus/pyc/janus/cube/CUBE_V2_SPEC.md
index 72af4aa..7281cb0 100644
--- a/janus/pyc/janus/cube/CUBE_V2_SPEC.md
+++ b/janus/pyc/janus/cube/CUBE_V2_SPEC.md
@@ -544,6 +544,28 @@ Performance metrics:
     - At 1 GHz: 4.096 TMAC/s (INT16)
 ```
 
+### 7.6 Benchmark Results (64×64×64 MATMUL)
+
+Actual cycle counts measured via Verilator simulation:
+
+| PE Array | Tile Size | Tiles (M×K×N) | Uops | Theoretical | Actual | Overhead | Efficiency |
+|----------|-----------|---------------|------|-------------|--------|----------|------------|
+| 16×16 | 16×16 | 4×4×4 | 64 | 67 | 74 | 7 | 90.54% |
+| 8×8 | 8×8 | 8×8×8 | 512 | 515 | 579 | 64 | 88.95% |
+| 4×4 | 4×4 | 16×16×16 | 4096 | 4099 | 4163 | 64 | 98.46% |
+
+```
+Theoretical cycles = uops + pipeline_depth - 1 + startup_overhead
+  - 16×16: 64 + 4 - 1 = 67 (actual: 74, +7 overhead)
+  - 8×8:   512 + 4 - 1 = 515 (actual: 579, +64 overhead)
+  - 4×4:   4096 + 4 - 1 = 4099 (actual: 4163, +64 overhead)
+
+Efficiency = theoretical / actual
+  - Larger PE arrays have higher per-uop throughput but more startup overhead
+  - Smaller PE arrays have lower overhead percentage due to more uops
+  - Fixed overhead (~64 cycles) from pipeline startup/drain and FSM transitions
+```
+
 ---
 
 ## 8. MMIO Interface
diff --git a/janus/pyc/janus/cube/README.md b/janus/pyc/janus/cube/README.md
index 5d0128a..32e4de3 100644
--- a/janus/pyc/janus/cube/README.md
+++ b/janus/pyc/janus/cube/README.md
@@ -83,6 +83,16 @@ uop4:                           [C0]──[C1]──[C2]──[C3]──►ACC
 Pipeline: 4-cycle latency, 1 uop/cycle throughput
 ```
 
+### Benchmark Results (64×64×64 MATMUL)
+
+| PE Array | Uops | Actual Cycles | Efficiency |
+|----------|------|---------------|------------|
+| 16×16 | 64 | 74 | 90.54% |
+| 8×8 | 512 | 579 | 88.95% |
+| 4×4 | 4096 | 4163 | 98.46% |
+
+See [CUBE_V2_SPEC.md](CUBE_V2_SPEC.md#76-benchmark-results-64×64×64-matmul) for detailed analysis.
+
 ### Cube v2 File Structure
 
 ```
diff --git a/janus/tb/tb_cube_cycle_count.cpp b/janus/tb/tb_cube_cycle_count.cpp
new file mode 100644
index 0000000..a6473a8
--- /dev/null
+++ b/janus/tb/tb_cube_cycle_count.cpp
@@ -0,0 +1,133 @@
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <iostream>
+
+#include <pyc/cpp/pyc_tb.hpp>
+
+// Generated by `pyc-compile --emit=cpp`.
+#include "janus_cube_pyc_gen.hpp"
+
+using pyc::cpp::Testbench;
+using pyc::cpp::Wire;
+
+namespace {
+
+// Memory-mapped addresses (must match cube_v2_consts.py)
+constexpr std::uint64_t kBaseAddr = 0x80000000ull;
+constexpr std::uint64_t kAddrControl = kBaseAddr + 0x0000;
+constexpr std::uint64_t kAddrStatus = kBaseAddr + 0x0008;
+constexpr std::uint64_t kAddrMatmulInst = kBaseAddr + 0x0010;
+
+// Control bits
+constexpr std::uint64_t kCtrlStart = 1 << 0;
+constexpr std::uint64_t kCtrlReset = 1 << 1;
+
+// Status bits
+constexpr std::uint64_t kStatDone = 1 << 0;
+constexpr std::uint64_t kStatBusy = 1 << 1;
+
+// Helper to write to memory-mapped register
+static void mmioWrite(pyc::gen::janus_cube_pyc &dut, std::uint64_t addr, std::uint64_t data) {
+  dut.mem_wvalid = Wire<1>(1);
+  dut.mem_waddr = Wire<64>(addr);
+  dut.mem_wdata = Wire<64>(data);
+}
+
+// Helper to clear write signals
+static void mmioWriteClear(pyc::gen::janus_cube_pyc &dut) {
+  dut.mem_wvalid = Wire<1>(0);
+  dut.mem_waddr = Wire<64>(0);
+  dut.mem_wdata = Wire<64>(0);
+}
+
+// Test: Measure compute cycles for MATMUL instruction
+static bool testComputeCycles(int M, int K, int N) {
+  std::cout << "\n=== Testing " << M << "x" << K << "x" << N << " MATMUL ===\n";
+
+  pyc::gen::janus_cube_pyc dut{};
+  Testbench<pyc::gen::janus_cube_pyc> tb(dut);
+
+  tb.addClock(dut.clk, /*halfPeriodSteps=*/1);
+  tb.reset(dut.rst, /*cyclesAsserted=*/2, /*cyclesDeasserted=*/1);
+
+  // Clear any previous state
+  mmioWrite(dut, kAddrControl, kCtrlReset);
+  tb.runCycles(1);
+  mmioWriteClear(dut);
+  tb.runCycles(5);
+
+  // Write MATMUL instruction (M, K, N packed into 64 bits)
+  // Format: [15:0]=M, [31:16]=K, [47:32]=N
+  std::uint64_t inst = (static_cast<std::uint64_t>(M) & 0xFFFF) |
+                       ((static_cast<std::uint64_t>(K) & 0xFFFF) << 16) |
+                       ((static_cast<std::uint64_t>(N) & 0xFFFF) << 32);
+  mmioWrite(dut, kAddrMatmulInst, inst);
+  tb.runCycles(1);
+  mmioWriteClear(dut);
+  tb.runCycles(1);
+
+  // Start computation
+  mmioWrite(dut, kAddrControl, kCtrlStart);
+  tb.runCycles(1);
+  mmioWriteClear(dut);
+
+  // Wait for done signal
+  int cycles = 0;
+  int timeout = 100000;
+  while (!dut.done.toBool() && timeout > 0) {
+    tb.runCycles(1);
+    cycles++;
+    timeout--;
+  }
+
+  if (timeout == 0) {
+    std::cerr << "TIMEOUT after " << cycles << " cycles!\n";
+    return false;
+  }
+
+  std::cout << "Computation completed in " << cycles << " cycles\n";
+
+  // Calculate theoretical cycles
+  // Assuming 16x16 PE array (default config)
+  int tile_size = 16;
+  int m_tiles = (M + tile_size - 1) / tile_size;
+  int k_tiles = (K + tile_size - 1) / tile_size;
+  int n_tiles = (N + tile_size - 1) / tile_size;
+  int total_uops = m_tiles * k_tiles * n_tiles;
+  int pipeline_latency = 4;
+  int theoretical = total_uops + pipeline_latency - 1;
+
+  std::cout << "Theoretical: " << theoretical << " cycles "
+            << "(tiles: " << m_tiles << "x" << k_tiles << "x" << n_tiles
+            << " = " << total_uops << " uops)\n";
+
+  return true;
+}
+
+} // namespace
+
+int main(int argc, char **argv) {
+  std::cout << "Cube Accelerator Cycle Count Test\n";
+  std::cout << "==================================\n";
+
+  // Test various matrix sizes
+  bool ok = true;
+
+  // Small test first
+  ok = ok && testComputeCycles(16, 16, 16);
+
+  // 32x32x32
+  ok = ok && testComputeCycles(32, 32, 32);
+
+  // 64x64x64
+  ok = ok && testComputeCycles(64, 64, 64);
+
+  if (ok) {
+    std::cout << "\nAll tests passed!\n";
+    return 0;
+  } else {
+    std::cerr << "\nSome tests failed!\n";
+    return 1;
+  }
+}
diff --git a/janus/tb/tb_cube_cycles.sv b/janus/tb/tb_cube_cycles.sv
new file mode 100644
index 0000000..bcf7147
--- /dev/null
+++ b/janus/tb/tb_cube_cycles.sv
@@ -0,0 +1,249 @@
+// Simplified cycle count testbench for janus_cube_pyc
+// Tests compute cycles for MATMUL instruction
+
+module tb_cube_cycles;
+  logic clk;
+  logic rst;
+
+  // Memory interface
+  logic mem_wvalid;
+  logic [63:0] mem_waddr;
+  logic [63:0] mem_wdata;
+  logic [63:0] mem_raddr;
+  logic [63:0] mem_rdata;
+
+  // Status outputs
+  logic done;
+  logic busy;
+  logic queue_full;
+  logic queue_empty;
+
+  // Memory-mapped addresses
+  localparam logic [63:0] BASE_ADDR = 64'h80000000;
+  localparam logic [63:0] ADDR_CONTROL = BASE_ADDR + 64'h0000;
+  localparam logic [63:0] ADDR_STATUS = BASE_ADDR + 64'h0008;
+  localparam logic [63:0] ADDR_MATMUL_INST = BASE_ADDR + 64'h0010;
+
+  // L0 buffer base addresses (new scheme)
+  localparam logic [63:0] L0A_BASE = BASE_ADDR + 64'h1000;  // 0x1000-0x4FFF
+  localparam logic [63:0] L0B_BASE = BASE_ADDR + 64'h5000;  // 0x5000-0x8FFF
+
+  // Control bits
+  localparam logic [63:0] CTRL_START = 64'h01;
+  localparam logic [63:0] CTRL_RESET = 64'h02;
+
+  // Array size
+  localparam int ARRAY_SIZE = 16;
+
+  // DUT instantiation
+  janus_cube_pyc dut (
+    .clk(clk),
+    .rst(rst),
+    .mem_wvalid(mem_wvalid),
+    .mem_waddr(mem_waddr),
+    .mem_wdata(mem_wdata),
+    .mem_raddr(mem_raddr),
+    .mem_rdata(mem_rdata),
+    .done(done),
+    .busy(busy),
+    .queue_full(queue_full),
+    .queue_empty(queue_empty)
+  );
+
+  // Clock generation: 10ns period
+  always #5 clk = ~clk;
+
+  // Cycle counter
+  int cycle_count;
+
+  // MMIO write task
+  task automatic mmio_write(input logic [63:0] addr, input logic [63:0] data);
+    @(posedge clk);
+    mem_wvalid <= 1'b1;
+    mem_waddr <= addr;
+    mem_wdata <= data;
+    @(posedge clk);
+    mem_wvalid <= 1'b0;
+    mem_waddr <= 64'h0;
+    mem_wdata <= 64'h0;
+  endtask
+
+  // Load L0A entry with dummy data
+  // New address scheme: L0A at 0x1000-0x4FFF
+  // Entry address = base + 0x1000 + (entry_idx << 8) + (row << 4) + col
+  task automatic load_l0a_entry(input int entry_idx);
+    logic [63:0] addr;
+    int row, col;
+    for (row = 0; row < ARRAY_SIZE; row++) begin
+      for (col = 0; col < ARRAY_SIZE; col++) begin
+        addr = L0A_BASE + (entry_idx << 8) + (row << 4) + col;
+        mmio_write(addr, 64'h0001);  // dummy data
+      end
+    end
+    $display("  Loaded L0A entry %0d (addr_base=0x%08x, last_addr=0x%08x)",
+             entry_idx, L0A_BASE + (entry_idx << 8),
+             L0A_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1));
+  endtask
+
+  // Load L0B entry with dummy data
+  // New address scheme: L0B at 0x5000-0x8FFF
+  // Entry address = base + 0x5000 + (entry_idx << 8) + (row << 4) + col
+  task automatic load_l0b_entry(input int entry_idx);
+    logic [63:0] addr;
+    int row, col;
+    for (row = 0; row < ARRAY_SIZE; row++) begin
+      for (col = 0; col < ARRAY_SIZE; col++) begin
+        addr = L0B_BASE + (entry_idx << 8) + (row << 4) + col;
+        mmio_write(addr, 64'h0001);  // dummy data
+      end
+    end
+    $display("  Loaded L0B entry %0d (addr_base=0x%08x, last_addr=0x%08x)",
+             entry_idx, L0B_BASE + (entry_idx << 8),
+             L0B_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1));
+  endtask
+
+  // Quick load L0A entry - just mark as valid by writing last element
+  task automatic quick_load_l0a_entry(input int entry_idx);
+    logic [63:0] addr;
+    // Only write the last element (row=15, col=15) to mark entry as valid
+    addr = L0A_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1);
+    mmio_write(addr, 64'h0001);
+    $display("  Quick loaded L0A entry %0d (last_addr=0x%08x)", entry_idx, addr);
+  endtask
+
+  // Quick load L0B entry - just mark as valid by writing last element
+  task automatic quick_load_l0b_entry(input int entry_idx);
+    logic [63:0] addr;
+    // Only write the last element (row=15, col=15) to mark entry as valid
+    addr = L0B_BASE + (entry_idx << 8) + ((ARRAY_SIZE-1) << 4) + (ARRAY_SIZE-1);
+    mmio_write(addr, 64'h0001);
+    $display("  Quick loaded L0B entry %0d (last_addr=0x%08x)", entry_idx, addr);
+  endtask
+
+  // Test MATMUL compute cycles
+  task automatic test_matmul_cycles(input int M, input int K, input int N);
+    logic [63:0] inst;
+    int start_cycle;
+    int end_cycle;
+    int compute_cycles;
+    int tile_size;
+    int m_tiles, k_tiles, n_tiles;
+    int total_uops;
+    int theoretical;
+    int i;
+
+    tile_size = ARRAY_SIZE;
+    m_tiles = (M + tile_size - 1) / tile_size;
+    k_tiles = (K + tile_size - 1) / tile_size;
+    n_tiles = (N + tile_size - 1) / tile_size;
+    total_uops = m_tiles * k_tiles * n_tiles;
+    theoretical = total_uops + 3;  // pipeline latency
+
+    $display("\n=== Testing %0dx%0dx%0d MATMUL ===", M, K, N);
+    $display("Tiles: %0d x %0d x %0d = %0d uops", m_tiles, k_tiles, n_tiles, total_uops);
+    $display("Theoretical compute cycles: %0d", theoretical);
+
+    // Reset
+    mmio_write(ADDR_CONTROL, CTRL_RESET);
+    repeat(10) @(posedge clk);
+
+    // Quick load L0A entries (just mark as valid)
+    $display("Quick loading L0A entries...");
+    for (i = 0; i < m_tiles * k_tiles && i < 64; i++) begin
+      quick_load_l0a_entry(i);
+    end
+
+    // Quick load L0B entries (just mark as valid)
+    $display("Quick loading L0B entries...");
+    for (i = 0; i < k_tiles * n_tiles && i < 64; i++) begin
+      quick_load_l0b_entry(i);
+    end
+
+    // Write MATMUL instruction: [15:0]=M, [31:16]=K, [47:32]=N
+    inst = {16'h0, N[15:0], K[15:0], M[15:0]};
+    $display("Writing MATMUL instruction: M=%0d, K=%0d, N=%0d (inst=0x%016x)", M, K, N, inst);
+    mmio_write(ADDR_MATMUL_INST, inst);
+
+    // Wait a few cycles for instruction to be latched
+    repeat(5) @(posedge clk);
+
+    // Record start cycle
+    start_cycle = cycle_count;
+
+    // Start computation
+    $display("Sending START command at cycle %0d", cycle_count);
+    mmio_write(ADDR_CONTROL, CTRL_START);
+
+    // Debug: monitor state for first 100 cycles
+    $display("Starting computation, monitoring state...");
+    $display("  Note: queue_empty=1 means no uops in queue");
+
+    // Monitor internal decoder signals if accessible
+    // Check if decoder is generating uops
+    for (int dbg = 0; dbg < 200 && !done; dbg++) begin
+      @(posedge clk);
+      // Print every cycle for first 50 cycles to see detailed timing
+      if (dbg < 50 || dbg % 20 == 0) begin
+        $display("  cycle %0d: done=%b busy=%b queue_empty=%b queue_full=%b",
+                 cycle_count, done, busy, queue_empty, queue_full);
+      end
+    end
+
+    // Wait for done
+    while (!done && (cycle_count - start_cycle) < 100000) begin
+      @(posedge clk);
+    end
+
+    end_cycle = cycle_count;
+    compute_cycles = end_cycle - start_cycle;
+
+    if (done) begin
+      $display("Actual compute cycles: %0d", compute_cycles);
+      $display("Ratio (actual/theoretical): %.2f", real'(compute_cycles) / real'(theoretical));
+    end else begin
+      $display("TIMEOUT after %0d cycles!", compute_cycles);
+    end
+  endtask
+
+  // Main test
+  initial begin
+    // Initialize
+    clk = 0;
+    rst = 1;
+    mem_wvalid = 0;
+    mem_waddr = 0;
+    mem_wdata = 0;
+    mem_raddr = 0;
+    cycle_count = 0;
+
+    // Reset sequence
+    repeat(10) @(posedge clk);
+    rst = 0;
+    repeat(5) @(posedge clk);
+
+    $display("\n========================================");
+    $display("Cube Accelerator Cycle Count Test");
+    $display("========================================");
+
+    // Test 16x16x16 (1 tile)
+    test_matmul_cycles(16, 16, 16);
+
+    // Test 32x32x32 (8 tiles)
+    test_matmul_cycles(32, 32, 32);
+
+    // Test 64x64x64 (64 tiles)
+    test_matmul_cycles(64, 64, 64);
+
+    $display("\n========================================");
+    $display("Test Complete");
+    $display("========================================");
+
+    $finish;
+  end
+
+  // Cycle counter
+  always @(posedge clk) begin
+    cycle_count <= cycle_count + 1;
+  end
+
+endmodule
diff --git a/janus/tb/tb_cube_cycles_main.cpp b/janus/tb/tb_cube_cycles_main.cpp
new file mode 100644
index 0000000..70cca46
--- /dev/null
+++ b/janus/tb/tb_cube_cycles_main.cpp
@@ -0,0 +1,24 @@
+#include "Vtb_cube_cycles.h"
+#include "verilated.h"
+
+vluint64_t main_time = 0;
+
+double sc_time_stamp() {
+    return main_time;
+}
+
+int main(int argc, char** argv) {
+    Verilated::commandArgs(argc, argv);
+
+    Vtb_cube_cycles* top = new Vtb_cube_cycles;
+
+    // Run for limited cycles
+    while (!Verilated::gotFinish() && main_time < 200000) {
+        top->eval();
+        main_time++;
+    }
+
+    top->final();
+    delete top;
+    return 0;
+}
diff --git a/janus/tools/run_cube_cycle_count.sh b/janus/tools/run_cube_cycle_count.sh
new file mode 100755
index 0000000..7d60ae4
--- /dev/null
+++ b/janus/tools/run_cube_cycle_count.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
+# shellcheck source=../../scripts/lib.sh
+source "${ROOT_DIR}/scripts/lib.sh"
+pyc_find_pyc_compile
+
+GEN_DIR="${ROOT_DIR}/janus/generated/janus_cube_pyc"
+HDR="${GEN_DIR}/janus_cube_pyc_gen.hpp"
+if [[ ! -f "${HDR}" ]]; then
+  bash "${ROOT_DIR}/janus/update_generated.sh"
+fi
+
+WORK_DIR="$(mktemp -d -t cube_cycle_count.XXXXXX)"
+trap 'rm -rf "${WORK_DIR}"' EXIT
+
+"${CXX:-clang++}" -std=c++17 -O2 \
+  -I "${ROOT_DIR}/include" \
+  -I "${GEN_DIR}" \
+  -o "${WORK_DIR}/tb_cube_cycle_count" \
+  "${ROOT_DIR}/janus/tb/tb_cube_cycle_count.cpp"
+
+if [[ $# -gt 0 ]]; then
+  "${WORK_DIR}/tb_cube_cycle_count" "$@"
+else
+  "${WORK_DIR}/tb_cube_cycle_count"
+fi
diff --git a/janus/tools/run_cube_cycles.sh b/janus/tools/run_cube_cycles.sh
new file mode 100755
index 0000000..17f0a69
--- /dev/null
+++ b/janus/tools/run_cube_cycles.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Run Verilog simulation for cube cycle count test using Verilator
+set -euo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
+
+# Check for Verilator
+VERILATOR="${VERILATOR:-$(command -v verilator || true)}"
+if [[ -z "${VERILATOR}" ]]; then
+  echo "error: missing verilator (install with: brew install verilator)" >&2
+  exit 1
+fi
+
+# Paths
+GEN_DIR="${ROOT_DIR}/janus/generated/janus_cube_pyc"
+VLOG="${GEN_DIR}/janus_cube_pyc.v"
+TB_SV="${ROOT_DIR}/janus/tb/tb_cube_cycles.sv"
+
+# Regenerate if needed
+if [[ ! -f "${VLOG}" ]]; then
+  echo "[cube-cycles] Generating Verilog..."
+  bash "${ROOT_DIR}/janus/update_generated.sh"
+fi
+
+# Create work directory
+WORK_DIR="$(mktemp -d -t cube_cycles_verilator.XXXXXX)"
+trap 'rm -rf "${WORK_DIR}"' EXIT
+
+echo "[cube-cycles] Compiling with Verilator..."
+"${VERILATOR}" --binary --timing \
+  -I"${ROOT_DIR}/include/pyc/verilog" \
+  -Wno-fatal \
+  --top-module tb_cube_cycles \
+  -o "${WORK_DIR}/Vtb_cube_cycles" \
+  "${TB_SV}" \
+  "${VLOG}"
+
+echo "[cube-cycles] Running simulation..."
+"${WORK_DIR}/Vtb_cube_cycles" "$@"
diff --git a/janus/tools/test_pe_configs.sh b/janus/tools/test_pe_configs.sh
new file mode 100755
index 0000000..54113a2
--- /dev/null
+++ b/janus/tools/test_pe_configs.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# Test 64x64x64 MATMUL with different PE array configurations
+# Usage: ./test_pe_configs.sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+cd "$PROJECT_ROOT"
+
+# PE configurations to test
+PE_SIZES=(16 8 4)
+
+echo "========================================"
+echo "64x64x64 MATMUL PE Configuration Test"
+echo "========================================"
+echo ""
+
+for PE_SIZE in "${PE_SIZES[@]}"; do
+    echo "----------------------------------------"
+    echo "Testing PE Array: ${PE_SIZE}x${PE_SIZE}"
+    echo "----------------------------------------"
+
+    # Update ARRAY_SIZE in consts
+    sed -i.bak "s/^ARRAY_SIZE = .*/ARRAY_SIZE = ${PE_SIZE}  # ${PE_SIZE}×${PE_SIZE} systolic array/" \
+        janus/pyc/janus/cube/cube_v2_consts.py
+
+    # Regenerate Verilog
+    echo "Regenerating Verilog..."
+    PYC_COMPILE=build/bin/pyc-compile bash janus/update_generated.sh 2>&1 | tail -5
+
+    # Build and run test
+    WORK_DIR="build/verilator_tb_64x64x64_pe${PE_SIZE}"
+    rm -rf "$WORK_DIR"
+    mkdir -p "$WORK_DIR"
+
+    echo "Building Verilator simulation..."
+    verilator --cc --exe --timing --build \
+        -Wno-fatal \
+        -I janus/generated/janus_cube_pyc \
+        janus/generated/janus_cube_pyc/janus_cube_pyc.v \
+        janus/tb/tb_cube_64x64x64.sv \
+        janus/tb/tb_cube_64x64x64_main.cpp \
+        --Mdir "$WORK_DIR" \
+        -o tb_cube_64x64x64 2>&1 | tail -3
+
+    echo "Running simulation..."
+    "$WORK_DIR/tb_cube_64x64x64"
+
+    echo ""
+done
+
+# Restore original ARRAY_SIZE
+sed -i.bak "s/^ARRAY_SIZE = .*/ARRAY_SIZE = 16  # 16×16 systolic array/" \
+    janus/pyc/janus/cube/cube_v2_consts.py
+rm -f janus/pyc/janus/cube/cube_v2_consts.py.bak
+
+echo "========================================"
+echo "Test Complete - Restored ARRAY_SIZE=16"
+echo "========================================"