From a1b2367df2e648302bd5aa5e134d17f46f1b527b Mon Sep 17 00:00:00 2001 From: Mikemy666 <2535634982@qq.com> Date: Mon, 16 Mar 2026 17:53:20 +0800 Subject: [PATCH 1/3] add dynamic in scalesim --- .../memory/double_buffered_scratchpad_mem.py | 179 +++++++++++++++++- scalesim/memory/read_buffer.py | 36 +++- scalesim/scale_config.py | 16 ++ scalesim/simulator.py | 2 +- scalesim/single_layer_sim.py | 7 +- 5 files changed, 232 insertions(+), 8 deletions(-) diff --git a/scalesim/memory/double_buffered_scratchpad_mem.py b/scalesim/memory/double_buffered_scratchpad_mem.py index aad8f2915..79d19b69d 100644 --- a/scalesim/memory/double_buffered_scratchpad_mem.py +++ b/scalesim/memory/double_buffered_scratchpad_mem.py @@ -5,6 +5,7 @@ import time import os +import math import numpy as np from tqdm import tqdm @@ -78,6 +79,16 @@ def __init__(self): self.using_ifmap_custom_layout = False self.using_filter_custom_layout = False + # Dynamic IFMAP/FILTER bank allocation state + self.enable_dynamic_bank_allocation = False + self.static_ifmap_sram_bank_num = 1 + self.static_filter_sram_bank_num = 1 + self.ifmap_sram_bank_port = 2 + self.filter_sram_bank_port = 2 + self.dynamic_ifmap_banks = set() + self.dynamic_filter_banks = set() + self.dynamic_unassigned_banks = [] + # def set_params(self, layer_id=0, @@ -89,6 +100,7 @@ def set_params(self, ifmap_backing_buf_bw=1, filter_backing_buf_bw=1, ofmap_backing_buf_bw=1, ifmap_sram_bank_num=1, ifmap_sram_bank_port=2, filter_sram_bank_num=1, filter_sram_bank_port=2, using_ifmap_custom_layout=False, using_filter_custom_layout=False, + enable_dynamic_bank_allocation=False, config=cfg(), topo=topo() ): @@ -99,6 +111,10 @@ def set_params(self, self.topo = topo self.config = config self.use_ramulator_trace = config.get_ramulator_trace() + self.static_ifmap_sram_bank_num = max(1, int(ifmap_sram_bank_num)) + self.static_filter_sram_bank_num = max(1, int(filter_sram_bank_num)) + self.ifmap_sram_bank_port = max(1, int(ifmap_sram_bank_port)) + self.filter_sram_bank_port = max(1, int(filter_sram_bank_port)) self.estimate_bandwidth_mode = estimate_bandwidth_mode @@ -167,7 +183,17 @@ def set_params(self, self.verbose = verbose self.using_ifmap_custom_layout = using_ifmap_custom_layout - self.using_filter_custom_layout = using_filter_custom_layout + self.using_filter_custom_layout = using_filter_custom_layout + self.enable_dynamic_bank_allocation = bool(enable_dynamic_bank_allocation) + if self.estimate_bandwidth_mode: + self.enable_dynamic_bank_allocation = False + if not (self.using_ifmap_custom_layout and self.using_filter_custom_layout): + self.enable_dynamic_bank_allocation = False + + self.dynamic_ifmap_banks = set() + self.dynamic_filter_banks = set() + self.dynamic_unassigned_banks = [] + self.params_valid_flag = True @@ -230,6 +256,132 @@ def service_ofmap_writes(self, return out_cycles_arr_np + def _apply_dynamic_bank_topology(self): + """ + Apply current dynamic bank assignment to IFMAP/FILTER read buffers. + """ + ifmap_banks = max(1, len(self.dynamic_ifmap_banks)) + filter_banks = max(1, len(self.dynamic_filter_banks)) + self.ifmap_buf.update_bank_topology(num_bank=ifmap_banks, + num_port=self.ifmap_sram_bank_port) + self.filter_buf.update_bank_topology(num_bank=filter_banks, + num_port=self.filter_sram_bank_port) + + def _assign_one_dynamic_bank(self, assign_to_ifmap): + """ + Permanently assign one unassigned bank to IFMAP or FILTER. + """ + if len(self.dynamic_unassigned_banks) == 0: + return False + + bank_id = self.dynamic_unassigned_banks.pop(0) + if assign_to_ifmap: + self.dynamic_ifmap_banks.add(bank_id) + else: + self.dynamic_filter_banks.add(bank_id) + + self._apply_dynamic_bank_topology() + return True + + def _estimate_required_banks(self, demand_line, num_port, total_banks): + """ + Estimate required banks from instantaneous request pressure. + """ + valid_reqs = int(np.count_nonzero(demand_line != -1)) + if valid_reqs == 0: + return 1 + est_banks = int(math.ceil(valid_reqs / max(1, num_port))) + est_banks = min(total_banks - 1, max(1, est_banks)) + return est_banks + + def _initialize_dynamic_bank_allocator(self, ifmap_demand_mat, filter_demand_mat): + """ + Initialize bank pools and do a demand-proportional warm-start assignment. + """ + total_banks = self.static_ifmap_sram_bank_num + self.static_filter_sram_bank_num + if total_banks < 2: + self.enable_dynamic_bank_allocation = False + return + + # Start with one dedicated bank each, and keep the rest in a free pool. + self.dynamic_ifmap_banks = {0} + self.dynamic_filter_banks = {1} + self.dynamic_unassigned_banks = list(range(2, total_banks)) + self._apply_dynamic_bank_topology() + + # Warm-start: estimate pressure from total valid accesses and allocate half of pool. + ifmap_total_reqs = int(np.count_nonzero(ifmap_demand_mat != -1)) + filter_total_reqs = int(np.count_nonzero(filter_demand_mat != -1)) + total_reqs = ifmap_total_reqs + filter_total_reqs + if total_reqs == 0 or len(self.dynamic_unassigned_banks) == 0: + return + + target_ifmap = int(round(total_banks * (ifmap_total_reqs / total_reqs))) + target_ifmap = min(total_banks - 1, max(1, target_ifmap)) + target_filter = total_banks - target_ifmap + + warm_assign_budget = len(self.dynamic_unassigned_banks) // 2 + while warm_assign_budget > 0 and len(self.dynamic_unassigned_banks) > 0: + deficit_ifmap = max(0, target_ifmap - len(self.dynamic_ifmap_banks)) + deficit_filter = max(0, target_filter - len(self.dynamic_filter_banks)) + + if deficit_ifmap == 0 and deficit_filter == 0: + break + + if deficit_ifmap > deficit_filter: + self._assign_one_dynamic_bank(assign_to_ifmap=True) + elif deficit_filter > deficit_ifmap: + self._assign_one_dynamic_bank(assign_to_ifmap=False) + else: + self._assign_one_dynamic_bank(assign_to_ifmap=(ifmap_total_reqs >= filter_total_reqs)) + + warm_assign_budget -= 1 + + def _dynamic_allocate_from_demand(self, ifmap_demand_line, filter_demand_line): + """ + Allocate banks based on current-line demand while preserving exclusivity. + """ + if len(self.dynamic_unassigned_banks) == 0: + return + + total_banks = self.static_ifmap_sram_bank_num + self.static_filter_sram_bank_num + req_ifmap = self._estimate_required_banks(ifmap_demand_line, + self.ifmap_sram_bank_port, + total_banks) + req_filter = self._estimate_required_banks(filter_demand_line, + self.filter_sram_bank_port, + total_banks) + + while len(self.dynamic_unassigned_banks) > 0: + deficit_ifmap = max(0, req_ifmap - len(self.dynamic_ifmap_banks)) + deficit_filter = max(0, req_filter - len(self.dynamic_filter_banks)) + if deficit_ifmap == 0 and deficit_filter == 0: + break + + if deficit_ifmap > deficit_filter: + self._assign_one_dynamic_bank(assign_to_ifmap=True) + elif deficit_filter > deficit_ifmap: + self._assign_one_dynamic_bank(assign_to_ifmap=False) + else: + ifmap_reqs = int(np.count_nonzero(ifmap_demand_line != -1)) + filter_reqs = int(np.count_nonzero(filter_demand_line != -1)) + self._assign_one_dynamic_bank(assign_to_ifmap=(ifmap_reqs >= filter_reqs)) + + def _dynamic_allocate_from_stall_feedback(self, ifmap_stall, filter_stall): + """ + Allocate one extra bank to the side with larger stall pressure. + """ + if len(self.dynamic_unassigned_banks) == 0: + return + + if ifmap_stall <= 0 and filter_stall <= 0: + return + + if ifmap_stall > filter_stall: + self._assign_one_dynamic_bank(assign_to_ifmap=True) + elif filter_stall > ifmap_stall: + self._assign_one_dynamic_bank(assign_to_ifmap=False) + # def service_memory_requests(self, ifmap_demand_mat, filter_demand_mat, ofmap_demand_mat): """ @@ -250,25 +402,37 @@ def service_memory_requests(self, ifmap_demand_mat, filter_demand_mat, ofmap_dem filter_serviced_cycles = [] ofmap_serviced_cycles = [] + if self.enable_dynamic_bank_allocation: + self._initialize_dynamic_bank_allocator(ifmap_demand_mat, filter_demand_mat) + pbar_disable = not self.verbose for i in tqdm(range(ofmap_lines), disable=pbar_disable): cycle_arr = np.zeros((1,1)) + i + self.stall_cycles ifmap_demand_line = ifmap_demand_mat[i, :].reshape((1,ifmap_demand_mat.shape[1])) + filter_demand_line = filter_demand_mat[i, :].reshape((1, filter_demand_mat.shape[1])) + + if self.enable_dynamic_bank_allocation: + # Permanent one-way assignment from free pool to IFMAP/FILTER. + self._dynamic_allocate_from_demand(ifmap_demand_line, filter_demand_line) + ifmap_cycle_out = \ self.ifmap_buf.service_reads(incoming_requests_arr_np=ifmap_demand_line, incoming_cycles_arr=cycle_arr) ifmap_serviced_cycles += [ifmap_cycle_out[0]] ifmap_stalls = ifmap_cycle_out[0] - cycle_arr[0] - ifmap_hit_latency - filter_demand_line = filter_demand_mat[i, :].reshape((1, filter_demand_mat.shape[1])) filter_cycle_out = \ self.filter_buf.service_reads(incoming_requests_arr_np=filter_demand_line, incoming_cycles_arr=cycle_arr) filter_serviced_cycles += [filter_cycle_out[0]] filter_stalls = filter_cycle_out[0] - cycle_arr[0] - filter_hit_latency + if self.enable_dynamic_bank_allocation: + self._dynamic_allocate_from_stall_feedback(ifmap_stall=float(ifmap_stalls[0]), + filter_stall=float(filter_stalls[0])) + ofmap_demand_line = ofmap_demand_mat[i, :].reshape((1, ofmap_demand_mat.shape[1])) ofmap_cycle_out = \ self.ofmap_buf.service_writes(incoming_requests_arr_np=ofmap_demand_line, @@ -483,6 +647,17 @@ def get_stall_cycles(self): assert self.traces_valid, 'Traces not generated yet' return int(self.stall_cycles) + def get_final_ifmap_filter_bank_allocation(self): + """ + Method to get final IFMAP/FILTER bank ownership after simulation. + """ + assert self.params_valid_flag, 'Memories not initialized yet' + + if self.enable_dynamic_bank_allocation and len(self.dynamic_ifmap_banks) > 0 and len(self.dynamic_filter_banks) > 0: + return len(self.dynamic_ifmap_banks), len(self.dynamic_filter_banks) + + return int(self.static_ifmap_sram_bank_num), int(self.static_filter_sram_bank_num) + # def get_ifmap_sram_start_stop_cycles(self): """ diff --git a/scalesim/memory/read_buffer.py b/scalesim/memory/read_buffer.py index 0d54bc599..57e95af05 100644 --- a/scalesim/memory/read_buffer.py +++ b/scalesim/memory/read_buffer.py @@ -61,6 +61,9 @@ def __init__(self): self.trace_valid = False self.use_ramulator_trace = False self.enable_layout_evaluation = False + self.num_bank = 1 + self.num_port = 2 + self.bw_per_bank = 1 # def set_params(self, backing_buf_obj, @@ -89,11 +92,8 @@ def set_params(self, backing_buf_obj, self.req_gen_bandwidth = backing_buf_bw # Layout modeling - self.num_bank = num_bank - self.num_port = num_port # number of ports per bank - self.bw_per_bank = self.req_gen_bandwidth // self.num_bank # bandwidth per bank self.enable_layout_evaluation = enable_layout_evaluation - assert self.bw_per_bank * self.num_bank == self.req_gen_bandwidth, f"overall bandwidth must be divisible by total number of banks, number of banks = {self.num_bank}, bandwidth of each as {self.bw_per_bank}, total bandwidth = {self.req_gen_bandwidth}" + self._set_bank_topology(num_bank=num_bank, num_port=num_port) # Ramulator trace self.use_ramulator_trace = use_ramulator_trace @@ -140,6 +140,33 @@ def reset(self): # TODO: check if all resets are working propoerly self.hashed_buffer_valid = False self.trace_valid = False self.use_ramulator_trace = False + self.num_bank = 1 + self.num_port = 2 + self.bw_per_bank = 1 + + def _set_bank_topology(self, num_bank, num_port=None): + """ + Internal helper to update bank/port topology used by layout conflict modeling. + """ + if num_bank is None: + num_bank = self.num_bank + if num_port is None: + num_port = self.num_port + + assert num_bank >= 1, f"number of banks must be >= 1, got {num_bank}" + assert num_port >= 1, f"number of ports must be >= 1, got {num_port}" + + self.num_bank = int(num_bank) + self.num_port = int(num_port) + + # Use ceil so runtime bank repartition can work even when bandwidth is not divisible. + self.bw_per_bank = max(1, int(math.ceil(self.req_gen_bandwidth / self.num_bank))) + + def update_bank_topology(self, num_bank, num_port=None): + """ + Public API to update SRAM bank topology at runtime. + """ + self._set_bank_topology(num_bank=num_bank, num_port=num_port) # def set_fetch_matrix(self, fetch_matrix_np): @@ -325,6 +352,7 @@ def service_reads(self, # because data access in compiled layout turns to be contiguious, which accesses the continuous addresses. # In (contiguous mapping), such multi-bank mapping would result in more bank conflict slowdown. bank_id = column_addr // self.bw_per_bank + bank_id = min(bank_id, self.num_bank - 1) assert bank_id < self.num_bank, f"bank id = {bank_id} for column_addr = {column_addr} needs to be smaller than total number of bank = {self.num_bank}" if line_addr not in concurrent_line_addr[bank_id]: concurrent_line_addr[bank_id].append(line_addr) diff --git a/scalesim/scale_config.py b/scalesim/scale_config.py index 109390fb6..7df8e3209 100644 --- a/scalesim/scale_config.py +++ b/scalesim/scale_config.py @@ -62,6 +62,9 @@ def __init__(self): # Time linear model parameter self.time_linear_model = 'None' + + # Dynamic SRAM bank allocation (IFMAP/FILTER shared pool) + self.enable_dynamic_bank_allocation = False # def read_conf_file(self, conf_file_in): """ @@ -101,6 +104,10 @@ def read_conf_file(self, conf_file_in): self.time_linear_model = config.get(section, 'TimeLinearModel') assert self.time_linear_model in ['None', 'TPUv4', 'TPUv5e', 'TPUv6e'], f"ERROR: Invalid time linear model '{self.time_linear_model}'. Must be one of: None, TPUv4, TPUv5e, TPUv6e" + # Parse EnableDynamic if present + if config.has_option(section, 'EnableDynamic'): + self.enable_dynamic_bank_allocation = config.getboolean(section, 'EnableDynamic') + # TODO Sarbartha: Should be bw div_factor = 1 @@ -244,6 +251,7 @@ def write_conf_file(self, conf_file_out): config.set(section, 'InterfaceBandwidth', str(bw_mode)) config.set(section, 'UseRamulatorTrace', str(self.use_ramulator_trace)) config.set(section, 'TimeLinearModel', str(self.time_linear_model)) + config.set(section, 'EnableDynamic', str(self.enable_dynamic_bank_allocation)) with open(conf_file_out, 'w') as configfile: config.write(configfile) @@ -515,6 +523,14 @@ def get_time_linear_model(self): if self.valid_conf_flag: return self.time_linear_model return "Default" + + def get_enable_dynamic_bank_allocation(self): + """ + Method to check if IFMAP/FILTER SRAM bank allocation should be dynamic. + """ + if self.valid_conf_flag: + return self.enable_dynamic_bank_allocation + return False # FIX ISSUE #14 @staticmethod diff --git a/scalesim/simulator.py b/scalesim/simulator.py index fce0a609d..a95c774fa 100644 --- a/scalesim/simulator.py +++ b/scalesim/simulator.py @@ -173,7 +173,7 @@ def generate_reports(self): compute_report_name = self.top_path + '/COMPUTE_REPORT.csv' compute_report = open(compute_report_name, 'w') header = ('LayerID, Total Cycles (incl. prefetch), Total Cycles, Stall Cycles, Overall Util %, Mapping Efficiency %,' - ' Compute Util %,\n') + ' Compute Util %, Final IFMAP/FILTER Bank Allocation,\n') compute_report.write(header) # Create TIME_REPORT.csv for linear model time conversion diff --git a/scalesim/single_layer_sim.py b/scalesim/single_layer_sim.py index 7f1f86c9e..284e1957a 100644 --- a/scalesim/single_layer_sim.py +++ b/scalesim/single_layer_sim.py @@ -55,6 +55,7 @@ def __init__(self): self.overall_util = 0 self.mapping_eff = 0 self.compute_util = 0 + self.final_bank_allocation = '0/0' # Report items : BW report self.avg_ifmap_sram_bw = 0 @@ -279,6 +280,7 @@ def run(self): filter_sram_bank_port=self.config.filter_sram_bank_port, using_ifmap_custom_layout=self.using_ifmap_custom_layout, using_filter_custom_layout=self.using_filter_custom_layout, + enable_dynamic_bank_allocation=self.config.get_enable_dynamic_bank_allocation(), estimate_bandwidth_mode=estimate_bandwidth_mode, config=self.config, topo=self.topo @@ -337,6 +339,8 @@ def calc_report_data(self): self.overall_util = (self.num_compute * 100) / (self.total_cycles * self.num_mac_unit) self.mapping_eff = self.compute_system.get_avg_mapping_efficiency() * 100 self.compute_util = self.compute_system.get_avg_compute_utilization() * 100 + final_ifmap_banks, final_filter_banks = self.memory_system.get_final_ifmap_filter_bank_allocation() + self.final_bank_allocation = f'{int(final_ifmap_banks)}/{int(final_filter_banks)}' # BW report self.ifmap_sram_reads = self.compute_system.get_ifmap_requests() @@ -401,7 +405,8 @@ def get_compute_report_items(self): self.stall_cycles, self.overall_util, self.mapping_eff, - self.compute_util] + self.compute_util, + self.final_bank_allocation] return items # From e528896353acc1734149d7c4de2ef9fc0b6867ef Mon Sep 17 00:00:00 2001 From: Mikemy666 <2535634982@qq.com> Date: Tue, 17 Mar 2026 13:16:10 +0800 Subject: [PATCH 2/3] add bank capacity util report --- .../memory/double_buffered_scratchpad_mem.py | 54 +++++++++++++++++++ scalesim/simulator.py | 2 +- scalesim/single_layer_sim.py | 8 ++- topologies/MoE/Switchtrans.csv | 20 +++++++ 4 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 topologies/MoE/Switchtrans.csv diff --git a/scalesim/memory/double_buffered_scratchpad_mem.py b/scalesim/memory/double_buffered_scratchpad_mem.py index 79d19b69d..801eb145b 100644 --- a/scalesim/memory/double_buffered_scratchpad_mem.py +++ b/scalesim/memory/double_buffered_scratchpad_mem.py @@ -658,6 +658,60 @@ def get_final_ifmap_filter_bank_allocation(self): return int(self.static_ifmap_sram_bank_num), int(self.static_filter_sram_bank_num) + def _get_unique_payload_words(self, trace_matrix): + """ + Count unique valid addresses from a trace matrix payload region. + """ + if trace_matrix is None or trace_matrix.size == 0: + return 0 + + payload = trace_matrix[:, 1:] + if payload.size == 0: + return 0 + + flat_payload = payload.reshape(-1) + valid = flat_payload[flat_payload != -1] + if valid.size == 0: + return 0 + + return int(np.unique(valid).size) + + def get_ifmap_filter_bank_capacity_utilization(self): + """ + Return IFMAP/FILTER bank capacity utilization. + + Utilization definition: + used_capacity / (bank_count * per_bank_capacity) + """ + assert self.traces_valid, 'Traces not generated yet' + + final_ifmap_banks, final_filter_banks = self.get_final_ifmap_filter_bank_allocation() + + # Per-bank capacity is derived from configured static banks. + ifmap_per_bank_capacity = max(1.0, self.ifmap_buf.total_size_bytes / max(1, self.static_ifmap_sram_bank_num)) + filter_per_bank_capacity = max(1.0, self.filter_buf.total_size_bytes / max(1, self.static_filter_sram_bank_num)) + + ifmap_total_capacity = max(1.0, final_ifmap_banks * ifmap_per_bank_capacity) + filter_total_capacity = max(1.0, final_filter_banks * filter_per_bank_capacity) + + ifmap_used_words = self._get_unique_payload_words(self.ifmap_trace_matrix) + filter_used_words = self._get_unique_payload_words(self.filter_trace_matrix) + + ifmap_word_size = max(1, int(getattr(self.ifmap_buf, 'word_size', 1))) + filter_word_size = max(1, int(getattr(self.filter_buf, 'word_size', 1))) + + ifmap_used_capacity = ifmap_used_words * ifmap_word_size + filter_used_capacity = filter_used_words * filter_word_size + + # Cap to 100% as a capacity-utilization metric. + ifmap_used_capacity = min(ifmap_used_capacity, ifmap_total_capacity) + filter_used_capacity = min(filter_used_capacity, filter_total_capacity) + + ifmap_util = ifmap_used_capacity / ifmap_total_capacity + filter_util = filter_used_capacity / filter_total_capacity + + return float(ifmap_util), float(filter_util) + # def get_ifmap_sram_start_stop_cycles(self): """ diff --git a/scalesim/simulator.py b/scalesim/simulator.py index a95c774fa..9eed2030d 100644 --- a/scalesim/simulator.py +++ b/scalesim/simulator.py @@ -173,7 +173,7 @@ def generate_reports(self): compute_report_name = self.top_path + '/COMPUTE_REPORT.csv' compute_report = open(compute_report_name, 'w') header = ('LayerID, Total Cycles (incl. prefetch), Total Cycles, Stall Cycles, Overall Util %, Mapping Efficiency %,' - ' Compute Util %, Final IFMAP/FILTER Bank Allocation,\n') + ' Compute Util %, Final IFMAP/FILTER Bank Allocation, IFMAP Bank Capacity Util %, FILTER Bank Capacity Util %,\n') compute_report.write(header) # Create TIME_REPORT.csv for linear model time conversion diff --git a/scalesim/single_layer_sim.py b/scalesim/single_layer_sim.py index 284e1957a..e83a88408 100644 --- a/scalesim/single_layer_sim.py +++ b/scalesim/single_layer_sim.py @@ -56,6 +56,8 @@ def __init__(self): self.mapping_eff = 0 self.compute_util = 0 self.final_bank_allocation = '0/0' + self.ifmap_bank_capacity_util = 0 + self.filter_bank_capacity_util = 0 # Report items : BW report self.avg_ifmap_sram_bw = 0 @@ -341,6 +343,8 @@ def calc_report_data(self): self.compute_util = self.compute_system.get_avg_compute_utilization() * 100 final_ifmap_banks, final_filter_banks = self.memory_system.get_final_ifmap_filter_bank_allocation() self.final_bank_allocation = f'{int(final_ifmap_banks)}/{int(final_filter_banks)}' + self.ifmap_bank_capacity_util, self.filter_bank_capacity_util = \ + self.memory_system.get_ifmap_filter_bank_capacity_utilization() # BW report self.ifmap_sram_reads = self.compute_system.get_ifmap_requests() @@ -406,7 +410,9 @@ def get_compute_report_items(self): self.overall_util, self.mapping_eff, self.compute_util, - self.final_bank_allocation] + self.final_bank_allocation, + self.ifmap_bank_capacity_util * 100, + self.filter_bank_capacity_util * 100] return items # diff --git a/topologies/MoE/Switchtrans.csv b/topologies/MoE/Switchtrans.csv new file mode 100644 index 000000000..4a1e4e810 --- /dev/null +++ b/topologies/MoE/Switchtrans.csv @@ -0,0 +1,20 @@ +Layer,M,N,K, +Attn_Q_proj,512,384,384, +Attn_K_proj,512,384,384, +Attn_V_proj,512,384,384, + +QKT_head,512,1024,64, +QKTV_head,512,64,1024, + +Attn_O_proj,512,384,384, + +Router_logits,512,4,384, + +MoE-E0-FF1,156,1536,384, +MoE-E0-FF2,156,384,1536, +MoE-E1-FF1,135,1536,384, +MoE-E1-FF2,135,384,1536, +MoE-E2-FF1,116,1536,384, +MoE-E2-FF2,116,384,1536, +MoE-E3-FF1,105,1536,384, +MoE-E3-FF2,105,384,1536, \ No newline at end of file From d7b3b984e000caf19c8acf76657cc784e7c81b64 Mon Sep 17 00:00:00 2001 From: Mikemy666 <2535634982@qq.com> Date: Tue, 17 Mar 2026 14:16:53 +0800 Subject: [PATCH 3/3] fix bugs of memory util --- .../memory/double_buffered_scratchpad_mem.py | 109 ++++++++------ scalesim/memory/read_buffer.py | 133 ++++++++++++------ 2 files changed, 157 insertions(+), 85 deletions(-) diff --git a/scalesim/memory/double_buffered_scratchpad_mem.py b/scalesim/memory/double_buffered_scratchpad_mem.py index 801eb145b..06c6e481b 100644 --- a/scalesim/memory/double_buffered_scratchpad_mem.py +++ b/scalesim/memory/double_buffered_scratchpad_mem.py @@ -88,6 +88,8 @@ def __init__(self): self.dynamic_ifmap_banks = set() self.dynamic_filter_banks = set() self.dynamic_unassigned_banks = [] + self.dynamic_target_ifmap_banks = 1 + self.dynamic_target_filter_banks = 1 # def set_params(self, @@ -193,6 +195,8 @@ def set_params(self, self.dynamic_ifmap_banks = set() self.dynamic_filter_banks = set() self.dynamic_unassigned_banks = [] + self.dynamic_target_ifmap_banks = 1 + self.dynamic_target_filter_banks = 1 self.params_valid_flag = True @@ -294,9 +298,38 @@ def _estimate_required_banks(self, demand_line, num_port, total_banks): est_banks = min(total_banks - 1, max(1, est_banks)) return est_banks + def _estimate_unique_demand_bytes(self, demand_mat, word_size=1): + """ + Estimate unique demanded payload size in bytes for one operand. + """ + flat_payload = demand_mat.reshape(-1) + valid_payload = flat_payload[flat_payload != -1] + if valid_payload.size == 0: + return 0.0 + unique_words = np.unique(valid_payload).size + return float(unique_words * max(1, int(word_size))) + + def _allocate_towards_target_distribution(self): + """ + Allocate all free banks towards target IFMAP/FILTER distribution. + """ + while len(self.dynamic_unassigned_banks) > 0: + deficit_ifmap = max(0, self.dynamic_target_ifmap_banks - len(self.dynamic_ifmap_banks)) + deficit_filter = max(0, self.dynamic_target_filter_banks - len(self.dynamic_filter_banks)) + + if deficit_ifmap == 0 and deficit_filter == 0: + break + + if deficit_ifmap > deficit_filter: + self._assign_one_dynamic_bank(assign_to_ifmap=True) + elif deficit_filter > deficit_ifmap: + self._assign_one_dynamic_bank(assign_to_ifmap=False) + else: + self._assign_one_dynamic_bank(assign_to_ifmap=(len(self.dynamic_ifmap_banks) <= len(self.dynamic_filter_banks))) + def _initialize_dynamic_bank_allocator(self, ifmap_demand_mat, filter_demand_mat): """ - Initialize bank pools and do a demand-proportional warm-start assignment. + Initialize bank pools and assign banks to balance capacity utilization. """ total_banks = self.static_ifmap_sram_bank_num + self.static_filter_sram_bank_num if total_banks < 2: @@ -309,67 +342,50 @@ def _initialize_dynamic_bank_allocator(self, ifmap_demand_mat, filter_demand_mat self.dynamic_unassigned_banks = list(range(2, total_banks)) self._apply_dynamic_bank_topology() - # Warm-start: estimate pressure from total valid accesses and allocate half of pool. - ifmap_total_reqs = int(np.count_nonzero(ifmap_demand_mat != -1)) - filter_total_reqs = int(np.count_nonzero(filter_demand_mat != -1)) - total_reqs = ifmap_total_reqs + filter_total_reqs - if total_reqs == 0 or len(self.dynamic_unassigned_banks) == 0: + if len(self.dynamic_unassigned_banks) == 0: return - target_ifmap = int(round(total_banks * (ifmap_total_reqs / total_reqs))) + ifmap_need_bytes = self._estimate_unique_demand_bytes(ifmap_demand_mat, + word_size=getattr(self.ifmap_buf, 'word_size', 1)) + filter_need_bytes = self._estimate_unique_demand_bytes(filter_demand_mat, + word_size=getattr(self.filter_buf, 'word_size', 1)) + + ifmap_per_bank_capacity = max(1.0, self.ifmap_buf.total_size_bytes / max(1, self.static_ifmap_sram_bank_num)) + filter_per_bank_capacity = max(1.0, self.filter_buf.total_size_bytes / max(1, self.static_filter_sram_bank_num)) + + ifmap_weight = ifmap_need_bytes / ifmap_per_bank_capacity + filter_weight = filter_need_bytes / filter_per_bank_capacity + + if ifmap_weight <= 0 and filter_weight <= 0: + ifmap_weight = 1.0 + filter_weight = 1.0 + + target_ifmap = int(round(total_banks * (ifmap_weight / (ifmap_weight + filter_weight)))) target_ifmap = min(total_banks - 1, max(1, target_ifmap)) target_filter = total_banks - target_ifmap - warm_assign_budget = len(self.dynamic_unassigned_banks) // 2 - while warm_assign_budget > 0 and len(self.dynamic_unassigned_banks) > 0: - deficit_ifmap = max(0, target_ifmap - len(self.dynamic_ifmap_banks)) - deficit_filter = max(0, target_filter - len(self.dynamic_filter_banks)) + self.dynamic_target_ifmap_banks = target_ifmap + self.dynamic_target_filter_banks = target_filter - if deficit_ifmap == 0 and deficit_filter == 0: - break + self._allocate_towards_target_distribution() - if deficit_ifmap > deficit_filter: + while len(self.dynamic_unassigned_banks) > 0: + if ifmap_weight >= filter_weight: self._assign_one_dynamic_bank(assign_to_ifmap=True) - elif deficit_filter > deficit_ifmap: - self._assign_one_dynamic_bank(assign_to_ifmap=False) else: - self._assign_one_dynamic_bank(assign_to_ifmap=(ifmap_total_reqs >= filter_total_reqs)) - - warm_assign_budget -= 1 + self._assign_one_dynamic_bank(assign_to_ifmap=False) def _dynamic_allocate_from_demand(self, ifmap_demand_line, filter_demand_line): """ - Allocate banks based on current-line demand while preserving exclusivity. + Allocate remaining banks towards precomputed target distribution. """ if len(self.dynamic_unassigned_banks) == 0: return - - total_banks = self.static_ifmap_sram_bank_num + self.static_filter_sram_bank_num - req_ifmap = self._estimate_required_banks(ifmap_demand_line, - self.ifmap_sram_bank_port, - total_banks) - req_filter = self._estimate_required_banks(filter_demand_line, - self.filter_sram_bank_port, - total_banks) - - while len(self.dynamic_unassigned_banks) > 0: - deficit_ifmap = max(0, req_ifmap - len(self.dynamic_ifmap_banks)) - deficit_filter = max(0, req_filter - len(self.dynamic_filter_banks)) - if deficit_ifmap == 0 and deficit_filter == 0: - break - - if deficit_ifmap > deficit_filter: - self._assign_one_dynamic_bank(assign_to_ifmap=True) - elif deficit_filter > deficit_ifmap: - self._assign_one_dynamic_bank(assign_to_ifmap=False) - else: - ifmap_reqs = int(np.count_nonzero(ifmap_demand_line != -1)) - filter_reqs = int(np.count_nonzero(filter_demand_line != -1)) - self._assign_one_dynamic_bank(assign_to_ifmap=(ifmap_reqs >= filter_reqs)) + self._allocate_towards_target_distribution() def _dynamic_allocate_from_stall_feedback(self, ifmap_stall, filter_stall): """ - Allocate one extra bank to the side with larger stall pressure. + Allocate one extra bank only when target distribution is not yet reached. """ if len(self.dynamic_unassigned_banks) == 0: return @@ -377,6 +393,11 @@ def _dynamic_allocate_from_stall_feedback(self, ifmap_stall, filter_stall): if ifmap_stall <= 0 and filter_stall <= 0: return + deficit_ifmap = max(0, self.dynamic_target_ifmap_banks - len(self.dynamic_ifmap_banks)) + deficit_filter = max(0, self.dynamic_target_filter_banks - len(self.dynamic_filter_banks)) + if deficit_ifmap == 0 and deficit_filter == 0: + return + if ifmap_stall > filter_stall: self._assign_one_dynamic_bank(assign_to_ifmap=True) elif filter_stall > ifmap_stall: diff --git a/scalesim/memory/read_buffer.py b/scalesim/memory/read_buffer.py index 57e95af05..0f5097326 100644 --- a/scalesim/memory/read_buffer.py +++ b/scalesim/memory/read_buffer.py @@ -37,6 +37,8 @@ def __init__(self): # Status of the buffer self.hashed_buffer = {} + self.hashed_buffer_loc_map = {} + self.active_buffer_addr_map = {} self.num_lines = 0 self.num_active_buf_lines = 1 self.num_prefetch_buf_lines = 1 @@ -120,6 +122,8 @@ def reset(self): # TODO: check if all resets are working propoerly # Status of the buffer self.hashed_buffer = {} + self.hashed_buffer_loc_map = {} + self.active_buffer_addr_map = {} self.active_buffer_set_limits = [] self.prefetch_buffer_set_limits = [] @@ -202,6 +206,10 @@ def prepare_hashed_buffer(self): """ Method to convert the fetch matrix into a hashed buffer for fast lookups. """ + self.hashed_buffer = {} + self.hashed_buffer_loc_map = {} + self.active_buffer_addr_map = {} + elems_per_set = math.ceil(self.total_size_elems / 100) if self.enable_layout_evaluation: elems_per_set = self.req_gen_bandwidth @@ -212,6 +220,7 @@ def prepare_hashed_buffer(self): line_id = 0 elem_ctr = 0 current_line = set() + current_line_loc_map = {} for r in range(prefetch_rows): for c in range(prefetch_cols): @@ -219,15 +228,19 @@ def prepare_hashed_buffer(self): if not elem == -1: current_line.add(elem) + current_line_loc_map[elem] = elem_ctr elem_ctr += 1 if not elem_ctr < elems_per_set: # ie > or = self.hashed_buffer[line_id] = current_line + self.hashed_buffer_loc_map[line_id] = current_line_loc_map line_id += 1 elem_ctr = 0 current_line = set() # new set + current_line_loc_map = {} self.hashed_buffer[line_id] = current_line + self.hashed_buffer_loc_map[line_id] = current_line_loc_map max_num_active_buf_lines = int(math.ceil(self.active_buf_size / elems_per_set)) max_num_prefetch_buf_lines = int(math.ceil(self.prefetch_buf_size / elems_per_set)) @@ -248,6 +261,58 @@ def prepare_hashed_buffer(self): self.num_lines = num_lines self.hashed_buffer_valid = True + def _iter_active_line_ids(self): + """ + Iterate through line ids currently visible in the active buffer window. + """ + if len(self.active_buffer_set_limits) != 2: + return + + start_id, end_id = self.active_buffer_set_limits + if start_id < end_id: + for line_id in range(start_id, end_id): + yield line_id + else: + for line_id in range(start_id, self.num_lines): + yield line_id + for line_id in range(end_id): + yield line_id + + def _refresh_active_buffer_addr_map(self): + """ + Refresh fast lookup map for active-buffer addresses. + """ + self.active_buffer_addr_map = {} + for line_id in self._iter_active_line_ids(): + line_map = self.hashed_buffer_loc_map.get(line_id, {}) + for addr, col_id in line_map.items(): + self.active_buffer_addr_map[addr] = (line_id, col_id) + + def _get_max_prefetch_retries(self): + """ + Upper bound for miss-driven prefetch retries per address. + """ + return max(8, int(self.num_lines * 2)) + + def _cycle_to_scalar(self, cycle): + """ + Convert numpy scalar/array cycle representation to python int-like scalar. + """ + if isinstance(cycle, np.ndarray): + return float(cycle[0]) + return float(cycle) + + def _apply_miss_retry_fallback(self, cycle, offset): + """ + Apply bounded penalty and move on when an address repeatedly misses. + """ + cycle_scalar = self._cycle_to_scalar(cycle) + penalty = max(1, self.hit_latency, self.num_prefetch_buf_lines, self.num_active_buf_lines) + potential_stall_cycles = self.last_prefetch_cycle - (cycle_scalar + offset) + if potential_stall_cycles > 0: + penalty = max(penalty, int(potential_stall_cycles)) + return offset + penalty + # def active_buffer_hit(self, addr): """ @@ -255,47 +320,16 @@ def active_buffer_hit(self, addr): """ assert self.active_buf_full_flag, 'Active buffer is not ready yet' - start_id, end_id = self.active_buffer_set_limits + if len(self.active_buffer_addr_map) == 0: + self._refresh_active_buffer_addr_map() + if self.enable_layout_evaluation: - if start_id < end_id: - for line_id in range(start_id, end_id): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return line_id, list(this_set).index(addr) - - else: - for line_id in range(start_id, self.num_lines): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return line_id, list(this_set).index(addr) - - for line_id in range(end_id): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return line_id, list(this_set).index(addr) - # Fixing for ISSUE #14 - # return True - return -1, -1 + loc = self.active_buffer_addr_map.get(addr, None) + if loc is None: + return -1, -1 + return loc else: - if start_id < end_id: - for line_id in range(start_id, end_id): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return True - - else: - for line_id in range(start_id, self.num_lines): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return True - - for line_id in range(end_id): - this_set = self.hashed_buffer[line_id] # O(1) --> accessing hash - if addr in this_set: # Checking in a set(), O(1) lookup - return True - # Fixing for ISSUE #14 - # return True - return False + return addr in self.active_buffer_addr_map # def service_reads(self, @@ -337,13 +371,21 @@ def service_reads(self, # Fixing for ISSUE #14 # if not self.active_buffer_hit(addr): # --> While loop ensures multiple prefetches if needed line_addr, column_addr = self.active_buffer_hit(addr) - while line_addr == -1: + retry_ctr = 0 + max_retry = self._get_max_prefetch_retries() + while line_addr == -1 and retry_ctr < max_retry: self.new_prefetch() + retry_ctr += 1 potential_stall_cycles = self.last_prefetch_cycle - (cycle + offset) if potential_stall_cycles > 0: offset += potential_stall_cycles line_addr, column_addr = self.active_buffer_hit(addr) + + if line_addr == -1: + # Bounded fallback avoids pathological near-infinite miss loops. + offset = self._apply_miss_retry_fallback(cycle, offset) + continue # Layout Modeling 1 -- data mapping to multiple bank # The 2D array is interleaved mapped to multiple banks. @@ -384,12 +426,19 @@ def service_reads(self, # if addr not in self.active_buffer_contents: #this is super slow!!! # Fixing for ISSUE #14 # if not self.active_buffer_hit(addr): # --> While loop ensures multiple prefetches if needed - while not self.active_buffer_hit(addr): + retry_ctr = 0 + max_retry = self._get_max_prefetch_retries() + while not self.active_buffer_hit(addr) and retry_ctr < max_retry: self.new_prefetch() + retry_ctr += 1 potential_stall_cycles = self.last_prefetch_cycle - (cycle + offset) offset += potential_stall_cycles # Offset increments if there were potential stalls if potential_stall_cycles > 0: offset += potential_stall_cycles + + if not self.active_buffer_hit(addr): + offset = self._apply_miss_retry_fallback(cycle, offset) + continue if self.use_ramulator_trace == True: out_cycles = cycle + offset + dram_stall_cycles @@ -462,6 +511,7 @@ def prefetch_active_buffer(self, start_cycle): prefetch_buf_start_line_id = active_buf_end_line_id prefetch_buf_end_line_id = prefetch_buf_start_line_id + self.num_prefetch_buf_lines self.prefetch_buffer_set_limits = [prefetch_buf_start_line_id, prefetch_buf_end_line_id] + self._refresh_active_buffer_addr_map() self.active_buf_full_flag = True @@ -498,6 +548,7 @@ def new_prefetch(self): self.active_buffer_set_limits = [active_start, active_end] self.prefetch_buffer_set_limits = [prefetch_start, prefetch_end] + self._refresh_active_buffer_addr_map() # 2. Create the request start_idx = self.next_line_prefetch_idx