Hi,
I'm experimenting a bit with very simple networks and low memory accelerators.
I'm currently using generated spatial mappings (meaning that I'm not specifying them in the mapping file) and the get_hardware_performance_zigzag function.
The problem is, the function that generates the spatial mappings can generate mappings that are incompatible with the amount of memory available in the accelerator. When this happens, an exception is raised, meaning that the whole pipeline is killed (it stalls on the exception).
What I would expect is for the pipeline to discard the mappings that are not valid instead of halting the whole process.
This is most often solved by limiting the generated spatial mappings to 2–3 or by using a static spatial mapping, but this is often not ideal: in my experiments so far I'm finding that for the best optimization performance it's more convenient to try a range of 5 to 10 generated mappings.
I cannot upload the .onnx and yaml files, so I will copy them below in case you need to reproduce this behaviour.
Pytorch network
class BasicNetwork(nn.Module):
    """Minimal two-conv + linear network used to reproduce the ZigZag
    spatial-mapping issue (input expected as (N, 3, 160, 120))."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc3 = nn.Linear(16, 10)

    def forward(self, x: torch.Tensor):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten the spatial dims and move them before the channel dim so
        # the final Linear acts on the 16 channels of every spatial position.
        x = x.reshape((x.shape[0], x.shape[1], x.shape[2] * x.shape[3])).permute((0, 2, 1))
        x = self.fc3(x)
        return x
code I used to convert it to ONNX
# Export the network to ONNX with the (new) Dynamo-based exporter.
network = BasicNetwork().eval()
example_input = torch.zeros((1,3,160,120))
# NOTE(review): dynamo=True selects the torch.onnx Dynamo exporter; at the time
# of writing it does not record Conv kernel_shape attributes (patched later).
onnx_program = torch.onnx.export(network, (example_input,), dynamo=True, optimize=True, verify=True, strict=True)
onnx_model_save_path = "./onnx_model" # <------ change this
# File is named after the class, e.g. "BasicNetwork.onnx".
save_name = f"{network.__class__.__name__}.onnx"
save_path = os.path.normpath(os.path.join(onnx_model_save_path, save_name))
onnx_program.save(save_path)
slight modifications for ZigZag
# Post-process the exported ONNX models so ZigZag can parse them:
# attach an operand precision ("weight_size") and re-add the Conv
# kernel_shape attributes that the Dynamo exporter omits.
quantization = 8
root = "./onnx_model/"  # <------ change this
models = os.listdir(root)
for i, model in enumerate(models):
    models[i] = os.path.normpath(os.path.join(root, model))
zigzag_folder = "./zig_zag_tests/inputs/workload"  # <------ change this
for path in models:
    # os.path.basename is portable; splitting on '/' breaks on Windows,
    # where normpath produced backslash separators above.
    name = os.path.basename(path)
    save_path = os.path.normpath(os.path.join(zigzag_folder, name))
    onnx_model = onnx.load(path, load_external_data=False)
    onnx.checker.check_model(onnx_model)
    # this is necessary because Dynamo does not save the kernel shapes at the time of writing
    kernels = {}
    for node in onnx_model.graph.initializer:
        # 4-D 'weight' initializers are Conv kernels; keep their spatial dims.
        if 'weight' in node.name and len(node.dims) == 4:
            kernels[node.name] = node.dims[2:]
    for node in onnx_model.graph.node:
        attr = onnx.helper.make_attribute("weight_size", quantization)
        node.attribute.extend([attr])
        # ensure kernel_shape is present on every node that consumes a kernel
        for mat in node.input:
            if mat in kernels:
                attr = onnx.helper.make_attribute("kernel_shape", kernels[mat])
                node.attribute.extend([attr])
    onnx.save(onnx_model, save_path)
mapping.yaml
# Default entry in case layer name and operator type are not available
- name: default
memory_operand_links:
O: O
W: I2
I: I1
accelerator.yaml
name: accelerator1
operational_array:
unit_energy: 0.04 # pJ
unit_area: 1 # unit
dimensions: [D1, D2]
sizes: [160, 120]
memories:
core_mem:
size: 12000 # 12 KB
r_cost: 0.05 # pJ
w_cost: 0.05
area: 0
latency: 1
operands: [ I1, O, I2 ]
ports:
- name: rw_port_1
type: read_write
bandwidth_min: 8
bandwidth_max: 32
allocation:
- I2, th
- I2, tl
- I2, fh
- I2, fl
- name: rw_port_2
type: read_write
bandwidth_min: 8
bandwidth_max: 32
allocation:
- I1, th
- I1, tl
- I1, fh
- I1, fl
- name: rw_port_3
type: read_write
bandwidth_min: 8
bandwidth_max: 32
allocation:
- O, fl
- O, tl
- O, fh
- O, th
served_dimensions: []
sram:
size: 5000000 # 5 MB
r_cost: 19.6
w_cost: 22.5
area: 0
latency: 1
operands: [I1, O, I2]
ports:
- name: rw_port_1
type: read_write
bandwidth_min: 8
bandwidth_max: 128
allocation:
- I2, fl
- I2, tl
- I2, fh
- I2, th
- name: rw_port_2
type: read_write
bandwidth_min: 8
bandwidth_max: 128
allocation:
- I1, fl
- I1, tl
- I1, fh
- I1, th
- name: rw_port_3
type: read_write
bandwidth_min: 8
bandwidth_max: 128
allocation:
- O, fl
- O, tl
- O, fh
- O, th
served_dimensions: [D1, D2]
Code for launching the pipeline
import os
from datetime import datetime
from zigzag import api
from time import time
# todo: cli args
workload_path = "inputs/workload/BasicNetwork.onnx"
# Network name without extension, used to label the output folder.
id_name = os.path.split(workload_path)[-1].split('.')[0]
accelerator_path = "inputs/hardware/accelerator.yaml"
mapping_path = "inputs/mapping/mapping.yaml"
optimize_for = 'latency'
# Number of spatial mappings generated per layer; values above ~8 trigger
# the MemoryHierarchyTooSmallException described in this report.
spatial_tries = 10
temporal_tries = 6
data_locality = True
mixed_spatial_mapping = False
ENGINES = ('loma', 'salsa')
engine = ENGINES[1] # <---- I'm using SALSA
# NOTE(review): str(datetime.now()) contains spaces and colons — fine on
# POSIX, but would be an invalid path component on Windows.
experiment_id = datetime.now()
dump_folder = f"outputs/{experiment_id}_{id_name}_{optimize_for}"
pickle_filename = os.path.normpath(os.path.join(dump_folder,"cmes.pickle"))
start_time = time()
energy, latency, cmes = api.get_hardware_performance_zigzag(
temporal_mapping_search_engine=engine,
workload=workload_path,
accelerator=accelerator_path,
mapping=mapping_path,
opt=optimize_for,
dump_folder=dump_folder,
pickle_filename=pickle_filename,
nb_spatial_mappings_generated=spatial_tries,
lpf_limit=temporal_tries,
exploit_data_locality=data_locality,
enable_mix_spatial_mapping=mixed_spatial_mapping,
)
end_time = time()
print(f"\n\nSimulation took {end_time - start_time} s\n\n")
print(f"Total network energy = {energy:.2e} pJ")
print(f"Total network latency = {latency:.2e} cycles")
output
2025-08-01 15:17:44,669 - __init__ +46 - WARNING - Operator Conv not defined in mapping. Using default mapping instead.
2025-08-01 15:17:44,670 - __init__ +46 - WARNING - Operator MatMul not defined in mapping. Using default mapping instead.
2025-08-01 15:17:44,670 - parse_workload_from_onnx_model_and_mapping +103 - INFO - Created ONNXWorkload graph with 10 nodes and 9 edges.
2025-08-01 15:17:44,683 - run +49 - INFO - Processing node_Conv_0...
2025-08-01 15:17:44,684 - generate_accelerator_with_removing_unused_memory +383 - INFO - Update mem architecture for layer node_Conv_0...
2025-08-01 15:17:44,705 - run +93 - INFO - node_Conv_0: Launching spatial mapping 1/10 :{D1: {OY: 156}, D2: {OX: 116}}.
2025-08-01 15:17:44,710 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:17:47,567 - run +93 - INFO - node_Conv_0: Launching spatial mapping 2/10 :{D1: {OX: 116}, D2: {OY: 120}}.
2025-08-01 15:17:47,567 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:17:50,640 - run +93 - INFO - node_Conv_0: Launching spatial mapping 3/10 :{D1: {OY: 78}, D2: {OX: 116}}.
2025-08-01 15:17:50,641 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:17:53,813 - run +93 - INFO - node_Conv_0: Launching spatial mapping 4/10 :{D1: {OY: 156}, D2: {OX: 58}}.
2025-08-01 15:17:53,814 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:17:56,936 - run +93 - INFO - node_Conv_0: Launching spatial mapping 5/10 :{D1: {OX: 116}, D2: {OY: 60}}.
2025-08-01 15:17:56,937 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:00,079 - run +93 - INFO - node_Conv_0: Launching spatial mapping 6/10 :{D1: {OX: 58}, D2: {OY: 120}}.
2025-08-01 15:18:00,080 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:03,496 - run +93 - INFO - node_Conv_0: Launching spatial mapping 7/10 :{D1: {OY: 52}, D2: {OX: 116}}.
2025-08-01 15:18:03,497 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:06,651 - run +93 - INFO - node_Conv_0: Launching spatial mapping 8/10 :{D1: {OX: 116}, D2: {OY: 40}}.
2025-08-01 15:18:06,651 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:10,104 - run +93 - INFO - node_Conv_0: Launching spatial mapping 9/10 :{D1: {OY: 78}, D2: {OX: 58}}.
2025-08-01 15:18:10,104 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:13,534 - run +93 - INFO - node_Conv_0: Launching spatial mapping 10/10 :{D1: {OY: 39}, D2: {OX: 116}}.
2025-08-01 15:18:13,535 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:16,971 - run +48 - INFO - Saved CostModelEvaluation(node_Conv_0, Core(0)) with energy 9.540e+05 and latency 1.011e+03 to outputs/2025-08-01 15:17:44.662713_BasicNetwork_latency/node_Conv_0_complete.json
2025-08-01 15:18:17,004 - run +49 - INFO - Processing node_Conv_3...
2025-08-01 15:18:17,005 - generate_accelerator_with_removing_unused_memory +383 - INFO - Update mem architecture for layer node_Conv_3...
2025-08-01 15:18:17,022 - run +93 - INFO - node_Conv_3: Launching spatial mapping 1/10 :{D1: {OX: 54}, D2: {OY: 74}}.
2025-08-01 15:18:17,023 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:20,237 - run +93 - INFO - node_Conv_3: Launching spatial mapping 2/10 :{D1: {OY: 74}, D2: {OX: 54}}.
2025-08-01 15:18:20,238 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:23,395 - run +93 - INFO - node_Conv_3: Launching spatial mapping 3/10 :{D1: {OX: 54}, D2: {OY: 37}}.
2025-08-01 15:18:23,396 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:26,991 - run +93 - INFO - node_Conv_3: Launching spatial mapping 4/10 :{D1: {OY: 37}, D2: {OX: 54}}.
2025-08-01 15:18:26,992 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:30,436 - run +93 - INFO - node_Conv_3: Launching spatial mapping 5/10 :{D1: {OX: 27}, D2: {OY: 74}}.
2025-08-01 15:18:30,436 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:33,815 - run +93 - INFO - node_Conv_3: Launching spatial mapping 6/10 :{D1: {OY: 74}, D2: {OX: 27}}.
2025-08-01 15:18:33,816 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:37,189 - run +93 - INFO - node_Conv_3: Launching spatial mapping 7/10 :{D1: {OX: 18}, D2: {OY: 74}}.
2025-08-01 15:18:37,189 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:40,453 - run +93 - INFO - node_Conv_3: Launching spatial mapping 8/10 :{D1: {OY: 74}, D2: {OX: 18}}.
2025-08-01 15:18:40,454 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
2025-08-01 15:18:43,761 - run +93 - INFO - node_Conv_3: Launching spatial mapping 9/10 :{D1: {K: 16}, D2: {OY: 74}}.
2025-08-01 15:18:43,762 - run +89 - INFO - Running SALSA Temporal Mapping Optimizer with 1 core(s).
Process Process-19:
Traceback (most recent call last):
File "-----/python3.13/site-packages/multiprocessing_on_dill/process.py", line 254, in _bootstrap
self.run()
~~~~~~~~^^
File "-----/lib/python3.13/site-packages/multiprocessing_on_dill/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "-----/lib/python3.13/site-packages/zigzag/opt/salsa/engine.py", line 98, in run
self.run_simulated_annealing_opt(self.cme_queue)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
File "-----/lib/python3.13/site-packages/zigzag/opt/salsa/engine.py", line 109, in run_simulated_annealing_opt
best_state = SalsaState(
self.accelerator,
...<4 lines>...
self.mapping_type,
)
File "-----/lib/python3.13/site-packages/zigzag/opt/salsa/state.py", line 63, in __init__
self.temporal_mapping = allocator.run() # allocate this ordering to the memories
~~~~~~~~~~~~~^^
File "-----/lib/python3.13/site-packages/zigzag/opt/loma/memory_allocator.py", line 92, in run
self.allocate_node(node, top_levels)
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
File "-----/lib/python3.13/site-packages/zigzag/opt/loma/memory_allocator.py", line 146, in allocate_node
raise MemoryHierarchyTooSmallException(
f"Highest MemoryLevel for {mem_op} = {node} too small to store all loops."
)
zigzag.opt.loma.memory_allocator.MemoryHierarchyTooSmallException: Highest MemoryLevel for I1 = MemoryLevel(instance=core_mem,operands=[I1, I2, O], served_dimensions=set()) too small to store all loops.
Some more info
- If I put data_locality = False, the pipeline executes successfully
- If I lower the spatial tries, the pipeline executes successfully; 8 seems to be the limit with this specific setup
- changing the temporal tries does not change the behaviour
- setting mixed_spatial_mapping = True, the pipeline fails immediately (independently of data locality)
- changing the engine from 'salsa' to 'loma' yields the exact same behaviour
Hi,
I'm experimenting a bit with very simple networks and low memory accelerators.
I'm currently using generated spatial mappings (meaning that I'm not specifying them in the mapping file) and the get_hardware_performance_zigzag function.
The problem is, the function that generates the spatial mappings can generate mappings that are incompatible with the amount of memory available in the accelerator. When this happens, an exception is raised, meaning that the whole pipeline is killed (it stalls on the exception).
What I would expect is for the pipeline to discard the mappings that are not valid instead of halting the whole process.
This is most often solved by limiting the generated spatial mappings to 2–3 or by using a static spatial mapping, but this is often not ideal: in my experiments so far I'm finding that for the best optimization performance it's more convenient to try a range of 5 to 10 generated mappings.
I cannot upload the .onnx and yaml files, so I will copy them below in case you need to reproduce this behaviour.
Pytorch network
code I used to convert it to ONNX
slight modifications for ZigZag
mapping.yaml
accelerator.yaml
Code for launching the pipeline
output
Some more info