diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py
index aba6740d49..afd8b67032 100644
--- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py
+++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py
@@ -245,7 +245,14 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f
         if node.op in ["Conv", "RequantizedConv"]:
             # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. shift]
             for tensor in node.inputs[1:]:
-                _transformLayoutConst(tensor, spatialDims, default_channels_first)
+                # Standard case: The weight is a direct constant input.
+                if isinstance(tensor, gs.Constant):
+                    _transformLayoutConst(tensor, spatialDims, default_channels_first)
+
+                # MeZO case: The weight is produced by a Perturb node.
+                elif isinstance(tensor, gs.Variable):
+                    permute_temp = _transformLayoutPermutation(len(tensor.shape), spatialDims, default_channels_first)
+                    graph.nodes.append(_appendTranspose(tensor, node, permute_temp))
 
         node.attrs["channels_first"] = default_channels_first
 
diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
index d9d768fabc..05f9ef0358 100644
--- a/Deeploy/DeeployTypes.py
+++ b/Deeploy/DeeployTypes.py
@@ -2884,10 +2884,11 @@ def generateIOBufferInitializationCode(self) -> str:
 
         callStack += "static const uint32_t " + self.ctxt._mangle("num_inputs") + f" = {len(inputs)};"
         callStack += "static const uint32_t " + self.ctxt._mangle("num_outputs") + f" = {len(outputs)};"
-
+        callStack += "static const uint32_t seed = 12345;"  # fixed seed for reproducibility
+        callStack += "static const uint32_t perturbation_sign = 1;"  # fixed sign for reproducibility
         callStack += "extern void* " + self.ctxt._mangle("inputs") + f"[{len(inputs)}];"
         callStack += "extern void* " + self.ctxt._mangle("outputs") + f"[{len(outputs)}];"
-
+    
         callStack += "static const uint32_t " + self.ctxt._mangle("inputs_bytes") + f"[{len(inputs)}] = " + "{"
 
         numBytes = []
@@ -3049,6 +3050,8 @@ def generateIncludeString(self) -> str:
         for engine in self.Platform.engines:
             for include in engine.includeList:
                 includeStr += ["#include \"" + include + "\""]
+            if engine.name == "GAP9Cluster":
+                includeStr += ["#include \"kernel/RandomNoise.h\""]
         return ("\n").join(includeStr)
 
     def generateEngineInitializationCode(self) -> str:
diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py
index 0e7b052f46..042f3a38a1 100644
--- a/Deeploy/Targets/GAP9/Bindings.py
+++ b/Deeploy/Targets/GAP9/Bindings.py
@@ -26,7 +26,7 @@
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
     GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
     QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
-    SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker
+    SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, PerturbZOChecker
 from Deeploy.Targets.PULPOpen.Bindings import ForkClosure, L3MemoryAwareFunctionCallClosure, \
     MemoryAwareForkTransformer, MemoryAwareFunctionCallClosure, TilingCallClosure
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
@@ -39,7 +39,8 @@
     FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \
     MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \
     SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \
-    iRMSNormTemplate, iSoftmaxTemplate
+    iRMSNormTemplate, iSoftmaxTemplate, FloatPerturbEggrollTemplate, FloatPerturbUniformTemplate, FloatPerturbNormalTemplate, \
+    FloatPerturbRademacherTemplate, FloatPerturbTriangleTemplate 
 from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
     PULPRequantShiftChecker
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \
@@ -328,6 +329,9 @@
 GAP9ConcatBindings = [
     NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                 ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+                ConcatTemplate.referenceTemplate, GAP9ClusterTransformer)
 ]
 
 GAP9iRMSNormBindings = [
@@ -397,3 +401,33 @@
     NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
                 GAP9Transformer),
 ]
+
+# PerturbNormal: float32 input, float32 output.
+GAP9PerturbNormalBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbNormalTemplate.referenceTemplate, GAP9Transformer)
+]
+
+# PerturbUniform: float32 input, float32 output.
+GAP9PerturbUniformBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbUniformTemplate.referenceTemplate, GAP9Transformer)
+]
+
+# PerturbEggroll: int32 input, float32 output (the only Perturb binding with an integer input).
+GAP9PerturbEggrollBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(int32_t)], [PointerClass(float32_t)]),
+                FloatPerturbEggrollTemplate.referenceTemplate, GAP9Transformer)
+]
+
+# PerturbRademacher: float32 input, float32 output.
+GAP9PerturbRademacherBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbRademacherTemplate.referenceTemplate, GAP9Transformer)
+]
+
+# PerturbTriangle: float32 input, float32 output.
+GAP9PerturbTriangleBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbTriangleTemplate.referenceTemplate, GAP9Transformer)
+]
diff --git a/Deeploy/Targets/GAP9/Platform.py b/Deeploy/Targets/GAP9/Platform.py
index d40c2c4440..28da0185db 100644
--- a/Deeploy/Targets/GAP9/Platform.py
+++ b/Deeploy/Targets/GAP9/Platform.py
@@ -22,20 +22,23 @@
     GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \
     GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \
     GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \
-    GAP9UniformRQSTilingReadyBindings
+    GAP9UniformRQSTilingReadyBindings, GAP9PerturbNormalTilingReadyBindings, GAP9PerturbUniformTilingReadyBindings, \
+    GAP9PerturbEggrollTilingReadyBindings, GAP9PerturbRademacherTilingReadyBindings, GAP9PerturbTriangleTilingReadyBindings
 from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \
     BasicRQIntegerDivBinding
 from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \
     LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \
     ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \
     SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \
-    TransposeLayer, iHardswishLayer, iRMSNormLayer
+    TransposeLayer, iHardswishLayer, iRMSNormLayer, PerturbEggrollLayer, PerturbNormalLayer, PerturbRademacherLayer,\
+    PerturbTriangleLayer, PerturbUniformLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \
     GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \
     QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \
     RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \
     SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \
-    TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser
+    TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser, \
+    PerturbEggrollParser, PerturbNormalParser, PerturbRademacherParser, PerturbTriangleParser, PerturbUniformParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \
     PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings, PULPSliceBindings
@@ -93,6 +96,11 @@
 GAP9_QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings)
 GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings)
 GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings)
+GAP9_PerturbNormalMapper = NodeMapper(PerturbNormalParser(), GAP9PerturbNormalTilingReadyBindings)
+GAP9_PerturbUniformMapper = NodeMapper(PerturbUniformParser(), GAP9PerturbUniformTilingReadyBindings)
+GAP9_PerturbEggrollMapper = NodeMapper(PerturbEggrollParser(), GAP9PerturbEggrollTilingReadyBindings)
+GAP9_PerturbRademacherMapper = NodeMapper(PerturbRademacherParser(), GAP9PerturbRademacherTilingReadyBindings)
+GAP9_PerturbTriangleMapper = NodeMapper(PerturbTriangleParser(), GAP9PerturbTriangleTilingReadyBindings)
 
 # GAP9-specific mapping using ClDma
 GAP9Mapping = {
@@ -171,7 +179,17 @@
     'SoftmaxCrossEntropyLossGrad':
         SoftmaxCrossEntropyLossGradLayer([GAP9_SoftmaxCrossEntropyLossGradMapper]),
     'SGD':
-        SGDLayer([GAP9_SGDMapper])
+        SGDLayer([GAP9_SGDMapper]),
+    'PerturbNormal': 
+        PerturbNormalLayer([GAP9_PerturbNormalMapper]),
+    'PerturbUniform': 
+        PerturbUniformLayer([GAP9_PerturbUniformMapper]),
+    'PerturbEggroll': 
+        PerturbEggrollLayer([GAP9_PerturbEggrollMapper]),
+    'PerturbRademacher': 
+        PerturbRademacherLayer([GAP9_PerturbRademacherMapper]),
+    'PerturbTriangle': 
+        PerturbTriangleLayer([GAP9_PerturbTriangleMapper]),
 }
 
 
diff --git a/Deeploy/Targets/GAP9/Tiler.py b/Deeploy/Targets/GAP9/Tiler.py
index fefe12b6d7..a69f0933ba 100644
--- a/Deeploy/Targets/GAP9/Tiler.py
+++ b/Deeploy/Targets/GAP9/Tiler.py
@@ -17,7 +17,8 @@
     GAP9RQAddBindings, GAP9RQSBindings, GAP9RQSConv2DBindings, GAP9RQSDWConv2DBindings, GAP9RQSGEMMBindings, \
     GAP9RQSiHardswishBindings, GAP9RQSMatrixVecBindings, GAP9RQSTallGEMMBindings, GAP9SGDBindings, \
     GAP9SoftmaxBindings, GAP9SoftmaxCrossEntropyLossBindings, GAP9SoftmaxCrossEntropyLossGradBindings, \
-    GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings
+    GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings, GAP9PerturbNormalBindings, \
+    GAP9PerturbUniformBindings, GAP9PerturbEggrollBindings, GAP9PerturbRademacherBindings, GAP9PerturbTriangleBindings
 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint
@@ -27,6 +28,7 @@
 from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
+from Deeploy.Targets.Generic.TileConstraints.EggrollTileConstraint import EggrollTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
@@ -142,3 +144,18 @@
 
 GAP9SGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SGDBindings,
                                                      tileConstraint = SGDTileConstraint())
+
+GAP9PerturbNormalTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = GAP9PerturbNormalBindings, tileConstraint = UnaryTileConstraint())
+
+GAP9PerturbUniformTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = GAP9PerturbUniformBindings, tileConstraint = UnaryTileConstraint())
+
+GAP9PerturbEggrollTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = GAP9PerturbEggrollBindings, tileConstraint = EggrollTileConstraint())
+
+GAP9PerturbRademacherTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = GAP9PerturbRademacherBindings, tileConstraint = UnaryTileConstraint())
+
+GAP9PerturbTriangleTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = GAP9PerturbTriangleBindings, tileConstraint = UnaryTileConstraint())
diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py
index cc733937cc..10ba7a8358 100644
--- a/Deeploy/Targets/Generic/Layers.py
+++ b/Deeploy/Targets/Generic/Layers.py
@@ -709,3 +709,40 @@ def computeOps(self):
             numPx = opRep['dim_im_out_x']
 
         return numPx * opsPerPx
+
+
+class PerturbNormalLayer(ONNXLayer):
+    """Layer for PerturbNormal nodes; the constructor is inherited from ONNXLayer."""
+
+    def computeOps(self):
+        # The op count is the 'size' entry stored by this layer's mapper's parser.
+        return self.mapper.parser.operatorRepresentation['size']
+
+class PerturbUniformLayer(ONNXLayer):
+    """Layer for PerturbUniform nodes; the constructor is inherited from ONNXLayer."""
+
+    def computeOps(self):
+        # The op count is the 'size' entry stored by this layer's mapper's parser.
+        return self.mapper.parser.operatorRepresentation['size']
+
+class PerturbEggrollLayer(ONNXLayer):
+    """Layer for PerturbEggroll nodes; the constructor is inherited from ONNXLayer."""
+
+    def computeOps(self):
+        # The op count is the 'size' entry stored by this layer's mapper's parser.
+        return self.mapper.parser.operatorRepresentation['size']
+
+class PerturbRademacherLayer(ONNXLayer):
+    """Layer for PerturbRademacher nodes; the constructor is inherited from ONNXLayer."""
+
+    def computeOps(self):
+        # The op count is the 'size' entry stored by this layer's mapper's parser.
+        return self.mapper.parser.operatorRepresentation['size']
+
+class PerturbTriangleLayer(ONNXLayer):
+    """Layer for PerturbTriangle nodes; the constructor is inherited from ONNXLayer."""
+
+    def computeOps(self):
+        # The op count is the 'size' entry stored by this layer's mapper's parser.
+        return self.mapper.parser.operatorRepresentation['size']
+
diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py
index cf1ba776bd..bcbdf1a312 100644
--- a/Deeploy/Targets/Generic/Parsers.py
+++ b/Deeploy/Targets/Generic/Parsers.py
@@ -1291,7 +1291,10 @@ def parseNode(self, node: gs.Node) -> (bool):
 
         if ret:
             if 'kernel_shape' not in node.attrs:
-                node.attrs['kernel_shape'] = node.inputs[1].shape[-2:]
+                if self.operatorRepresentation['channels_first']:
+                    node.attrs['kernel_shape'] = node.inputs[1].shape[-2:]
+                else:
+                    node.attrs['kernel_shape'] = node.inputs[1].shape[1:3]
             self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape']
             self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0])
             self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1])
@@ -2882,3 +2885,167 @@ def parseNodeCtxt(self,
         self.operatorRepresentation['size'] = int(np.prod(data_in.shape))
 
         return ctxt, True
+
+class PerturbNormalParser(NodeParser):
+    """Parser for PerturbNormal nodes: one input, one output, attributes 'seed', 'eps' and 'idx'."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        required_attrs = ('seed', 'eps', 'idx')  # 'idx' is read by parseNodeCtxt, so check it too
+        ret = all([len(node.inputs) == 1,
+                   len(node.outputs) == 1,
+                   all(attr in node.attrs for attr in required_attrs)])
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        # The shape may be a bare int; wrap it in a 1-tuple (tuple(int) would raise
+        # TypeError) so np.prod always receives a sequence.
+        input_shape = (data_in.shape,) if isinstance(data_in.shape, int) else data_in.shape
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['seed'] = node.attrs['seed']
+        self.operatorRepresentation['size'] = int(np.prod(input_shape))
+        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
+        self.operatorRepresentation['eps'] = node.attrs['eps']
+        return ctxt, True
+
+class PerturbUniformParser(NodeParser):
+    """Parser for PerturbUniform nodes: one input, one output, attributes 'low', 'high', 'seed', 'eps', 'idx'."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        # Require every attribute parseNodeCtxt reads, so a malformed node is
+        # rejected here instead of raising KeyError during context parsing.
+        required_attrs = ('low', 'high', 'seed', 'eps', 'idx')
+        ret = all([len(node.inputs) == 1,
+                   len(node.outputs) == 1,
+                   all(attr in node.attrs for attr in required_attrs)])
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        # The shape may be a bare int; wrap it in a 1-tuple (tuple(int) would raise
+        # TypeError) so np.prod always receives a sequence.
+        input_shape = (data_in.shape,) if isinstance(data_in.shape, int) else data_in.shape
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['seed'] = node.attrs['seed']
+        self.operatorRepresentation['size'] = int(np.prod(input_shape))
+        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
+        self.operatorRepresentation['eps'] = node.attrs['eps']
+        self.operatorRepresentation['low'] = float(node.attrs['low'])
+        self.operatorRepresentation['high'] = float(node.attrs['high'])
+
+        return ctxt, True
+    
+class PerturbEggrollParser(NodeParser):
+    """Parser for PerturbEggroll nodes; the input's `values` must hold two entries, the second being 1."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        required_attrs = ('seed', 'idx', 'eps')
+        ret = all([len(node.inputs) == 1,
+                   len(node.outputs) == 1,
+                   all(attr in node.attrs for attr in required_attrs)])
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        shape_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        # Validate the buffer's values before deriving 'size' from them.
+        assert len(shape_in.values) == 2, f"Expected input to be 2D, got {len(shape_in.values)}D"
+        assert shape_in.values[1] == 1, f"Expected second dimension of input to be 1, got {shape_in.values[1]}"
+        self.operatorRepresentation['shape_in'] = shape_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['seed'] = node.attrs['seed']
+        self.operatorRepresentation['eps'] = node.attrs['eps']
+        self.operatorRepresentation['size'] = int(shape_in.values[0])
+        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
+        return ctxt, True
+    
+class PerturbRademacherParser(NodeParser):
+    """Parser for PerturbRademacher nodes: one input, one output, attributes 'seed', 'eps' and 'idx'."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        required_attrs = ('seed', 'eps', 'idx')  # 'idx' is read by parseNodeCtxt, so check it too
+        ret = all([len(node.inputs) == 1,
+                   len(node.outputs) == 1,
+                   all(attr in node.attrs for attr in required_attrs)])
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        # The shape may be a bare int; wrap it in a 1-tuple (tuple(int) would raise
+        # TypeError) so np.prod always receives a sequence.
+        input_shape = (data_in.shape,) if isinstance(data_in.shape, int) else data_in.shape
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['seed'] = node.attrs['seed']
+        self.operatorRepresentation['size'] = int(np.prod(input_shape))
+        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
+        self.operatorRepresentation['eps'] = node.attrs['eps']
+        return ctxt, True
+    
+    
+class PerturbTriangleParser(NodeParser):
+    """Parser for PerturbTriangle nodes: one input, one output, attributes 'low', 'high', 'seed', 'eps', 'idx'."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        required_attrs = ('low', 'high', 'seed', 'eps', 'idx')  # everything parseNodeCtxt reads
+        ret = all([len(node.inputs) == 1,
+                   len(node.outputs) == 1,
+                   all(attr in node.attrs for attr in required_attrs)])
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        # The shape may be a bare int; wrap it in a 1-tuple (tuple(int) would raise
+        # TypeError) so np.prod always receives a sequence.
+        input_shape = (data_in.shape,) if isinstance(data_in.shape, int) else data_in.shape
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['seed'] = node.attrs['seed']
+        self.operatorRepresentation['size'] = int(np.prod(input_shape))
+        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
+        self.operatorRepresentation['eps'] = node.attrs['eps']
+        self.operatorRepresentation['low'] = float(node.attrs['low'])
+        self.operatorRepresentation['high'] = float(node.attrs['high'])
+
+        return ctxt, True
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/TileConstraints/EggrollTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/EggrollTileConstraint.py
new file mode 100644
index 0000000000..23b5af1d9b
--- /dev/null
+++ b/Deeploy/Targets/Generic/TileConstraints/EggrollTileConstraint.py
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple, Union
+from ortools.constraint_solver.pywrapcp import IntVar
+
+import numpy as np
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint32_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme
+
+
+class EggrollTileConstraint(TileConstraint):
+    """Tile constraint for the PerturbEggroll operator.
+
+    The output-load schedule lists 'data_out' tiles only; 'shape_in' supplies the integer upper bounds.
+    """
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Bound the first two 'data_out' tile dimensions by the two values held in 'shape_in'."""
+
+        shapeName = parseDict['shape_in']
+        shapeBuffer = ctxt.lookup(shapeName)
+        outputName = parseDict['data_out']
+        outputBuffer = ctxt.lookup(outputName)
+        # The bounds are plain integers read from the buffer, not solver variables.
+        upperBounds = (int(shapeBuffer.values[0]), int(shapeBuffer.values[1]))
+
+        tilerModel.addTensorDimToModel(ctxt, shapeName)
+        tilerModel.addTensorDimToModel(ctxt, outputName)
+
+        for dimIdx in range(len(outputBuffer.shape)):
+            dimVar = tilerModel.getTensorDimVar(tensorName = outputName, dimIdx = dimIdx)
+            # Dimensions beyond the second get no constraint.
+            if dimIdx < len(upperBounds):
+                tilerModel.addConstraint(dimVar <= upperBounds[dimIdx])
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+        """Return a copy of parseDict extended with the first two dimension variables of 'data_out'."""
+
+        outputBuffer = ctxt.lookup(parseDict['data_out'])
+
+        symbolicParseDict = parseDict.copy()
+        # NOTE: the keys say "input", but the variables are looked up on the output buffer.
+        symbolicParseDict['inputDimVar0'] = tilerModel.getTensorDimVar(outputBuffer.name, 0)
+        symbolicParseDict['inputDimVar1'] = tilerModel.getTensorDimVar(outputBuffer.name, 1)
+        return symbolicParseDict
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build the per-tile variable replacements and a schedule that lists only 'data_out' tiles."""
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['shape_in', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        # One entry per output tile: its first two extents and its element count.
+        replacements = {
+            "inputDimVar0": [cube.dims[0] for cube in outputCubes],
+            "inputDimVar1": [cube.dims[1] for cube in outputCubes],
+            "size": [np.prod(cube.dims) for cube in outputCubes],
+        }
+        replacementTypes = {
+            "inputDimVar0": PointerClass(uint32_t),
+            "inputDimVar1": PointerClass(uint32_t),
+            "size": PointerClass(uint32_t),
+        }
+
+        # The input schedule stays empty; each output step lists one 'data_out' tile.
+        inputLoadSchedule = []
+        outputLoadSchedule = [{"data_out": cube} for cube in outputCubes]
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
index 091cb55a41..525d8093bf 100644
--- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
+++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
@@ -35,8 +35,11 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
 
             tilerModel.addTensorDimToModel(ctxt, tensorName)
 
-            for idx, shapeDim in enumerate(_buffer.shape):
-                tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim)
+            if isinstance(_buffer.shape, int):
+                tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = 0) == _buffer.shape)
+            else:
+                for idx, shapeDim in enumerate(_buffer.shape):
+                    tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim)
 
         return tilerModel
 
diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py
index c2c8d436f8..6d14a37c2b 100644
--- a/Deeploy/Targets/Generic/TypeCheckers.py
+++ b/Deeploy/Targets/Generic/TypeCheckers.py
@@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer],
     def _inferSignedness(self, inputs: List[VariableBuffer],
                          operatorRepresentation: OperatorRepresentation) -> List[bool]:
         return [True]
+
+class PerturbZOChecker(SignPropTypeChecker):
+    """Type checker for Perturb* nodes: forwards the first input's nLevels and marks the output signed."""
+    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
+        super().__init__(input_types, output_types)
+
+    def _inferNumLevels(self, inputs: List[VariableBuffer],
+                        operatorRepresentation: OperatorRepresentation) -> List[int]:
+        return [inputs[0].nLevels]
+
+    def _inferSignedness(self, inputs: List[VariableBuffer],
+                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
+        return [True]
+    
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index e1a9ed5932..17ecde096d 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -19,7 +19,7 @@
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
     GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
     QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
-    SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker
+    SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, PerturbZOChecker
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
@@ -32,7 +32,8 @@
     FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \
     MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \
     RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \
-    TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
+    TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate, FloatPerturbNormalTemplate, \
+    FloatPerturbUniformTemplate, FloatPerturbEggrollTemplate, FloatPerturbRademacherTemplate, FloatPerturbTriangleTemplate
 from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
     PULPRequantShiftChecker
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \
@@ -368,6 +369,9 @@
 PULPConcatBindings = [
     NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                 ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(ConcatChecker([PointerClass(float_type), PointerClass(float_type)], [PointerClass(float_type)]),
+                ConcatTemplate.referenceTemplate, ClusterTransformer) for float_type in FloatDataTypes
 ]
 
 PULPiRMSNormBindings = [
@@ -448,3 +452,33 @@
     NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
                 ForkTransformer),
 ]
+
+PULPPerturbNormalBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbNormalTemplate.referenceTemplate,
+                ForkTransformer),
+]
+
+PULPPerturbUniformBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbUniformTemplate.referenceTemplate,
+                ForkTransformer),
+]
+
+PULPPerturbEggrollBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(int32_t)], [PointerClass(float32_t)]),
+                FloatPerturbEggrollTemplate.referenceTemplate,
+                ForkTransformer),
+]
+
+PULPPerturbRademacherBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbRademacherTemplate.referenceTemplate,
+                ForkTransformer),
+]
+
+PULPPerturbTriangleBindings = [
+    NodeBinding(PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatPerturbTriangleTemplate.referenceTemplate,
+                ForkTransformer),
+]
diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py
index 5c5951eaba..99e45cefb3 100644
--- a/Deeploy/Targets/PULPOpen/Parsers.py
+++ b/Deeploy/Targets/PULPOpen/Parsers.py
@@ -75,10 +75,10 @@ def parseNode(self, node: gs.Node) -> (bool):
                 # Current PULP kernel only supports grouping of 1
                 self.operatorRepresentation['group'] == 1,
 
-                # Make sure padding is square
-                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
-                self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
-                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
+                # Make sure padding is symmetric (left==right, top==bottom)
+                # but top/bottom can differ from left/right
+                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],  # top == bottom
+                self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],  # left == right
 
                 # Check number of inputs
                 # 2 inputs if no bias, 3 if layer has bias
@@ -133,10 +133,10 @@ def parseNode(self, node: gs.Node) -> (bool):
         if wellFormed:
             # Check if the node is a depthwise convolution
             ret = all([
-                # Make sure padding is square
-                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],
-                self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],
-                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1],
+                # Make sure padding is symmetric (left==right, top==bottom)
+                # but top/bottom can differ from left/right
+                self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2],  # top == bottom
+                self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3],  # left == right
 
                 # Check number of inputs
                 # 2 inputs if no bias, 3 if layer has bias
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index d45dc00f9c..4b6a8d6a3d 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -17,13 +17,15 @@
     GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \
     ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \
     RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \
-    SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer
+    SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer, PerturbNormalLayer, \
+    PerturbUniformLayer, PerturbEggrollLayer, PerturbRademacherLayer, PerturbTriangleLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \
     GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, \
     MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, \
     RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \
     SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \
-    TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser
+    TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser, \
+    PerturbNormalParser, PerturbUniformParser, PerturbEggrollParser, PerturbRademacherParser, PerturbTriangleParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \
     MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \
@@ -47,7 +49,9 @@
     PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \
     PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
     PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \
-    PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings
+    PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, \
+    PULPPerturbNormalTilingReadyBindings, PULPPerturbUniformTilingReadyBindings, \
+    PULPPerturbEggrollTilingReadyBindings, PULPPerturbRademacherTilingReadyBindings, PULPPerturbTriangleTilingReadyBindings
 from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
     PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass
 
@@ -90,6 +94,13 @@
 SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings)
 SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings)
 Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), PULPSoftmaxTilingReadyBindings)
+PerturbNormalMapper = NodeMapper(PerturbNormalParser(), PULPPerturbNormalTilingReadyBindings)
+PerturbUniformMapper = NodeMapper(PerturbUniformParser(), PULPPerturbUniformTilingReadyBindings)
+PerturbEggrollMapper = NodeMapper(PerturbEggrollParser(), PULPPerturbEggrollTilingReadyBindings)
+PerturbRademacherMapper = NodeMapper(PerturbRademacherParser(), PULPPerturbRademacherTilingReadyBindings)
+PerturbTriangleMapper = NodeMapper(PerturbTriangleParser(), PULPPerturbTriangleTilingReadyBindings)
+
+
 
 ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings)
 
@@ -148,7 +159,12 @@
     'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]),
     'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]),
     'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]),
-    'SGD': SGDLayer([SGDMapper])
+    'SGD': SGDLayer([SGDMapper]),
+    'PerturbNormal': PerturbNormalLayer([PerturbNormalMapper]),
+    'PerturbUniform': PerturbUniformLayer([PerturbUniformMapper]),
+    'PerturbEggroll': PerturbEggrollLayer([PerturbEggrollMapper]),
+    'PerturbRademacher': PerturbRademacherLayer([PerturbRademacherMapper]),
+    'PerturbTriangle': PerturbTriangleLayer([PerturbTriangleMapper]),
 }
 
 
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py
new file mode 100644
index 0000000000..aea54d8e11
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatPerturbEggrollTemplate(NodeTemplate):
+    """Eggroll perturbation template; publishes ``nodeIdx`` as ``node_id`` for the seed expression."""
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx']
+        return ctxt, operatorRepresentation, []
+
+
+# TODO: No loop unrolling optimization yet
+# Only ${data_out} is handed to the kernel; this template references no input buffer.
+# NOTE(review): for node_id == 0 both node_id terms vanish and every core gets chunk_seed == seed;
+# confirm that nodeIdx cannot be 0 or that identical per-core seeds are acceptable.
+referenceTemplate = _FloatPerturbEggrollTemplate("""
+// Perturb Eggroll (Name: ${nodeName}, Op: ${nodeOp})
+uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id();
+uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES);
+uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size});
+uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size});
+uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start;
+
+uint32_t chunk_seed = seed +  (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729);
+
+GenEggrollPerturbation((float32_t *) &${data_out}[${nodeName}_chunk_start],
+                        chunk_seed,
+                        ${nodeName}_local_size);
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py
new file mode 100644
index 0000000000..eb89d487aa
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatPerturbNormalTemplate(NodeTemplate):
+    """Node template for the Gaussian perturbation kernel.
+
+    ``alignToContext`` publishes ``nodeIdx`` as ``node_id`` so the template can derive a per-node seed.
+    """
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx']
+        return ctxt, operatorRepresentation, []
+
+
+# TODO: No loop unrolling optimization yet
+# The rendered text is C: the last call argument must not be followed by a comma.
+# NOTE(review): for node_id == 0 the seed expression reduces to `seed` on every core; confirm this is intended.
+referenceTemplate = _FloatPerturbNormalTemplate("""
+// PerturbNormal (Name: ${nodeName}, Op: ${nodeOp})
+uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id();
+uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES);
+uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size});
+uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size});
+uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start;
+
+// pick large enough stride to minimize correlation between nodes.
+uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729);
+ApplyGaussianPerturbation(
+    (const float32_t *) &${data_in}[${nodeName}_chunk_start],
+    (float32_t *) &${data_out}[${nodeName}_chunk_start],
+    chunk_seed,
+    perturbation_sign, // globally defined in DeeployTest main
+    ${nodeName}_local_size,
+    ${eps}f
+);
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py
new file mode 100644
index 0000000000..b550539b88
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatPerturbRademacherTemplate(NodeTemplate):
+    """Rademacher perturbation template; publishes ``nodeIdx`` as ``node_id`` for the seed expression."""
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx']
+        return ctxt, operatorRepresentation, []
+
+
+# NOTE(review): chunk_start is 0 on core 0, so i*chunk_start vanishes and chunk_seed is the same for every i there.
+# NOTE(review): each call is handed a pointer to element i together with a count of local_size; confirm that the
+# kernel touches a single element per call, otherwise it runs past chunk_stop.
+referenceTemplate = _FloatPerturbRademacherTemplate("""
+// PerturbRademacher (Name: ${nodeName}, Op: ${nodeOp})
+uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id();
+uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES);
+uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size});
+uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size});
+uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start;
+
+uint32_t i = ${nodeName}_chunk_start;
+for (; i < ${nodeName}_chunk_stop; i++) {
+    // pick large enough stride to minimize correlation between nodes.
+    uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729);
+    ApplyRademacherPerturbation((const float32_t *)  &${data_in}[i],
+                                (float32_t *) &${data_out}[i],
+                                chunk_seed,
+                                perturbation_sign, // globally defined in DeedeployTest main
+                                ${nodeName}_local_size,
+                                ${eps}f);
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py
new file mode 100644
index 0000000000..416a7b0a40
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatPerturbTriangleTemplate(NodeTemplate):
+    """Triangle perturbation template; publishes ``nodeIdx`` as ``node_id`` for the seed expression."""
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx']
+        return ctxt, operatorRepresentation, []
+
+
+# TODO: No loop unrolling optimization yet
+# NOTE(review): chunk_start is 0 on core 0, so i*chunk_start vanishes and chunk_seed is the same for every i there.
+# NOTE(review): each call gets a pointer to element i plus a count of local_size; confirm the kernel's contract.
+referenceTemplate = _FloatPerturbTriangleTemplate("""
+// PerturbTriangle (Name: ${nodeName}, Op: ${nodeOp})
+uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id();
+uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES);
+uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size});
+uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size});
+uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start;
+
+uint32_t i = ${nodeName}_chunk_start;
+for (; i < ${nodeName}_chunk_stop; i++) {
+    // pick large enough stride to minimize correlation between nodes.
+    uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729);
+    ApplyTrianglePerturbation((const float32_t *)  &${data_in}[i],
+                                (float32_t *) &${data_out}[i],
+                                chunk_seed,
+                                perturbation_sign, // globally defined in DeedeployTest main
+                                ${nodeName}_local_size,
+                                ${eps}f);
+}
+""")
+
+# Weight-update variant; `loss` and `lr` are not template fields and must exist in the generated C scope.
+updateTemplate = _FloatPerturbTriangleTemplate("""
+// UpdateTriangle (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+    UpdateWeightsTriangle((float32_t *)${data_in},
+                                loss,
+                                seed + ${node_id},
+                                ${eps}f,
+                                lr, // globally defined
+                                ${size});
+END_SINGLE_CORE
+""")
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py
new file mode 100644
index 0000000000..e81147751b
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatPerturbUniformTemplate(NodeTemplate):
+    """Uniform perturbation template; publishes ``nodeIdx`` as ``node_id`` for the seed expression."""
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx']
+        return ctxt, operatorRepresentation, []
+
+
+# Each core perturbs the elements in [chunk_start, chunk_stop) with a single kernel call.
+# NOTE(review): for node_id == 0 both node_id terms of the seed expression vanish, so every core uses
+# chunk_seed == seed. Confirm that nodeIdx cannot be 0 or that identical per-core seeds are acceptable.
+referenceTemplate = _FloatPerturbUniformTemplate("""
+// PerturbUniform (Name: ${nodeName}, Op: ${nodeOp})
+uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id();
+uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES);
+uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
+uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size});
+uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size});
+uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start;
+
+// pick large enough stride to minimize correlation between nodes.
+uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729);
+ApplyUniformPerturbation((const float32_t *)  &${data_in}[${nodeName}_chunk_start],
+                            (float32_t *) &${data_out}[${nodeName}_chunk_start],
+                            chunk_seed,
+                            perturbation_sign, // globally defined in DeedeployTest main
+                            ${nodeName}_local_size,
+                            ${eps}f);
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
index 3d7d11f343..8431b6cc89 100644
--- a/Deeploy/Targets/PULPOpen/Tiler.py
+++ b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -13,6 +13,7 @@
 from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
+from Deeploy.Targets.Generic.TileConstraints.EggrollTileConstraint import EggrollTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
 from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \
     PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \
@@ -22,7 +23,8 @@
     PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \
     PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \
     PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \
-    PULPTransposeBindings, PULPUniformRQSBindings
+    PULPTransposeBindings, PULPUniformRQSBindings, PULPPerturbNormalBindings, PULPPerturbUniformBindings, \
+    PULPPerturbEggrollBindings, PULPPerturbRademacherBindings, PULPPerturbTriangleBindings
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \
     RQDWConv2DTileConstraint
@@ -153,3 +155,18 @@
 
 PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings,
                                                             tileConstraint = ReduceMeanTileConstraint())
+
+PULPPerturbNormalTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbNormalBindings,
+                                                               tileConstraint = UnaryTileConstraint())
+
+PULPPerturbUniformTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbUniformBindings,
+                                                                tileConstraint = UnaryTileConstraint())
+
+PULPPerturbEggrollTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbEggrollBindings,
+                                                                tileConstraint = EggrollTileConstraint())
+
+PULPPerturbRademacherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbRademacherBindings,
+                                                                   tileConstraint = UnaryTileConstraint())
+
+PULPPerturbTriangleTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbTriangleBindings,
+                                                                 tileConstraint = UnaryTileConstraint())
diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py
index 5b067b2ce9..1b92df1752 100644
--- a/Deeploy/TilingExtension/TileConstraint.py
+++ b/Deeploy/TilingExtension/TileConstraint.py
@@ -146,6 +146,8 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List
             targetIdx = 1
 
         fullShape = ctxt.lookup(outVar).shape
+        if isinstance(fullShape, int):
+            fullShape = (fullShape,)
         initialOffset = (0,) * len(fullShape)
         outputCubes = [
             AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)),
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz
new file mode 100644
index 0000000000..b58ac20c7b
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx
new file mode 100644
index 0000000000..38798357d4
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz
new file mode 100644
index 0000000000..5284177d8e
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz
new file mode 100644
index 0000000000..847536024f
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx
new file mode 100644
index 0000000000..5dcf8bae70
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz
new file mode 100644
index 0000000000..a780bf64e6
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz
new file mode 100644
index 0000000000..4a1e9c269c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx
new file mode 100644
index 0000000000..52f0ccfd9c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz
new file mode 100644
index 0000000000..8dc3b1d0cf
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz
new file mode 100644
index 0000000000..d77ca34b35
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx
new file mode 100644
index 0000000000..23990e812d
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz
new file mode 100644
index 0000000000..9113de38f2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz
new file mode 100644
index 0000000000..a7e5c1cfa0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx
new file mode 100644
index 0000000000..42c66ac0c8
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz
new file mode 100644
index 0000000000..56be194e61
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz
new file mode 100644
index 0000000000..8dcd54a7dd
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx
new file mode 100644
index 0000000000..26bedabcb4
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz
new file mode 100644
index 0000000000..e768b0ce4d
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz
new file mode 100644
index 0000000000..8dcd54a7dd
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN/network.onnx b/DeeployTest/Tests/Models/Lite-CNN/network.onnx
new file mode 100644
index 0000000000..2a39932575
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Lite-CNN/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz
new file mode 100644
index 0000000000..e768b0ce4d
Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz b/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz
new file mode 100644
index 0000000000..eccd659d67
Binary files /dev/null and b/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx b/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx
new file mode 100644
index 0000000000..ddf5e6285b
Binary files /dev/null and b/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx differ
diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz b/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz
new file mode 100644
index 0000000000..2bd8307dbd
Binary files /dev/null and b/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz
new file mode 100644
index 0000000000..d55dda479f
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx
new file mode 100644
index 0000000000..c5aefc7f47
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx differ
diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz
new file mode 100644
index 0000000000..7b64cc07d8
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/SleepConVit/inputs.npz b/DeeployTest/Tests/Models/SleepConVit/inputs.npz
new file mode 100644
index 0000000000..ee174fcab4
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/SleepConVit/network.onnx b/DeeployTest/Tests/Models/SleepConVit/network.onnx
new file mode 100644
index 0000000000..c51390febe
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/network.onnx differ
diff --git a/DeeployTest/Tests/Models/SleepConVit/outputs.npz b/DeeployTest/Tests/Models/SleepConVit/outputs.npz
new file mode 100644
index 0000000000..8babb4ed7a
Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/outputs.npz differ
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
index 01216984af..cbaeda7cae 100644
--- a/DeeployTest/testMVP.py
+++ b/DeeployTest/testMVP.py
@@ -212,6 +212,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
         help =
         "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1."
     )
+    parser.add_argument('--run_mode', type = str, default = 'inference',
+                        help = 'Run mode of the network. Options are: inference, mezo_training.')
 
     parser.set_defaults(shouldFail = False)
     args = parser.parse_args()
diff --git a/DeeployTest/testRunner_tiled_siracusa_mezo.py b/DeeployTest/testRunner_tiled_siracusa_mezo.py
new file mode 100644
index 0000000000..9b85b7f491
--- /dev/null
+++ b/DeeployTest/testRunner_tiled_siracusa_mezo.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
+
+if __name__ == "__main__":
+    # Tiled Siracusa run on the gvsoc simulator; code generation is invoked with --run_mode mezo_training.
+    parser = TestRunnerArgumentParser(
+        tiling_arguments = True, description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling).")
+    parser.add_argument('--cores',
+                        metavar = '<cores>',
+                        dest = 'cores',
+                        type = int,
+                        default = 8,
+                        help = 'Set number of cluster cores')
+    args = parser.parse_args()
+    runner = TestRunner(platform = "Siracusa",
+                        simulator = "gvsoc",
+                        tiling = True,
+                        argument_parser = parser,
+                        gen_args = "--run_mode mezo_training")
+    runner.cmake_args += f" -D NUM_CORES={args.cores}"
+    runner.run()
diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
index f6e8308c97..95de0c7a5b 100644
--- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
+++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
@@ -34,6 +34,7 @@
 #include "kernel/UniformRequantShift.h"
 #include "kernel/gemv.h"
 #include "kernel/iRMSnorm.h"
+#include "kernel/RandomNoise.h"
 
 #define LOG2(x) (__builtin_pulp_fl1(x))
 
diff --git a/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h
new file mode 100644
index 0000000000..f8db0d6d59
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h
@@ -0,0 +1,114 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+ 
+#ifndef __DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_
+#define __DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_
+
+#include "DeeployPULPMath.h"
+
+
+#define PI_F 3.14159265358979323846f // pi as a single-precision literal
+
+#define ZIGGURAT_TABLE_SIZE 128 // entries per lookup table; the sampler masks with (size - 1)
+#define ZIGGURAT_R 3.442619855899 // start of the tail region; NOTE(review): double literal (no 'f' suffix)
+#define ZIGGURAT_V 9.91256303526217e-3 // constant `vn` of the table recurrence; NOTE(review): double literal
+// NOTE(review): `static` objects in a header give every translation unit that includes it a private copy.
+static uint32_t kn[ZIGGURAT_TABLE_SIZE]; // integer thresholds of the quick-accept test
+static float32_t wn[ZIGGURAT_TABLE_SIZE]; // factors that scale the random integer to a sample
+static float32_t fn[ZIGGURAT_TABLE_SIZE]; // exp(-x^2 / 2) at the layer edges
+static int32_t ziggurat_tables_initialized = 0; // set to 1 once build_ziggurat_tables() has run
+
+
+typedef struct { // generator state for drawing +/-1 samples one bit at a time
+    uint32_t state; // xorshift32 state, advanced on every refill
+    uint32_t bits; // buffered random word, consumed from the least significant bit
+    uint32_t bitpos; // bits already consumed from `bits`; a value >= 32 triggers a refill
+} RademacherRNG;
+
+// Sample from uniform distribution U[-0.5, 0.5]
+float32_t UniformSample(uint32_t *state);
+// Sample from triangular distribution Tr[-1, 1]
+float32_t TriangularSample(uint32_t *state);
+float32_t GaussianSample(uint32_t *state);
+float32_t RademacherSample(RademacherRNG *rng);
+uint32_t Xorshift32(uint32_t state);
+void build_ziggurat_tables();
+float32_t GaussianZigguratSample(uint32_t *state);
+
+void RademacherRNG_init(RademacherRNG *rng, uint32_t seed);
+
+// Applies triangular perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm.
+void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon);
+
+// Applies uniform perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm.
+void ApplyUniformPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon);
+                        
+// Applies Gaussian (Box-Muller) perturbation to the weights; samples already have unit variance, so no rescaling.
+void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon);
+                     
+// Applies Rademacher (+/-1) perturbation to the weights; samples already have unit variance, so no rescaling.
+void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon);
+
+
+// Updates the weights in place according to the MeZO update rule with triangular noise.
+// Only supports qMeZO with q = 1 for now.
+// void UpdateWeightsTriangle(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size);
+
+// // Updates the weights in place according to the MeZO update rule with uniform noise.
+// // Only supports qMeZO with q = 1 for now.
+// void UpdateWeightsUniform(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size);
+
+// void UpdateWeightsGaussian(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size);
+
+// void UpdateWeightsRademacher(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size);
+
+
+void GenEggrollPerturbation(float32_t *__restrict__ p_dest,
+                            uint32_t seed,
+                            uint32_t size,
+                            float32_t epsilon);
+
+#endif //__DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c
index a46f8ac6ae..02fd991674 100644
--- a/TargetLibraries/PULPOpen/src/Gemm.c
+++ b/TargetLibraries/PULPOpen/src/Gemm.c
@@ -6,6 +6,7 @@
 
 #include "DeeployPULPMath.h"
 #include "pmsis.h"
+// #include "perf_utils.h"
 
 void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
                                    const float32_t *__restrict__ pSrcB,
@@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
   int8_t core_id = pi_core_id();
   int8_t log2Core = LOG2(NUM_CORES);
 
+  //RW: Performance monitoring is currently disabled 
+  // perf_stats_t perf_start, perf_end, perf_total;
+
+  // // Initialize and start performance counters (only core 0)
+  // if (core_id == 0) {
+  //   perf_bench_init();
+  //   perf_bench_start();
+  //   perf_bench_read(&perf_start);
+  // }
+
   uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0);
   uint32_t M_start = MIN(core_id * M_chunk, M);
   uint32_t M_end = MIN(M_start + M_chunk, M);
@@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
       }
     }
   }
+
+  // RW: Stop performance counters and print results (only core 0)
+  // if (core_id == 0) {
+  //   perf_bench_stop();
+  //   perf_bench_read(&perf_end);
+  //   perf_bench_diff(&perf_total, &perf_end, &perf_start);
+
+  //   char label[100];
+  //   snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
+  //            M, N, O, transA, transB);
+  //   perf_bench_print(label, &perf_total);
+  // }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/RandomNoise.c b/TargetLibraries/PULPOpen/src/RandomNoise.c
new file mode 100644
index 0000000000..001d205af8
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/RandomNoise.c
@@ -0,0 +1,281 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+#include <math.h>
+
+// TODO: 1) loop unrolling for ILP perf
+// TODO: 2) Perturbation directly integrated in GEMM or Conv kernels.
+/* --------------------------- RNG ---------------------------------- */
+
+uint32_t Xorshift32(uint32_t state) {
+    uint32_t mixed = state ^ (state << 13); // one xorshift step with shifts (13, 17, 5); 0 maps to 0
+    mixed = mixed ^ (mixed >> 17);
+    mixed = mixed ^ (mixed << 5);
+    return mixed;
+}
+
+/* --------------------------- Samplers ---------------------------------- */
+
+float32_t TriangularSample(uint32_t *state) {
+    // Difference of two consecutive uniform draws in [0,1]; the result lies in [-1,1].
+    const uint32_t draw_a = Xorshift32(*state);
+    const uint32_t draw_b = Xorshift32(draw_a); // second draw continues the same stream
+    *state = draw_b;
+    const float32_t range = (float32_t)0xFFFFFFFF;
+    return (float32_t)draw_a / range - (float32_t)draw_b / range;
+}
+
+float32_t UniformSample(uint32_t *state) {
+    const uint32_t draw = Xorshift32(*state);
+    *state = draw; // advance the caller's generator by one step
+    return (float32_t)draw / (float32_t)0xFFFFFFFF - 0.5f; // shift [0,1] to [-0.5,0.5]
+}
+
+float32_t GaussianSample(uint32_t *state) {
+    // Box-Muller: two consecutive uniform draws give a radius and an angle.
+    const uint32_t draw_radius = Xorshift32(*state);
+    const uint32_t draw_angle = Xorshift32(draw_radius);
+    *state = draw_angle;
+    const float32_t u1 = (float32_t)draw_radius / (float32_t)0xFFFFFFFF; // 0 only if the state is 0
+    const float32_t u2 = (float32_t)draw_angle / (float32_t)0xFFFFFFFF;
+    return sqrtf(-2.0f * logf(u1)) * cosf(2.0f * (float32_t)PI_F * u2);
+}
+
+/* ---------------- Ziggurat method for Gaussian sampling ---------------- */
+// This implementation is adapted from the public domain Ziggurat algorithm
+// by Marsaglia and Tsang.
+
+void build_ziggurat_tables() {
+    if (ziggurat_tables_initialized) return;
+
+    // The sampler compares |hz| of a signed 32-bit hz (|hz| <= 2^31) against kn[] and scales
+    // hz by wn[], so the tables must be normalised by 2^31 rather than 2^32.
+    const float32_t m1 = 2147483648.0f;
+    float32_t dn = (float32_t) ZIGGURAT_R;
+    float32_t tn = dn;
+    float32_t vn = (float32_t) ZIGGURAT_V;
+    float32_t q = vn / expf(-0.5f * dn * dn);
+
+    kn[0] = (uint32_t)((dn / q) * m1);
+    kn[1] = 0;
+    wn[0] = q / m1;
+    wn[ZIGGURAT_TABLE_SIZE - 1] = dn / m1;
+    fn[0] = 1.0f;
+    fn[ZIGGURAT_TABLE_SIZE - 1] = expf(-0.5f * dn * dn);
+    // Fill the remaining entries from index TABLE_SIZE - 2 down to 1; tn holds the previous dn.
+    for (uint32_t i = ZIGGURAT_TABLE_SIZE - 2; i >= 1; i--) {
+        dn = sqrtf(-2.0f * logf(vn / dn + expf(-0.5f * dn * dn)));
+        kn[i + 1] = (uint32_t)((dn / tn) * m1);
+        tn = dn;
+        fn[i] = expf(-0.5f * dn * dn);
+        wn[i] = dn / m1;
+    }
+    ziggurat_tables_initialized = 1;
+}
+
+
+float32_t GaussianZigguratSample(uint32_t *state) {
+    if (!ziggurat_tables_initialized) {
+        build_ziggurat_tables();
+    }
+    // Keep the tail constant in single precision so no double arithmetic is pulled in.
+    const float32_t r = (float32_t)ZIGGURAT_R;
+    float32_t x, y;
+    for (;;) {
+        *state = Xorshift32(*state);
+        const int32_t hz = (int32_t)(*state);
+        const uint32_t iz = (uint32_t)hz & (ZIGGURAT_TABLE_SIZE - 1);
+        // |hz| in unsigned arithmetic: negating INT32_MIN as a signed value is undefined.
+        const uint32_t abs_hz = (hz < 0) ? (0u - (uint32_t)hz) : (uint32_t)hz;
+
+        // Quick acceptance path
+        if (abs_hz < kn[iz]) {
+            return (float32_t)hz * wn[iz];
+        }
+
+        // Handle the tail
+        if (iz == 0) {
+            do {
+                *state = Xorshift32(*state);
+                x = -logf((float32_t)(*state) / (float32_t)0xFFFFFFFF) / r;
+                *state = Xorshift32(*state);
+                y = -logf((float32_t)(*state) / (float32_t)0xFFFFFFFF);
+            } while (y + y < x * x);
+            return (hz > 0) ? r + x : -r - x;
+        }
+
+        // Slower rejection path: draw a fresh uniform; reusing the word behind hz would tie it to x.
+        x = (float32_t)hz * wn[iz];
+        *state = Xorshift32(*state);
+        if (fn[iz] + ((float32_t)(*state) / (float32_t)0xFFFFFFFF) * (fn[iz - 1] - fn[iz]) < expf(-0.5f * x * x)) {
+            return x;
+        }
+    }
+}
+
+void RademacherRNG_init(RademacherRNG *rng, uint32_t seed) {
+    // Start with an empty bit buffer: bitpos == 32 makes the first
+    // RademacherSample() call fetch a fresh 32-bit word before use.
+    *rng = (RademacherRNG){.state = seed, .bits = 0u, .bitpos = 32u};
+}
+
+float32_t RademacherSample(RademacherRNG *rng) {
+    // One xorshift word yields 32 samples; refill once all its bits are consumed.
+    if (rng->bitpos >= 32) {
+        rng->state = rng->bits = Xorshift32(rng->state);
+        rng->bitpos = 0;
+    }
+    const uint32_t lsb = rng->bits & 1u; // lowest unused bit selects the sign
+    rng->bits = rng->bits >> 1;
+    rng->bitpos = rng->bitpos + 1;
+    return lsb ? 1.0f : -1.0f;
+}
+
+/* ------------------------- Perturbation Functions -------------------------------- */
+
+void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon)
+{
+    // dest[i] = w[i] +/- epsilon * sqrt(6) * t_i; dir == 0 selects the negative direction.
+    const float32_t magnitude = epsilon * 2.44948974278f; // sqrt(6) rescales the sample to variance 1
+    const float32_t scale = (dir == 0) ? magnitude * -1.0f : magnitude;
+    uint32_t rng_state = (seed * 1664525u) + 1013904223u; // scramble the seed into the initial state
+    for (uint32_t idx = 0; idx < size; ++idx) {
+        const float32_t tr = TriangularSample(&rng_state);
+        pweights_dest[idx] = pweights[idx] + tr * scale;
+    }
+}
+
+void ApplyUniformPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon)
+{
+    // dest[i] = w[i] +/- epsilon * 2*sqrt(3) * u_i; dir == 0 selects the negative direction.
+    uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+    float32_t sqrt3 = 1.73205080757f;
+    float32_t scale = epsilon * sqrt3 * 2.0f; // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => Gaussian(0, 1) l2 norm.
+    if (dir == 0) {scale *= -1.0f;}
+    for (uint32_t i = 0; i < size; i++) {
+        float32_t u = UniformSample(&rng_state);
+        pweights_dest[i] = pweights[i] + u * scale;
+    }
+}
+
+void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon) {
+    // dest[i] = w[i] +/- epsilon * g_i; the Box-Muller samples need no extra rescaling.
+    const float32_t scale = (dir == 0) ? epsilon * -1.0f : epsilon;
+    uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+    for (uint32_t idx = 0; idx < size; ++idx) {
+        const float32_t noise = GaussianSample(&rng_state);
+        pweights_dest[idx] = pweights[idx] + noise * scale;
+    }
+}
+
+void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights,
+                            float32_t *__restrict__ pweights_dest,
+                            uint32_t seed,
+                            uint32_t dir,
+                            uint32_t size,
+                            float32_t epsilon) {
+    // dest[i] = w[i] +/- epsilon * s_i with s_i in {-1, +1}; dir == 0 selects the negative direction.
+    RademacherRNG rng = {.state = (seed * 1664525u) + 1013904223u, .bits = 0, .bitpos = 32};
+    const float32_t scale = (dir == 0) ? epsilon * -1.0f : epsilon;
+    for (uint32_t idx = 0; idx < size; ++idx) {
+        const float32_t sign = RademacherSample(&rng);
+        pweights_dest[idx] = pweights[idx] + sign * scale;
+    }
+}
+
+void GenEggrollPerturbation(float32_t *__restrict__ p_dest,
+                            uint32_t seed,
+                            uint32_t size,
+                            float32_t epsilon)
+{
+    // Fills p_dest[0..size) with uniform samples in [-0.5, 0.5] drawn from a
+    // xorshift32 stream derived from `seed`.
+    //
+    // `epsilon` is kept in the signature for compatibility with the existing
+    // codegen templates but is not applied here: the samples are written unscaled.
+    // NOTE(review): an earlier Rademacher (+/-1) variant was replaced by uniform
+    // noise; confirm which distribution and scaling the Eggroll templates expect.
+    (void)epsilon;
+
+    uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+    for (uint32_t i = 0; i < size; i++) {
+        p_dest[i] = UniformSample(&rng_state);
+    }
+}
+
+/* --------------------------- Update functions ---------------------------------- */
+
+// void UpdateWeightsTriangle(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     float32_t sqrt6 = 2.44948974278f;
+//     const float32_t scale = sqrt6; // sqrt(6): => Gaussian(0, 1) l2 norm.
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t tr = TriangularSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * tr * scale;
+//     }
+// }
+
+// void UpdateWeightsUniform(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     float32_t sqrt3 = 1.73205080757f;
+//     const float32_t scale = sqrt3 * 2.0f; // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => variance 1
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = UniformSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u * scale;
+//     }
+// }
+
+// void UpdateWeightsGaussian(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = GaussianSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u;
+//     }
+// }
+
+// void UpdateWeightsRademacher(float32_t *__restrict__ pweights,
+//                             float32_t loss,
+//                             uint32_t seed,
+//                             float32_t epsilon,
+//                             float32_t lr,
+//                             uint32_t size) {
+//     RademacherRNG rng_state = { (seed * 1664525u) + 1013904223u, 0, 32 };
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = RademacherSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u;
+//     }
+// }
\ No newline at end of file