diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py index aba6740d49..afd8b67032 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py @@ -245,7 +245,14 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f if node.op in ["Conv", "RequantizedConv"]: # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. shift] for tensor in node.inputs[1:]: - _transformLayoutConst(tensor, spatialDims, default_channels_first) + # Standard case: The weight is a direct constant input. + if isinstance(tensor, gs.Constant): + _transformLayoutConst(tensor, spatialDims, default_channels_first) + + # MeZO case: The weight is produced by a Perturb node. 
+ elif isinstance(tensor, gs.Variable): + permute_temp = _transformLayoutPermutation(len(tensor.shape), spatialDims, default_channels_first) + graph.nodes.append(_appendTranspose(tensor, node, permute_temp)) node.attrs["channels_first"] = default_channels_first diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index d9d768fabc..05f9ef0358 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -2884,10 +2884,11 @@ def generateIOBufferInitializationCode(self) -> str: callStack += "static const uint32_t " + self.ctxt._mangle("num_inputs") + f" = {len(inputs)};" callStack += "static const uint32_t " + self.ctxt._mangle("num_outputs") + f" = {len(outputs)};" - + callStack += "static const uint32_t seed = 12345;" # fixed seed for reproducibility + callStack += "static const uint32_t perturbation_sign = 1;" # fixed sign for reproducibility callStack += "extern void* " + self.ctxt._mangle("inputs") + f"[{len(inputs)}];" callStack += "extern void* " + self.ctxt._mangle("outputs") + f"[{len(outputs)}];" - + callStack += "static const uint32_t " + self.ctxt._mangle("inputs_bytes") + f"[{len(inputs)}] = " + "{" numBytes = [] @@ -3049,6 +3050,8 @@ def generateIncludeString(self) -> str: for engine in self.Platform.engines: for include in engine.includeList: includeStr += ["#include \"" + include + "\""] + if engine.name == "GAP9Cluster": + includeStr += ["#include \"kernel/RandomNoise.h\""] return ("\n").join(includeStr) def generateEngineInitializationCode(self) -> str: diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py index 0e7b052f46..042f3a38a1 100644 --- a/Deeploy/Targets/GAP9/Bindings.py +++ b/Deeploy/Targets/GAP9/Bindings.py @@ -26,7 +26,7 @@ from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, 
RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker + SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, PerturbZOChecker from Deeploy.Targets.PULPOpen.Bindings import ForkClosure, L3MemoryAwareFunctionCallClosure, \ MemoryAwareForkTransformer, MemoryAwareFunctionCallClosure, TilingCallClosure from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass @@ -39,7 +39,8 @@ FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ - iRMSNormTemplate, iSoftmaxTemplate + iRMSNormTemplate, iSoftmaxTemplate, FloatPerturbEggrollTemplate, FloatPerturbUniformTemplate, FloatPerturbNormalTemplate, \ + FloatPerturbRademacherTemplate, FloatPerturbTriangleTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -328,6 +329,9 @@ GAP9ConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) ] GAP9iRMSNormBindings = [ @@ -397,3 +401,33 @@ NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, GAP9Transformer), ] + +GAP9PerturbNormalBindings = [ + NodeBinding( + 
PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbNormalTemplate.referenceTemplate, + GAP9Transformer)] + +GAP9PerturbUniformBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbUniformTemplate.referenceTemplate, + GAP9Transformer)] + +GAP9PerturbEggrollBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), + FloatPerturbEggrollTemplate.referenceTemplate, + GAP9Transformer)] + +GAP9PerturbRademacherBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbRademacherTemplate.referenceTemplate, + GAP9Transformer)] + +GAP9PerturbTriangleBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbTriangleTemplate.referenceTemplate, + GAP9Transformer)] diff --git a/Deeploy/Targets/GAP9/Platform.py b/Deeploy/Targets/GAP9/Platform.py index d40c2c4440..28da0185db 100644 --- a/Deeploy/Targets/GAP9/Platform.py +++ b/Deeploy/Targets/GAP9/Platform.py @@ -22,20 +22,23 @@ GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \ GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \ GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \ - GAP9UniformRQSTilingReadyBindings + GAP9UniformRQSTilingReadyBindings, GAP9PerturbNormalTilingReadyBindings, GAP9PerturbUniformTilingReadyBindings, \ + GAP9PerturbEggrollTilingReadyBindings, GAP9PerturbRademacherTilingReadyBindings, GAP9PerturbTriangleTilingReadyBindings from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \ LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, 
PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ - TransposeLayer, iHardswishLayer, iRMSNormLayer + TransposeLayer, iHardswishLayer, iRMSNormLayer, PerturbEggrollLayer, PerturbNormalLayer, PerturbRademacherLayer,\ + PerturbTriangleLayer, PerturbUniformLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \ QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \ RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ - TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser + TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser, \ + PerturbEggrollParser, PerturbNormalParser, PerturbRademacherParser, PerturbTriangleParser, PerturbUniformParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings, PULPSliceBindings @@ -93,6 +96,11 @@ GAP9_QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +GAP9_PerturbNormalMapper = NodeMapper(PerturbNormalParser(), GAP9PerturbNormalTilingReadyBindings) +GAP9_PerturbUniformMapper = 
NodeMapper(PerturbUniformParser(), GAP9PerturbUniformTilingReadyBindings) +GAP9_PerturbEggrollMapper = NodeMapper(PerturbEggrollParser(), GAP9PerturbEggrollTilingReadyBindings) +GAP9_PerturbRademacherMapper = NodeMapper(PerturbRademacherParser(), GAP9PerturbRademacherTilingReadyBindings) +GAP9_PerturbTriangleMapper = NodeMapper(PerturbTriangleParser(), GAP9PerturbTriangleTilingReadyBindings) # GAP9-specific mapping using ClDma GAP9Mapping = { @@ -171,7 +179,17 @@ 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([GAP9_SoftmaxCrossEntropyLossGradMapper]), 'SGD': - SGDLayer([GAP9_SGDMapper]) + SGDLayer([GAP9_SGDMapper]), + 'PerturbNormal': + PerturbNormalLayer([GAP9_PerturbNormalMapper]), + 'PerturbUniform': + PerturbUniformLayer([GAP9_PerturbUniformMapper]), + 'PerturbEggroll': + PerturbEggrollLayer([GAP9_PerturbEggrollMapper]), + 'PerturbRademacher': + PerturbRademacherLayer([GAP9_PerturbRademacherMapper]), + 'PerturbTriangle': + PerturbTriangleLayer([GAP9_PerturbTriangleMapper]), } diff --git a/Deeploy/Targets/GAP9/Tiler.py b/Deeploy/Targets/GAP9/Tiler.py index fefe12b6d7..a69f0933ba 100644 --- a/Deeploy/Targets/GAP9/Tiler.py +++ b/Deeploy/Targets/GAP9/Tiler.py @@ -17,7 +17,8 @@ GAP9RQAddBindings, GAP9RQSBindings, GAP9RQSConv2DBindings, GAP9RQSDWConv2DBindings, GAP9RQSGEMMBindings, \ GAP9RQSiHardswishBindings, GAP9RQSMatrixVecBindings, GAP9RQSTallGEMMBindings, GAP9SGDBindings, \ GAP9SoftmaxBindings, GAP9SoftmaxCrossEntropyLossBindings, GAP9SoftmaxCrossEntropyLossGradBindings, \ - GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings + GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings, GAP9PerturbNormalBindings, \ + GAP9PerturbUniformBindings, GAP9PerturbEggrollBindings, GAP9PerturbRademacherBindings, GAP9PerturbTriangleBindings from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint from 
Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint @@ -27,6 +28,7 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.Generic.TileConstraints.EggrollTileConstraint import EggrollTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint @@ -142,3 +144,18 @@ GAP9SGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SGDBindings, tileConstraint = SGDTileConstraint()) + +GAP9PerturbNormalTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9PerturbNormalBindings, + tileConstraint = UnaryTileConstraint()) + +GAP9PerturbUniformTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9PerturbUniformBindings, + tileConstraint = UnaryTileConstraint()) + +GAP9PerturbEggrollTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9PerturbEggrollBindings, + tileConstraint = EggrollTileConstraint()) + +GAP9PerturbRademacherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9PerturbRademacherBindings, + tileConstraint = UnaryTileConstraint()) + +GAP9PerturbTriangleTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9PerturbTriangleBindings, + tileConstraint = UnaryTileConstraint()) diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..10ba7a8358 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,40 @@ def computeOps(self): numPx = 
class _PerturbLayer(ONNXLayer):
    """Common base for the Perturb* layers.

    All Perturb* layers share the same operation count, so the logic lives
    here once instead of being repeated in every subclass. The constructor is
    inherited unchanged from ``ONNXLayer`` (the original subclasses only
    forwarded ``maps`` to it).
    """

    def computeOps(self):
        """Return the ``size`` entry of the bound parser's operator representation."""
        return self.mapper.parser.operatorRepresentation['size']


class PerturbNormalLayer(_PerturbLayer):
    """Layer registered for ``PerturbNormal`` nodes."""


class PerturbUniformLayer(_PerturbLayer):
    """Layer registered for ``PerturbUniform`` nodes."""


class PerturbEggrollLayer(_PerturbLayer):
    """Layer registered for ``PerturbEggroll`` nodes."""


class PerturbRademacherLayer(_PerturbLayer):
    """Layer registered for ``PerturbRademacher`` nodes."""


class PerturbTriangleLayer(_PerturbLayer):
    """Layer registered for ``PerturbTriangle`` nodes."""
class _DensePerturbParser(NodeParser):
    """Shared parser logic for Perturb* nodes with one data input and one output.

    ``parseNode`` accepts a node only if it has exactly one input, exactly one
    output, and carries every attribute named in ``_requiredAttrs`` and
    ``_floatAttrs``. ``parseNodeCtxt`` then fills the operator representation
    with the buffer names, ``seed``, ``size``, ``nodeIdx`` and ``eps``, plus
    one float entry per name in ``_floatAttrs``.
    """

    # 'idx' is checked up front because parseNodeCtxt reads it unconditionally;
    # without the check a missing attribute surfaces as a KeyError later on.
    _requiredAttrs: Tuple[str, ...] = ('seed', 'idx', 'eps')

    # Extra attributes that must be present and are stored as Python floats.
    _floatAttrs: Tuple[str, ...] = ()

    def parseNode(self, node: gs.Node) -> bool:
        """Return True if the node has the arity and attributes this parser needs."""
        attrNames = self._requiredAttrs + self._floatAttrs
        return all([len(node.inputs) == 1, len(node.outputs) == 1] + [name in node.attrs for name in attrNames])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Populate the operator representation from the node and its buffers.

        Returns the (unmodified) context and ``True``.
        """
        data_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)

        input_shape = data_in.shape
        if isinstance(input_shape, int):
            # tuple(<int>) raises TypeError (an int is not iterable); wrap the
            # scalar shape in a 1-tuple instead.
            input_shape = (input_shape,)

        self.operatorRepresentation['data_in'] = data_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['seed'] = node.attrs['seed']
        # Store a plain int rather than a NumPy scalar.
        self.operatorRepresentation['size'] = int(np.prod(input_shape))
        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
        self.operatorRepresentation['eps'] = node.attrs['eps']
        for name in self._floatAttrs:
            self.operatorRepresentation[name] = float(node.attrs[name])

        return ctxt, True


class PerturbNormalParser(_DensePerturbParser):
    """Parser for ``PerturbNormal`` nodes (attributes: ``seed``, ``idx``, ``eps``)."""


class PerturbUniformParser(_DensePerturbParser):
    """Parser for ``PerturbUniform`` nodes; additionally requires ``low`` and ``high``."""

    _floatAttrs = ('low', 'high')


class PerturbEggrollParser(NodeParser):
    """Parser for ``PerturbEggroll`` nodes.

    Unlike the other Perturb* parsers, the single input is read through its
    ``values`` (two entries, the second of which must be 1) rather than
    through its shape.
    """

    def parseNode(self, node: gs.Node) -> bool:
        """Return True if the node has one input, one output and the needed attributes."""
        return all([
            len(node.inputs) == 1,
            len(node.outputs) == 1,
            'seed' in node.attrs,
            'idx' in node.attrs,
            'eps' in node.attrs,
        ])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Populate the operator representation from the node and its buffers.

        Raises:
            AssertionError: if the input does not hold exactly two values or
                its second value is not 1.
        """
        # NOTE(review): relies on the looked-up input exposing `.values`
        # (presumably a constant buffer) - confirm against the graph exporter.
        shape_in = ctxt.lookup(node.inputs[0].name)
        data_out = ctxt.lookup(node.outputs[0].name)

        # Validate before indexing so a malformed input reports these messages
        # instead of an IndexError from values[0].
        assert len(shape_in.values) == 2, f"Expected input to be 2D, got {len(shape_in.values)}D"
        assert shape_in.values[1] == 1, f"Expected second dimension of input to be 1, got {shape_in.values[1]}"

        self.operatorRepresentation['shape_in'] = shape_in.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['seed'] = node.attrs['seed']
        self.operatorRepresentation['eps'] = node.attrs['eps']
        # Plain int, matching how the tile constraint reads the same value.
        self.operatorRepresentation['size'] = int(shape_in.values[0])
        self.operatorRepresentation['nodeIdx'] = node.attrs['idx']
        return ctxt, True


class PerturbRademacherParser(_DensePerturbParser):
    """Parser for ``PerturbRademacher`` nodes (attributes: ``seed``, ``idx``, ``eps``)."""


class PerturbTriangleParser(_DensePerturbParser):
    """Parser for ``PerturbTriangle`` nodes; additionally requires ``low`` and ``high``."""

    _floatAttrs = ('low', 'high')
class EggrollTileConstraint(TileConstraint):
    """Tile constraint for nodes described by a ``shape_in`` buffer and a ``data_out`` buffer."""

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Register both buffers and bound the first two output dims.

        The bounds are the first two entries of the ``shape_in`` buffer's
        ``values``; output dimensions beyond the second are left unconstrained.
        """
        shapeName = parseDict['shape_in']
        shapeBuffer = ctxt.lookup(shapeName)
        outName = parseDict['data_out']
        outBuffer = ctxt.lookup(outName)

        # Both bounds are read up front, before the model is touched.
        upperBounds = (int(shapeBuffer.values[0]), int(shapeBuffer.values[1]))

        tilerModel.addTensorDimToModel(ctxt, shapeName)
        tilerModel.addTensorDimToModel(ctxt, outName)

        for dim in range(len(outBuffer.shape)):
            dimVar = tilerModel.getTensorDimVar(tensorName = outName, dimIdx = dim)
            if dim < 2:
                tilerModel.addConstraint(dimVar <= upperBounds[dim])

        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        """Return a copy of ``parseDict`` extended with two symbolic dim variables.

        Despite their ``inputDimVar*`` keys, both variables are taken from the
        ``data_out`` buffer (dimensions 0 and 1).
        """
        outBuffer = ctxt.lookup(parseDict['data_out'])

        symbolicParseDict = parseDict.copy()
        symbolicParseDict['inputDimVar0'] = tilerModel.getTensorDimVar(outBuffer.name, 0)
        symbolicParseDict['inputDimVar1'] = tilerModel.getTensorDimVar(outBuffer.name, 1)
        return symbolicParseDict

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Build the per-tile variable replacements and the tiling schedule.

        For every output tile the replacement lists receive the tile's first
        two dims and the product of all its dims. Only ``data_out`` is
        scheduled; the input load schedule stays empty.
        """
        rectangles = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, ['shape_in', 'data_out'])

        replacements = {
            "inputDimVar0": [rect.dims[0] for rect in rectangles],
            "inputDimVar1": [rect.dims[1] for rect in rectangles],
            "size": [np.prod(rect.dims) for rect in rectangles],
        }
        replacementTypes = {key: PointerClass(uint32_t) for key in ("inputDimVar0", "inputDimVar1", "size")}

        inputLoadSchedule = []
        outputLoadSchedule = [{"data_out": rect} for rect in rectangles]

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)

        return variableReplacementSchedule, tilingSchedule
class PerturbZOChecker(SignPropTypeChecker):
    """Type checker used by the Perturb* node bindings."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        """Forward the accepted input and output pointer types to the base checker."""
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        """Give the single output the same number of levels as the first input."""
        firstInput = inputs[0]
        return [firstInput.nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        """Report the single output as signed, regardless of the inputs."""
        outputSigned = True
        return [outputSigned]
PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -32,7 +32,8 @@ FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate + TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate, FloatPerturbNormalTemplate, \ + FloatPerturbUniformTemplate, FloatPerturbEggrollTemplate, FloatPerturbRademacherTemplate, FloatPerturbTriangleTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -368,6 +369,9 @@ PULPConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float_type), PointerClass(float_type)], [PointerClass(float_type)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) for float_type in FloatDataTypes ] PULPiRMSNormBindings = [ @@ -448,3 +452,33 @@ NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, ForkTransformer), ] + +PULPPerturbNormalBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbNormalTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbUniformBindings = [ + NodeBinding( + 
PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbUniformTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbEggrollBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), + FloatPerturbEggrollTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbRademacherBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbRademacherTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbTriangleBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbTriangleTemplate.referenceTemplate, + ForkTransformer)] diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 5c5951eaba..99e45cefb3 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -75,10 +75,10 @@ def parseNode(self, node: gs.Node) -> (bool): # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, - # Make sure padding is square - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias @@ -133,10 +133,10 @@ def parseNode(self, node: gs.Node) -> (bool): if wellFormed: # Check if the node is a depthwise convolution ret = all([ - # Make sure padding is square - self.operatorRepresentation['pads'][0] == 
self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index d45dc00f9c..4b6a8d6a3d 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -17,13 +17,15 @@ GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer + SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer, PerturbNormalLayer, \ + PerturbUniformLayer, PerturbEggrollLayer, PerturbRademacherLayer, PerturbTriangleLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, \ MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, \ RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ - TransposeParser, 
UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser + TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser, \ + PerturbNormalParser, PerturbUniformParser, PerturbEggrollParser, PerturbRademacherParser, PerturbTriangleParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -47,7 +49,9 @@ PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ - PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, \ + PULPPerturbNormalTilingReadyBindings, PULPPerturbUniformTilingReadyBindings, \ + PULPPerturbEggrollTilingReadyBindings, PULPPerturbRademacherTilingReadyBindings, PULPPerturbTriangleTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -90,6 +94,13 @@ SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings) SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings) Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), PULPSoftmaxTilingReadyBindings) +PerturbNormalMapper = NodeMapper(PerturbNormalParser(), PULPPerturbNormalTilingReadyBindings) +PerturbUniformMapper = NodeMapper(PerturbUniformParser(), PULPPerturbUniformTilingReadyBindings) +PerturbEggrollMapper = NodeMapper(PerturbEggrollParser(), 
PULPPerturbEggrollTilingReadyBindings) +PerturbRademacherMapper = NodeMapper(PerturbRademacherParser(), PULPPerturbRademacherTilingReadyBindings) +PerturbTriangleMapper = NodeMapper(PerturbTriangleParser(), PULPPerturbTriangleTilingReadyBindings) + + ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) @@ -148,7 +159,12 @@ 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), - 'SGD': SGDLayer([SGDMapper]) + 'SGD': SGDLayer([SGDMapper]), + 'PerturbNormal': PerturbNormalLayer([PerturbNormalMapper]), + 'PerturbUniform': PerturbUniformLayer([PerturbUniformMapper]), + 'PerturbEggroll': PerturbEggrollLayer([PerturbEggrollMapper]), + 'PerturbRademacher': PerturbRademacherLayer([PerturbRademacherMapper]), + 'PerturbTriangle': PerturbTriangleLayer([PerturbTriangleMapper]), } diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py new file mode 100644 index 0000000000..aea54d8e11 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbEggrollTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed_${nodeName}. 
+ operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbEggrollTemplate(""" +// Perturb Eggroll (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729); + +GenEggrollPerturbation((float32_t *) &${data_out}[${nodeName}_chunk_start], + chunk_seed, + ${nodeName}_local_size); +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py new file mode 100644 index 0000000000..eb89d487aa --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbNormalTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed_${nodeName}. 
+ operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbNormalTemplate(""" +// PerturbNormal (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729); + +// pick large enough stride to minimize correlation between nodes. +ApplyGaussianPerturbation( + (const float32_t *) &${data_in}[${nodeName}_chunk_start], + (float32_t *) &${data_out}[${nodeName}_chunk_start], + chunk_seed, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size, + ${eps}f +); +""" +) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py new file mode 100644 index 0000000000..b550539b88 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbRademacherTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + 
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed. + operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatPerturbRademacherTemplate(""" +// PerturbRademacher (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t i = ${nodeName}_chunk_start; +for (; i < ${nodeName}_chunk_stop; i++) { + // pick large enough stride to minimize correlation between nodes. 
+ uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729); + ApplyRademacherPerturbation((const float32_t *) &${data_in}[i], + (float32_t *) &${data_out}[i], + chunk_seed, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size, + ${eps}f); +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py new file mode 100644 index 0000000000..416a7b0a40 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbTriangleTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed_${nodeName}. 
+ operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbTriangleTemplate(""" +// PerturbTriangle (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t i = ${nodeName}_chunk_start; +for (; i < ${nodeName}_chunk_stop; i++) { + // pick large enough stride to minimize correlation between nodes. + uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729); + ApplyTriangularPerturbation((const float32_t *) &${data_in}[i], + (float32_t *) &${data_out}[i], + chunk_seed, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size, + ${eps}f); +} +""") + +updateTemplate = _FloatPerturbTriangleTemplate(""" +// UpdateTriangle (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + UpdateWeightsTriangle((float32_t *)${data_in}, + loss, + seed + ${node_id}, + ${eps}f, + lr, // globally defined + ${size}); +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py new file mode 100644 index 0000000000..e81147751b --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from 
Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbUniformTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed. + operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatPerturbUniformTemplate(""" +// PerturbUniform (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +// pick large enough stride to minimize correlation between nodes. 
+uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729); +ApplyUniformPerturbation((const float32_t *) &${data_in}[${nodeName}_chunk_start], + (float32_t *) &${data_out}[${nodeName}_chunk_start], + chunk_seed, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size, + ${eps}f); +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 3d7d11f343..8431b6cc89 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -13,6 +13,7 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.Generic.TileConstraints.EggrollTileConstraint import EggrollTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ @@ -22,7 +23,8 @@ PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \ PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \ PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ - PULPTransposeBindings, PULPUniformRQSBindings + PULPTransposeBindings, PULPUniformRQSBindings, PULPPerturbNormalBindings, PULPPerturbUniformBindings, \ + PULPPerturbEggrollBindings, PULPPerturbRademacherBindings, PULPPerturbTriangleBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint from 
Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ RQDWConv2DTileConstraint @@ -153,3 +155,18 @@ PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, tileConstraint = ReduceMeanTileConstraint()) + +PULPPerturbNormalTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbNormalBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbUniformTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbUniformBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbEggrollTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbEggrollBindings, + tileConstraint = EggrollTileConstraint()) + +PULPPerturbRademacherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbRademacherBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbTriangleTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbTriangleBindings, + tileConstraint = UnaryTileConstraint()) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..1b92df1752 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -146,6 +146,8 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List targetIdx = 1 fullShape = ctxt.lookup(outVar).shape + if isinstance(fullShape, int): + fullShape = (fullShape,) initialOffset = (0,) * len(fullShape) outputCubes = [ AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)), diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz new file mode 100644 index 0000000000..b58ac20c7b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx new file mode 100644 index 0000000000..38798357d4 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz new file mode 100644 index 0000000000..5284177d8e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz new file mode 100644 index 0000000000..847536024f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx new file mode 100644 index 0000000000..5dcf8bae70 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz new file mode 100644 index 0000000000..a780bf64e6 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz new file mode 100644 index 0000000000..4a1e9c269c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx new file mode 100644 index 
0000000000..52f0ccfd9c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz new file mode 100644 index 0000000000..8dc3b1d0cf Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz new file mode 100644 index 0000000000..d77ca34b35 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx new file mode 100644 index 0000000000..23990e812d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz new file mode 100644 index 0000000000..9113de38f2 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz new file mode 100644 index 0000000000..a7e5c1cfa0 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx new file mode 100644 index 0000000000..42c66ac0c8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz new file mode 100644 index 0000000000..56be194e61 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz new file mode 100644 index 0000000000..8dcd54a7dd Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx new file mode 100644 index 0000000000..26bedabcb4 Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz new file mode 100644 index 0000000000..e768b0ce4d Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz new file mode 100644 index 0000000000..8dcd54a7dd Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/network.onnx b/DeeployTest/Tests/Models/Lite-CNN/network.onnx new file mode 100644 index 0000000000..2a39932575 Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/network.onnx differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz new file mode 100644 index 0000000000..e768b0ce4d Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz differ diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz b/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz new file mode 100644 index 0000000000..eccd659d67 Binary files /dev/null and 
b/DeeployTest/Tests/Models/LiteCNN-Eggroll/inputs.npz differ diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx b/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx new file mode 100644 index 0000000000..ddf5e6285b Binary files /dev/null and b/DeeployTest/Tests/Models/LiteCNN-Eggroll/network.onnx differ diff --git a/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz b/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz new file mode 100644 index 0000000000..2bd8307dbd Binary files /dev/null and b/DeeployTest/Tests/Models/LiteCNN-Eggroll/outputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz new file mode 100644 index 0000000000..d55dda479f Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx new file mode 100644 index 0000000000..c5aefc7f47 Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz new file mode 100644 index 0000000000..7b64cc07d8 Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit/inputs.npz b/DeeployTest/Tests/Models/SleepConVit/inputs.npz new file mode 100644 index 0000000000..ee174fcab4 Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/inputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit/network.onnx b/DeeployTest/Tests/Models/SleepConVit/network.onnx new file mode 100644 index 0000000000..c51390febe Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/network.onnx differ diff --git a/DeeployTest/Tests/Models/SleepConVit/outputs.npz b/DeeployTest/Tests/Models/SleepConVit/outputs.npz new file mode 100644 index 
0000000000..8babb4ed7a Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/outputs.npz differ diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..cbaeda7cae 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -212,6 +212,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg help = "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." ) + parser.add_argument('--run_mode', type = str, default = 'inference', + help = 'Run mode of the network. Options are: inference, mezo_training.') parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testRunner_tiled_siracusa_mezo.py b/DeeployTest/testRunner_tiled_siracusa_mezo.py new file mode 100644 index 0000000000..9b85b7f491 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_mezo.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 8, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = True, argument_parser = parser, gen_args = "--run_mode mezo_training") + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + testRunner.run() diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c97..95de0c7a5b 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -34,6 +34,7 @@ #include 
"kernel/UniformRequantShift.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" +#include "kernel/RandomNoise.h" #define LOG2(x) (__builtin_pulp_fl1(x)) diff --git a/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h new file mode 100644 index 0000000000..f8db0d6d59 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h @@ -0,0 +1,114 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ + +#include "DeeployPULPMath.h" + + +#define PI_F 3.14159265358979323846f + +#define ZIGGURAT_TABLE_SIZE 128 +#define ZIGGURAT_R 3.442619855899 +#define ZIGGURAT_V 9.91256303526217e-3 + +static uint32_t kn[ZIGGURAT_TABLE_SIZE]; +static float32_t wn[ZIGGURAT_TABLE_SIZE]; +static float32_t fn[ZIGGURAT_TABLE_SIZE]; +static int32_t ziggurat_tables_initialized = 0; + + +typedef struct { + uint32_t state; + uint32_t bits; + uint32_t bitpos; +} RademacherRNG; + +// Sample from Unifom distribution U[-0.5,0.5] +float32_t UniformSample(uint32_t *state); +// Sample from triangular distribution Tr[-1, 1] +float32_t TriangularSample(uint32_t *state); +float32_t GaussianSample(uint32_t *state); +float32_t RademacherSample(RademacherRNG *rng); +uint32_t Xorshift32(uint32_t state); +void build_ziggurat_tables(); +float32_t GaussianZigguratSample(uint32_t *state); + +void RademacherRNG_init(RademacherRNG *rng, uint32_t seed); + +// Applies triangular perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. +void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + uint32_t dir, + uint32_t size, + float32_t epsilon); + +// Applies uniform perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. 
+void ApplyUniformPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + uint32_t dir, + uint32_t size, + float32_t epsilon); + +// Applies Gaussian perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. +void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + uint32_t dir, + uint32_t size, + float32_t epsilon); + +// Applies Rademacher perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. +void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + uint32_t dir, + uint32_t size, + float32_t epsilon); + + +// Updates the weights in place according to the MeZO update rule with triangular noise. +// Only supports qMeZO with q = 1 for now. +// void UpdateWeightsTriangle(float32_t *__restrict__ pweights, +// float32_t loss, +// uint32_t seed, +// float32_t epsilon, +// float32_t lr, +// uint32_t size); + +// // Updates the weights in place according to the MeZO update rule with uniform noise. +// // Only supports qMeZO with q = 1 for now. 
+// void UpdateWeightsUniform(float32_t *__restrict__ pweights, +// float32_t loss, +// uint32_t seed, +// float32_t epsilon, +// float32_t lr, +// uint32_t size); + +// void UpdateWeightsGaussian(float32_t *__restrict__ pweights, +// float32_t loss, +// uint32_t seed, +// float32_t epsilon, +// float32_t lr, +// uint32_t size); + +// void UpdateWeightsRademacher(float32_t *__restrict__ pweights, +// float32_t loss, +// uint32_t seed, +// float32_t epsilon, +// float32_t lr, +// uint32_t size); + + +void GenEggrollPerturbation(float32_t *__restrict__ p_dest, + uint32_t seed, + uint32_t size, + float32_t epsilon); + +#endif //__DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c index a46f8ac6ae..02fd991674 100644 --- a/TargetLibraries/PULPOpen/src/Gemm.c +++ b/TargetLibraries/PULPOpen/src/Gemm.c @@ -6,6 +6,7 @@ #include "DeeployPULPMath.h" #include "pmsis.h" +// #include "perf_utils.h" void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, @@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + //RW: Performance monitoring is currently disabled + // perf_stats_t perf_start, perf_end, perf_total; + + // // Initialize and start performance counters (only core 0) + // if (core_id == 0) { + // perf_bench_init(); + // perf_bench_start(); + // perf_bench_read(&perf_start); + // } + uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); uint32_t M_end = MIN(M_start + M_chunk, M); @@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, } } } + + // RW: Stop performance counters and print results (only core 0) + // if (core_id == 0) { + // perf_bench_stop(); + // perf_bench_read(&perf_end); + // perf_bench_diff(&perf_total, 
&perf_end, &perf_start);
+
+  //   char label[100];
+  //   snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
+  //            M, N, O, transA, transB);
+  //   perf_bench_print(label, &perf_total);
+  // }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/RandomNoise.c b/TargetLibraries/PULPOpen/src/RandomNoise.c
new file mode 100644
index 0000000000..001d205af8
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/RandomNoise.c
@@ -0,0 +1,303 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+#include <math.h>
+
+// TODO: 1) loop unrolling for ILP perf
+// TODO: 2) Perturbation directly integrated in GEMM or Conv kernels.
+/* --------------------------- RNG ---------------------------------- */
+
+// One step of the 32-bit xorshift generator (shift triple 13, 17, 5).
+// Zero is a fixed point: a zero state stays zero forever.
+uint32_t Xorshift32(uint32_t state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state;
+}
+
+// Derives the initial generator state from a user seed (one LCG step).
+// Exactly one seed maps to 0, which would pin Xorshift32 at 0 (constant
+// "noise", and logf(0) in the Box-Muller sampler), so that value is remapped.
+// Code that has to regenerate the same noise must derive its state this way too.
+static inline uint32_t InitRngState(uint32_t seed) {
+  const uint32_t state = (seed * 1664525u) + 1013904223u;
+  return (state != 0u) ? state : 0x9E3779B9u;
+}
+
+// Advances *state and scales it to a float in (0, 1] (for a non-zero state).
+// NOTE: assuming float32_t is IEEE-754 single, (float32_t)0xFFFFFFFF rounds to
+// 2^32, and the largest states round up to it, hence the closed upper bound.
+static inline float32_t NextUnitFloat(uint32_t *state) {
+  *state = Xorshift32(*state);
+  return (float32_t)(*state) / (float32_t)0xFFFFFFFF;
+}
+
+/* --------------------------- Samplers ---------------------------------- */
+
+// Difference of two consecutive uniforms: triangular on [-1, 1], variance 1/6.
+float32_t TriangularSample(uint32_t *state) {
+  const float32_t u1 = NextUnitFloat(state);
+  const float32_t u2 = NextUnitFloat(state); // drawn from the advanced state
+  return u1 - u2;
+}
+
+// Uniform sample in [-0.5, 0.5] (variance 1/12).
+float32_t UniformSample(uint32_t *state) {
+  return NextUnitFloat(state) - 0.5f; // centered around 0
+}
+
+// Standard normal sample via the Box-Muller transform (cosine branch only).
+// u1 is strictly positive for a non-zero state, so logf(u1) is finite.
+float32_t GaussianSample(uint32_t *state) {
+  const float32_t u1 = NextUnitFloat(state);
+  const float32_t u2 = NextUnitFloat(state);
+  return sqrtf(-2.0f * logf(u1)) * cosf(2.0f * (float32_t)PI_F * u2);
+}
+
+/* ---------------- Ziggurat method for Gaussian sampling ---------------- */
+// This implementation is adapted from the public domain Ziggurat algorithm
+// by Marsaglia and Tsang.
+// kn, wn, fn, ziggurat_tables_initialized and the ZIGGURAT_* constants are
+// declared outside this file.
+
+// Fills the kn/wn/fn tables once; later calls return immediately.
+// GaussianZigguratSample() draws a signed 32-bit integer hz, |hz| <= 2^31, so
+// the integer thresholds (kn) and the integer-to-float widths (wn) are scaled
+// by 2^31, as in the reference zigset(), not by 2^32.
+// NOTE(review): the init flag is not synchronised; if several cores can get
+// here concurrently the tables should be built once up front - confirm.
+void build_ziggurat_tables(void) {
+  if (ziggurat_tables_initialized) return;
+
+  const float32_t m1 = 2147483648.0f; // 2^31
+  float32_t dn = (float32_t)ZIGGURAT_R;
+  float32_t tn = dn;
+  float32_t vn = (float32_t)ZIGGURAT_V;
+
+  // Set up the tables
+  float32_t q = vn / expf(-0.5f * dn * dn);
+  kn[0] = (uint32_t)((dn / q) * m1);
+  kn[1] = 0;
+
+  wn[0] = q / m1;
+  wn[ZIGGURAT_TABLE_SIZE - 1] = dn / m1;
+
+  fn[0] = 1.0f;
+  fn[ZIGGURAT_TABLE_SIZE - 1] = expf(-0.5f * dn * dn);
+
+  for (uint32_t i = ZIGGURAT_TABLE_SIZE - 2; i >= 1; i--) {
+    dn = sqrtf(-2.0f * logf(vn / dn + expf(-0.5f * dn * dn)));
+    kn[i + 1] = (uint32_t)((dn / tn) * m1);
+    tn = dn;
+    fn[i] = expf(-0.5f * dn * dn);
+    wn[i] = dn / m1;
+  }
+  ziggurat_tables_initialized = 1;
+}
+
+// Standard normal sample via the Ziggurat method. Builds the tables on first
+// use and advances *state at least once per call.
+float32_t GaussianZigguratSample(uint32_t *state) {
+  if (!ziggurat_tables_initialized) {
+    build_ziggurat_tables();
+  }
+
+  for (;;) {
+    *state = Xorshift32(*state);
+    const int32_t hz = (int32_t)(*state);
+    const uint32_t iz = (*state) & (ZIGGURAT_TABLE_SIZE - 1);
+    // |hz| in unsigned arithmetic: negating INT32_MIN as int32_t is undefined.
+    const uint32_t hz_abs = (hz < 0) ? (0u - (*state)) : (*state);
+
+    // Quick acceptance path
+    if (hz_abs < kn[iz]) {
+      return (float32_t)hz * wn[iz];
+    }
+
+    // Handle the tail
+    if (iz == 0) {
+      float32_t x, y;
+      do {
+        x = -logf(NextUnitFloat(state)) / ZIGGURAT_R;
+        y = -logf(NextUnitFloat(state));
+      } while (y + y < x * x);
+      return (hz > 0) ? ZIGGURAT_R + x : -ZIGGURAT_R - x;
+    }
+
+    // Slower rejection path. The acceptance test needs a uniform that is
+    // independent of hz, so draw a fresh one instead of reusing hz's bits.
+    const float32_t x = (float32_t)hz * wn[iz];
+    const float32_t u = NextUnitFloat(state);
+    if (fn[iz] + u * (fn[iz - 1] - fn[iz]) < expf(-0.5f * x * x)) {
+      return x;
+    }
+  }
+}
+
+// Resets rng so that the first RademacherSample() call refills the bit buffer.
+void RademacherRNG_init(RademacherRNG *rng, uint32_t seed) {
+  rng->state = seed;
+  rng->bits = 0;
+  rng->bitpos = 32; // force refill on first use
+}
+
+// Returns +1.0f or -1.0f. One Xorshift32 step feeds 32 samples: the state is
+// advanced only when the buffered bits are used up; bits are taken LSB first.
+float32_t RademacherSample(RademacherRNG *rng) {
+  if (rng->bitpos >= 32) {
+    rng->state = Xorshift32(rng->state);
+    rng->bits = rng->state;
+    rng->bitpos = 0;
+  }
+  float32_t val = (rng->bits & 1) ? 1.0f : -1.0f;
+  rng->bits >>= 1;
+  rng->bitpos++;
+  return val;
+}
+
+/* ------------------------- Perturbation Functions -------------------------------- */
+// Shared contract: pweights_dest[i] = pweights[i] + s * epsilon * z_i for every
+// i in [0, size), where z_i is unit-variance noise generated from seed, and
+// s is -1 when dir == 0 and +1 otherwise.
+
+void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights,
+                                 float32_t *__restrict__ pweights_dest,
+                                 uint32_t seed, uint32_t dir, uint32_t size,
+                                 float32_t epsilon) {
+  uint32_t rng_state = InitRngState(seed);
+  const float32_t sqrt6 = 2.44948974278f;
+  float32_t scale = epsilon * sqrt6; // sqrt(6): => variance 1
+  if (dir == 0) {
+    scale *= -1.0f;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    float32_t tr = TriangularSample(&rng_state);
+    pweights_dest[i] = pweights[i] + tr * scale;
+  }
+}
+
+void ApplyUniformPerturbation(const float32_t *__restrict__ pweights,
+                              float32_t *__restrict__ pweights_dest,
+                              uint32_t seed, uint32_t dir, uint32_t size,
+                              float32_t epsilon) {
+  uint32_t rng_state = InitRngState(seed);
+  const float32_t sqrt3 = 1.73205080757f;
+  // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => variance 1
+  float32_t scale = epsilon * sqrt3 * 2.0f;
+  if (dir == 0) {
+    scale *= -1.0f;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    float32_t u = UniformSample(&rng_state);
+    pweights_dest[i] = pweights[i] + u * scale;
+  }
+}
+
+void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights,
+                               float32_t *__restrict__ pweights_dest,
+                               uint32_t seed, uint32_t dir, uint32_t size,
+                               float32_t epsilon) {
+  uint32_t rng_state = InitRngState(seed);
+  float32_t scale = epsilon; // gaussian naturally has variance 1
+  if (dir == 0) {
+    scale *= -1.0f;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    float32_t u = GaussianSample(&rng_state);
+    pweights_dest[i] = pweights[i] + u * scale;
+  }
+}
+
+void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights,
+                                 float32_t *__restrict__ pweights_dest,
+                                 uint32_t seed, uint32_t dir, uint32_t size,
+                                 float32_t epsilon) {
+  // Use the initialiser instead of a positional aggregate so this does not
+  // depend on the field order of RademacherRNG.
+  RademacherRNG rng_state;
+  RademacherRNG_init(&rng_state, InitRngState(seed));
+  float32_t scale = epsilon; // rademacher naturally has variance 1
+  if (dir == 0) {
+    scale *= -1.0f;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    float32_t u = RademacherSample(&rng_state);
+    pweights_dest[i] = pweights[i] + u * scale;
+  }
+}
+
+// Fills p_dest[0..size) with raw UniformSample() values generated from seed.
+// Kept for compatibility with existing codegen templates. epsilon is accepted
+// but not applied: the samples are written unscaled.
+void GenEggrollPerturbation(float32_t *__restrict__ p_dest, uint32_t seed,
+                            uint32_t size, float32_t epsilon) {
+  (void)epsilon;
+  uint32_t rng_state = InitRngState(seed);
+  for (uint32_t i = 0; i < size; i++) {
+    p_dest[i] = UniformSample(&rng_state);
+  }
+}
+
+/* --------------------------- Update functions ---------------------------------- */
+// NOTE: when re-enabling these, derive rng_state with InitRngState(seed) so
+// the regenerated noise matches what the Apply* functions produced.
+
+// void UpdateWeightsTriangle(float32_t *__restrict__ pweights,
+//                       float32_t loss,
+//                       uint32_t seed,
+//                       float32_t epsilon,
+//                       float32_t lr,
+//                       uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     float32_t sqrt6 = 2.44948974278f;
+//     const float32_t scale = sqrt6; // sqrt(6): => Gaussian(0, 1) l2 norm.
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t tr = TriangularSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * tr * scale;
+//     }
+// }
+
+// void UpdateWeightsUniform(float32_t *__restrict__ pweights,
+//                       float32_t loss,
+//                       uint32_t seed,
+//                       float32_t epsilon,
+//                       float32_t lr,
+//                       uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     float32_t sqrt3 = 1.73205080757f;
+//     const float32_t scale = sqrt3 * 2.0f; // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => variance 1
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = UniformSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u * scale;
+//     }
+// }
+
+// void UpdateWeightsGaussian(float32_t *__restrict__ pweights,
+//                       float32_t loss,
+//                       uint32_t seed,
+//                       float32_t epsilon,
+//                       float32_t lr,
+//                       uint32_t size) {
+//     uint32_t rng_state = (seed * 1664525u) + 1013904223u;
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = GaussianSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u;
+//     }
+// }
+
+// void UpdateWeightsRademacher(float32_t *__restrict__ pweights,
+//                       float32_t loss,
+//                       uint32_t seed,
+//                       float32_t epsilon,
+//                       float32_t lr,
+//                       uint32_t size) {
+//     RademacherRNG rng_state = { (seed * 1664525u) + 1013904223u, 0, 32 };
+//     for (uint32_t i = 0; i < size; i++) {
+//         float32_t u = RademacherSample(&rng_state);
+//         pweights[i] = pweights[i] - lr * loss/(2.0f * epsilon) * u;
+//     }
+// }
\ No newline at end of file