diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index e1a9ed5932..61da33fc3c 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -102,7 +102,7 @@
     PULPSynchCoresPass(),
     ForkClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma()),
+    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters = True),  # Enable perf counters
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
@@ -120,7 +120,7 @@
     TilingVariableReplacement("L1"),
     TilingCallClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma()),
+    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters = True),  # Enable perf counters
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
index 3c0bba3107..59aec47a5d 100644
--- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
@@ -7,9 +7,9 @@
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 from Deeploy.TilingExtension.AsyncDma import AsyncDma
 from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
-    DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
+    DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
 from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
-    ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
+    PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
 
 
 class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
@@ -28,13 +28,38 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
     pass
 
 
+class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn):
+    """Single buffering with performance counter profiling"""
+    pass
+
+
+class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn):
+    """Double buffering with performance counter profiling"""
+    pass
+
+
+class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn):
+    """Single buffering with both cycle profiling and performance counter profiling"""
+    pass
+
+
+class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn):
+    """Double buffering with both cycle profiling and performance counter profiling"""
+    pass
+
+
 class PULPClusterTiling(CodeTransformationPass):
 
-    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
+    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
+        self.usePerfCounters = usePerfCounters
         self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
+        self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
+        self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
         self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
+        self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
+        self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
 
     def apply(self,
               ctxt: NetworkContext,
@@ -42,10 +67,16 @@ def apply(self,
               name: str,
               verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
 
-        if verbose.tilingProfiling:
+        if self.usePerfCounters and verbose.tilingProfiling:
+            # Use combined profiling: cycle measurements + performance counter stats
+            ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
+            ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
+        elif verbose.tilingProfiling:
+            # Use cycle profiling only (basic cycle measurements)
             ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
         else:
+            # No profiling; usePerfCounters alone has no effect (perfCounterSB/DB are currently unused)
             ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
 
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index d45dc00f9c..85f8cdf5eb 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -245,7 +245,7 @@ class PULPStructBuffer(StructBuffer):
 
 # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
 _includeList = [
-    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h"
+    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h"
 ]
 
 
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
index ad9c6ad012..ce9ec86f27 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
@@ -11,8 +11,8 @@
 from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
-    PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
+    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape
 
@@ -364,3 +364,38 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
+
+class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
+    """
+    Double buffering tiling with performance counter profiling.
+    Collects instruction-level statistics once for the whole tiling loop, not per tile.
+    """
+
+    @classmethod
+    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                                     setupStatements: List[CodeSnippet],
+                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
+                                                              teardownStatements)
+
+        # Inject performance counter initialization in setup (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
+
+        # Inject performance counter stop and print in teardown (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
+
+        return executionBlock
+
+    @classmethod
+    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
+                         egressDMAStatements: List[CodeSnippet],
+                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        # No per-tile kernel wrap (cls.injectPerfCounterKernelWrap) on purpose: the counters are
+        # started in the setup code and stopped in the teardown code, so they span the whole tiling loop
+
+        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
+                                                  egressDMAStatements, closeLoopStatements)
+        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
index ea1e938b58..e4bb803611 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
@@ -10,8 +10,8 @@
 from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
-    PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
+    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme
 
@@ -191,3 +191,39 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
+
+
+class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
+    """
+    Single buffering tiling with performance counter profiling.
+    Collects instruction-level statistics once for the whole tiling loop, not per tile.
+    """
+
+    @classmethod
+    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                                     setupStatements: List[CodeSnippet],
+                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
+                                                              teardownStatements)
+
+        # Inject performance counter initialization in setup (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
+
+        # Inject performance counter stop and print in teardown (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
+
+        return executionBlock
+
+    @classmethod
+    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
+                         egressDMAStatements: List[CodeSnippet],
+                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        # No per-tile kernel wrap (cls.injectPerfCounterKernelWrap) on purpose: the counters are
+        # started in the setup code and stopped in the teardown code, so they span the whole tiling loop
+
+        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
+                                                  egressDMAStatements, closeLoopStatements)
+        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
index 09a4ef56eb..70aabd9805 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
@@ -64,6 +64,105 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
         return executionBlock
 
 
+class PerfCounterProfilingMixIn(ABC):
+    """
+    MixIn for injecting performance counter profiling code.
+    Provides detailed instruction-level statistics using CSR performance counters.
+    """
+
+    _perfCounterInit = NodeTemplate("""
+    perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
+    if (pi_core_id() == 0) {
+        perf_bench_init();
+        perf_bench_start();
+        perf_bench_read(&${nodeName}_perf_start);
+    }
+    """)
+
+    _perfCounterStop = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${nodeName}_perf_end);
+        perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
+        perf_bench_print("${nodeName}", &${nodeName}_perf_total);
+    }
+    """)
+
+    _perfCounterKernelStart = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_start();
+        perf_bench_read(&${nodeName}_perf_kernel_start);
+    }
+    """)
+
+    _perfCounterKernelEnd = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${nodeName}_perf_kernel_end);
+        perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
+        perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
+    }
+    """)
+
+    _perfCounterKernelDecl = NodeTemplate("""
+    perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
+    """)
+
+    @classmethod
+    def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Inject performance counter initialization at the beginning of the node execution.
+        This should be called in the setup phase.
+        """
+        nodeName = metaInfo.nodeName
+
+        executionBlock.addLeft(cls._perfCounterInit, {
+            "nodeName": nodeName,
+        })
+
+        return executionBlock
+
+    @classmethod
+    def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Inject performance counter stop and print at the end of the node execution.
+        This should be called in the teardown phase.
+        """
+        nodeName = metaInfo.nodeName
+
+        executionBlock.addRight(cls._perfCounterStop, {
+            "nodeName": nodeName,
+        })
+
+        return executionBlock
+
+    @classmethod
+    def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Wrap the kernel execution with performance counter measurements.
+        This provides detailed statistics for just the kernel computation (excluding DMA).
+        """
+        nodeName = metaInfo.nodeName
+
+        if metaInfo.kernelLevelTiling:
+            # addLeft inserts at the beginning, so add the start measurement first ...
+            executionBlock.addLeft(cls._perfCounterKernelStart, {
+                "nodeName": nodeName,
+            })
+
+            # ... and the declaration last, so it is emitted before the snippets that use it
+            executionBlock.addLeft(cls._perfCounterKernelDecl, {
+                "nodeName": nodeName,
+            })
+
+            # Add stop and print after kernel
+            executionBlock.addRight(cls._perfCounterKernelEnd, {
+                "nodeName": nodeName,
+            })
+
+        return executionBlock
+
+
 class ProfilingPrototypeMixIn(ABC):
     _measureCycles = NodeTemplate("""
     ${measurements}[${tileIdxVar}] = getCycles();
diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h
new file mode 100644
index 0000000000..2d9fbc39c6
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/perf_utils.h
@@ -0,0 +1,158 @@
+/*
+ * Performance Counter Utilities for PULP Benchmarking
+ */
+
+#ifndef __PERF_UTILS_H__
+#define __PERF_UTILS_H__
+
+#include "pmsis.h"
+
+// Performance event IDs (compatible with PMSIS)
+#define PI_PERF_CYCLES CSR_PCER_CYCLES
+#define PI_PERF_INSTR CSR_PCER_INSTR
+#define PI_PERF_LD_STALL CSR_PCER_LD_STALL
+#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL
+#define PI_PERF_IMISS CSR_PCER_IMISS
+#define PI_PERF_LD CSR_PCER_LD
+#define PI_PERF_ST CSR_PCER_ST
+#define PI_PERF_JUMP CSR_PCER_JUMP
+#define PI_PERF_BRANCH CSR_PCER_BRANCH
+#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH
+#define PI_PERF_RVC CSR_PCER_RVC
+#define PI_PERF_LD_EXT CSR_PCER_LD_EXT
+#define PI_PERF_ST_EXT CSR_PCER_ST_EXT
+#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC
+#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC
+#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT
+
+// Benchmark statistics structure
+typedef struct {
+    unsigned int cycles;
+    unsigned int instr;
+    unsigned int ld;
+    unsigned int st;
+    unsigned int ld_stall;
+    unsigned int jmp_stall;
+    unsigned int imiss;
+    unsigned int branch;
+    unsigned int taken_branch;
+    unsigned int rvc;
+    unsigned int ld_ext;
+    unsigned int st_ext;
+    unsigned int ld_ext_cyc;
+    unsigned int st_ext_cyc;
+    unsigned int tcdm_cont;
+} perf_stats_t;
+
+// Initialize performance counters for comprehensive benchmarking
+static inline void perf_bench_init(void) {
+    // Enable all performance counters
+    pi_perf_conf(
+        (1 << PI_PERF_CYCLES) |
+        (1 << PI_PERF_INSTR) |
+        (1 << PI_PERF_LD_STALL) |
+        (1 << PI_PERF_JMP_STALL) |
+        (1 << PI_PERF_IMISS) |
+        (1 << PI_PERF_LD) |
+        (1 << PI_PERF_ST) |
+        (1 << PI_PERF_JUMP) |
+        (1 << PI_PERF_BRANCH) |
+        (1 << PI_PERF_TAKEN_BRANCH) |
+        (1 << PI_PERF_RVC) |
+        (1 << PI_PERF_LD_EXT) |
+        (1 << PI_PERF_ST_EXT) |
+        (1 << PI_PERF_LD_EXT_CYC) |
+        (1 << PI_PERF_ST_EXT_CYC) |
+        (1 << PI_PERF_TCDM_CONT)
+    );
+}
+
+// Start performance monitoring
+static inline void perf_bench_start(void) {
+    pi_perf_reset();
+    pi_perf_start();
+}
+
+// Stop performance monitoring
+static inline void perf_bench_stop(void) {
+    pi_perf_stop();
+}
+
+// Read all performance counters into structure
+static inline void perf_bench_read(perf_stats_t *stats) {
+    stats->cycles = pi_perf_read(PI_PERF_CYCLES);
+    stats->instr = pi_perf_read(PI_PERF_INSTR);
+    stats->ld = pi_perf_read(PI_PERF_LD);
+    stats->st = pi_perf_read(PI_PERF_ST);
+    stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
+    stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
+    stats->imiss = pi_perf_read(PI_PERF_IMISS);
+    stats->branch = pi_perf_read(PI_PERF_BRANCH);
+    stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
+    stats->rvc = pi_perf_read(PI_PERF_RVC);
+    stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
+    stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
+    stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
+    stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
+    stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
+}
+
+// Print performance statistics (core 0 only to avoid clutter)
+static inline void perf_bench_print(const char *label, const perf_stats_t *stats) {
+    if (pi_core_id() == 0) {
+        printf("\n=== Performance Statistics: %s ===\n", label);
+        printf("Cycles: %10u\n", stats->cycles);
+        printf("Instructions: %10u\n", stats->instr);
+        printf("IPC: %10.3f\n",
+               stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
+        printf("\n--- Instruction Mix ---\n");
+        printf("Loads: %10u (%.2f%%)\n", stats->ld,
+               stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
+        printf("Stores: %10u (%.2f%%)\n", stats->st,
+               stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
+        printf("Branches: %10u (%.2f%%)\n", stats->branch,
+               stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
+        printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch,
+               stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f);
+        printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc,
+               stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
+        printf("\n--- Stalls & Hazards ---\n");
+        printf("Load Stalls: %10u\n", stats->ld_stall);
+        printf("Jump Stalls: %10u\n", stats->jmp_stall);
+        printf("I-cache Misses: %10u\n", stats->imiss);
+        printf("TCDM Contentions: %10u\n", stats->tcdm_cont);
+        printf("\n--- Memory Hierarchy ---\n");
+        printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext,
+               stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
+        printf("External Stores: %10u (%.2f%%)\n", stats->st_ext,
+               stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
+        printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc,
+               stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
+        printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc,
+               stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
+        printf("========================================\n\n");
+    }
+}
+
+// Compute difference between two stats (for analyzing specific code sections)
+static inline void perf_bench_diff(perf_stats_t *result,
+                                   const perf_stats_t *end,
+                                   const perf_stats_t *start) {
+    result->cycles = end->cycles - start->cycles;
+    result->instr = end->instr - start->instr;
+    result->ld = end->ld - start->ld;
+    result->st = end->st - start->st;
+    result->ld_stall = end->ld_stall - start->ld_stall;
+    result->jmp_stall = end->jmp_stall - start->jmp_stall;
+    result->imiss = end->imiss - start->imiss;
+    result->branch = end->branch - start->branch;
+    result->taken_branch = end->taken_branch - start->taken_branch;
+    result->rvc = end->rvc - start->rvc;
+    result->ld_ext = end->ld_ext - start->ld_ext;
+    result->st_ext = end->st_ext - start->st_ext;
+    result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
+    result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
+    result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
+}
+
+#endif // __PERF_UTILS_H__
diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed
index a9b4aaf597..faed38c72b 160000
--- a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed
+++ b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed
@@ -1 +1 @@
-Subproject commit a9b4aaf597c030ce24bf65a00b5f3ec84a1528c4
+Subproject commit faed38c72b029b69dcab98571d228a66c3263891
diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nnx b/TargetLibraries/PULPOpen/third_party/pulp-nnx
index 234971fca4..c4f6ba351e 160000
--- a/TargetLibraries/PULPOpen/third_party/pulp-nnx
+++ b/TargetLibraries/PULPOpen/third_party/pulp-nnx
@@ -1 +1 @@
-Subproject commit 234971fca4a0eba5e8b703e9ccb62b7764dac7fa
+Subproject commit c4f6ba351e30b31125baba35896db394804d819d