From 0e02147a778d4c78c86605a773ae3a385b1e16f0 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 03:48:26 -0400 Subject: [PATCH 01/56] ci: configure vscode settings and setup clangd config --- .clangd | 44 +++++++++++++++++ .gitignore | 6 ++- .vscode/c_cpp_properties.json | 18 +++++++ .vscode/extension.json | 7 +++ .vscode/launch.json | 15 ++++++ .vscode/settings.json | 82 ++++++++++++++++++++++++++++++++ .vscode/tasks.json | 16 +++++++ CMakeLists.txt | 2 + run_linux.sh | 25 ++++++++++ src/main.cpp | 2 +- src/testing_helpers.hpp | 2 +- stream_compaction/CMakeLists.txt | 4 +- stream_compaction/cpu.cu | 1 - stream_compaction/thrust.cu | 2 +- 14 files changed, 219 insertions(+), 7 deletions(-) create mode 100644 .clangd create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/extension.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100755 run_linux.sh diff --git a/.clangd b/.clangd new file mode 100644 index 00000000..ed91fdc7 --- /dev/null +++ b/.clangd @@ -0,0 +1,44 @@ +# CompileFlags: +# Add: +# - -I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/thrust/ +# - -I/usr/local/cuda-13.0/targets/x86_64-linux/include +# - -L/usr/local/cuda-13.0/targets/x86_64-linux/lib64 +# - -I/usr/include/c++/15 +# - -I/usr/include/c++/15/x86_64-redhat-linux +# - -I/usr/include/c++/15/backward +# Remove: +# - -forward-unknown-to-host-compiler +# - --options-file +# - -arch=native +# - -G +CompileFlags: + Remove: + - -forward-unknown-to-host-compiler + - --options-file + - -arch=native + - -G + - -rdc=true +--- +If: + PathMatch: [.*\.cpp, .*\.h, .*\.hpp] + +CompileFlags: + Add: + - -xc++ + - -I/usr/local/cuda-13.0/targets/x86_64-linux/include + - -Wall + - -Wextra + - -Wpedantic + - -Werror + - -Winit-self + - -Wno-strict-aliasing + - -fno-omit-frame-pointer + - -fstack-protector-all + - -Wdeprecated-declarations +--- +If: + PathMatch: .*\.cu + +CompileFlags: + Add: + - -xcuda diff --git a/.gitignore b/.gitignore index a59ec565..ad03a69e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ cis565_getting_started_generated_kernel* *.xcodeproj build +.cache +log.txt + # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode ### Linux ### @@ -25,7 +28,8 @@ build .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 00000000..8a8e84dc --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/src/**", + "${workspaceFolder}/stream_compaction/**" + ], + "defines": [], + "compilerPath": "/usr/local/cuda/bin/nvcc", + "cStandard": "gnu17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/extension.json b/.vscode/extension.json new file mode 100644 index 00000000..f176108b --- /dev/null +++ b/.vscode/extension.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "nvidia.nsight-vscode-edition", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..511e45d8 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "preLaunchTask": "CMake: build", + "type": "cuda-gdb", + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "logFile": "${workspaceFolder}/log.txt", + "cwd": "${workspaceFolder}", + "environment": [] + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..2e094204 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,82 @@ +{ + "clangd.path": "/usr/bin/clangd", + "clangd.arguments": [ + "--log=verbose", + "-pretty", + "--background-index", + "--compile-commands-dir=${workspaceFolder}/build", + "--query-driver=/usr/bin/g++", + "--query-driver=/usr/bin/clang++" + ], + "files.associations": { + "cpp": "cuda-cpp", + "hpp": "cuda-cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "cctype": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "expected": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "format": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "queue": "cpp", + "ranges": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "text_encoding": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "variant": "cpp" + } +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..feb5a396 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "CMake: build", + "command": "build", + "targets": [ + "${command:cmake.buildTargetName}" + ], + "group": "build", + "problemMatcher": ["$nvcc"], + "detail": "CMake template build task" + }, + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 610c27d4..36f5f5b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) diff --git a/run_linux.sh b/run_linux.sh new file mode 100755 index 00000000..922a7d69 --- /dev/null +++ b/run_linux.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +CURRENT_DIR=$(pwd) +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +while true; do + read -p "Do you want to re-build? (y/n): " yn + case $yn in + [Yy]* ) + cd $SCRIPT_DIR; # cd to where script is located + + trash build; + mkdir build && cd build; + cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/thrust/; + make -j$(nproc --all); + + cd $CURRENT_DIR; # cd to stored original directory + break;; + [Nn]* ) + break;; + * ) echo "Invalid input. Please answer 'y' or 'n'.";; + esac +done + +$SCRIPT_DIR/build/bin/cis5650_stream_compaction_test diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..77cce672 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,7 +19,7 @@ int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; -int main(int argc, char* argv[]) { +int main() { // Scan tests printf("\n"); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94aa..408b1124 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -70,7 +70,7 @@ void printArray(int n, int *a, bool abridged = false) { } template -void printElapsedTime(T time, std::string note = "") +void printElapsedTime(T time, const char* note = "") { std::cout << " elapsed time: " << time << "ms " << note << std::endl; } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..96e157f9 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -28,5 +28,5 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0") else() set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) endif() -target_compile_options(stream_compaction PRIVATE "$<$,$>:-G;-src-in-ptx>") -target_compile_options(stream_compaction PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") + +target_compile_options(stream_compaction PRIVATE $<$:-O3>) \ No newline at end of file diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..ed8efb73 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,3 @@ -#include #include "cpu.h" #include "common.h" diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..e2d50cee 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,7 +3,7 @@ #include #include #include -#include "common.h" +#include "common.h" #include "thrust.h" namespace StreamCompaction { From 7f52fc258f12711dd8118a2b7c971a0ed40e2138 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 12:52:58 -0400 Subject: [PATCH 02/56] fix: ensure thrust header files are found by clangd --- .clangd | 19 ++++--------------- .gitignore | 2 +- run_linux.sh | 2 +- stream_compaction/CMakeLists.txt | 4 +--- 4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/.clangd b/.clangd index ed91fdc7..82404718 100644 --- a/.clangd +++ b/.clangd @@ -1,16 +1,3 @@ -# CompileFlags: -# Add: -# - -I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/thrust/ -# - -I/usr/local/cuda-13.0/targets/x86_64-linux/include -# - -L/usr/local/cuda-13.0/targets/x86_64-linux/lib64 -# - -I/usr/include/c++/15 -# - -I/usr/include/c++/15/x86_64-redhat-linux -# - -I/usr/include/c++/15/backward -# Remove: -# - -forward-unknown-to-host-compiler -# - --options-file -# - -arch=native -# - -G CompileFlags: Remove: - -forward-unknown-to-host-compiler @@ -40,5 +27,7 @@ If: PathMatch: .*\.cu CompileFlags: - Add: - - -xcuda + Add: + - -xcuda + - -I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/ + - -O3 diff --git a/.gitignore b/.gitignore index ad03a69e..7d50060c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ cis565_getting_started_generated_kernel* *.xcodeproj build -.cache +.cache/** log.txt # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode diff --git a/run_linux.sh b/run_linux.sh index 922a7d69..fc279784 100755 --- a/run_linux.sh +++ b/run_linux.sh @@ -11,7 +11,7 @@ while true; do trash build; mkdir build && cd build; - cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/thrust/; + cmake .. -DCMAKE_BUILD_TYPE=Release"; make -j$(nproc --all); cd $CURRENT_DIR; # cd to stored original directory diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 96e157f9..c4c8c0df 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -27,6 +27,4 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0") set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) else() set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) -endif() - -target_compile_options(stream_compaction PRIVATE $<$:-O3>) \ No newline at end of file +endif() \ No newline at end of file From 42cdf63455e4df4a33ec00c94eeb6701bdb16edd Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 15:18:50 -0400 Subject: [PATCH 03/56] ci: run clang-format --- .clang-format | 306 +++++++++++++++++++++++++++++++++ .vscode/settings.json | 74 +------- GNUmakefile | 7 + run_linux.sh | 2 +- src/main.cpp | 72 ++++---- src/testing_helpers.hpp | 56 +++--- stream_compaction/common.cu | 51 +++--- stream_compaction/common.h | 217 ++++++++++++----------- stream_compaction/cpu.cu | 89 +++++----- stream_compaction/cpu.h | 18 +- stream_compaction/efficient.cu | 69 ++++---- stream_compaction/efficient.h | 16 +- stream_compaction/naive.cu | 41 +++-- stream_compaction/naive.h | 14 +- stream_compaction/thrust.cu | 45 ++--- stream_compaction/thrust.h | 14 +- 16 files changed, 709 insertions(+), 382 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..a83cc174 --- /dev/null +++ b/.clang-format @@ -0,0 +1,306 @@ +# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json +--- +Language: Cpp +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseArrows: false + AlignCaseColons: false +AlignConsecutiveTableGenBreakingDAGArgColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenCondOperatorColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenDefinitionColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignEscapedNewlines: DontAlign +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: Never +AllowShortBlocksOnASingleLine: Never +AllowShortCaseExpressionOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AttributeMacros: + - __capability +BinPackArguments: false +BinPackParameters: OnePerLine +BitFieldColonSpacing: Both +BracedInitializerIndentWidth: 4 +BraceWrapping: + AfterCaseLabel: false + AfterClass: true + AfterControlStatement: Always + AfterEnum: false + AfterExternBlock: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakAfterReturnType: ExceptShortType +BreakArrays: true +BreakBeforeBinaryOperators: All +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Custom +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakFunctionDefinitionParameters: false +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +BreakTemplateDeclarations: Yes +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: Always +ExperimentalAutoDetectBinPacking: true +FixNamespaceComments: true +ForEachMacros: + - forever + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^&- || true) +format: + find src stream_compaction \ + -type f \( -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.cu' -o -iname '*.glsl' \) \ + -exec clang-format -i {} + + + + .PHONY: all Debug MinSizeRel Release RelWithDebugInfo clean diff --git a/run_linux.sh b/run_linux.sh index fc279784..74d129af 100755 --- a/run_linux.sh +++ b/run_linux.sh @@ -11,7 +11,7 @@ while true; do trash build; mkdir build && cd build; - cmake .. -DCMAKE_BUILD_TYPE=Release"; + cmake .. -DCMAKE_BUILD_TYPE=Release; make -j$(nproc --all); cd $CURRENT_DIR; # cd to stored original directory diff --git a/src/main.cpp b/src/main.cpp index 77cce672..9fc093c2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,13 +13,14 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - -int main() { +const int SIZE = 1 << 8; // feel free to change the size of array +const int NPOT = SIZE - 3; // Non-Power-Of-Two +int* a = new int[SIZE]; +int* b = new int[SIZE]; +int* c = new int[SIZE]; + +int main() +{ // Scan tests printf("\n"); @@ -37,21 +38,24 @@ int main() { zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -63,36 +67,41 @@ int main() { zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -113,7 +122,8 @@ int main() { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); @@ -121,7 +131,8 @@ int main() { zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); @@ -129,25 +140,28 @@ int main() { zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + // printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - system("pause"); // stop Win32 console from closing on exit + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; delete[] c; diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 408b1124..507f90b1 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -7,9 +7,12 @@ #include template -int cmpArrays(int n, T *a, T *b) { - for (int i = 0; i < n; i++) { - if (a[i] != b[i]) { +int cmpArrays(int n, T* a, T* b) +{ + for (int i = 0; i < n; i++) + { + if (a[i] != b[i]) + { printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); return 1; } @@ -17,50 +20,63 @@ int cmpArrays(int n, T *a, T *b) { return 0; } -void printDesc(const char *desc) { +void printDesc(const char* desc) +{ printf("==== %s ====\n", desc); } template -void printCmpResult(int n, T *a, T *b) { - printf(" %s \n", - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); +void printCmpResult(int n, T* a, T* b) +{ + printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } template -void printCmpLenResult(int n, int expN, T *a, T *b) { - if (n != expN) { +void printCmpLenResult(int n, int expN, T* a, T* b) +{ + if (n != expN) + { printf(" expected %d elements, got %d\n", expN, n); } printf(" %s \n", - (n == -1 || n != expN) ? "FAIL COUNT" : - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + (n == -1 || n != expN) ? "FAIL COUNT" + : cmpArrays(n, a, b) ? "FAIL VALUE" + : "passed"); } -void zeroArray(int n, int *a) { - for (int i = 0; i < n; i++) { +void zeroArray(int n, int* a) +{ + for (int i = 0; i < n; i++) + { a[i] = 0; } } -void onesArray(int n, int *a) { - for (int i = 0; i < n; i++) { +void onesArray(int n, int* a) +{ + for (int i = 0; i < n; i++) + { a[i] = 1; } } -void genArray(int n, int *a, int maxval) { +void genArray(int n, int* a, int maxval) +{ srand(time(nullptr)); - for (int i = 0; i < n; i++) { + for (int i = 0; i < n; i++) + { a[i] = rand() % maxval; } } -void printArray(int n, int *a, bool abridged = false) { +void printArray(int n, int* a, bool abridged = false) +{ printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { + for (int i = 0; i < n; i++) + { + if (abridged && i + 2 == 15 && n > 16) + { i = n - 2; printf("... "); } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..3b9652bd 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,44 @@ #include "common.h" -void checkCUDAErrorFn(const char *msg, const char *file, int line) { +void checkCUDAErrorFn(const char* msg, const char* file, int line) +{ cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { + if (cudaSuccess == err) + { return; } fprintf(stderr, "CUDA error"); - if (file) { + if (file) + { fprintf(stderr, " (%s:%d)", file, line); } fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } +namespace StreamCompaction +{ +namespace Common +{ -namespace StreamCompaction { - namespace Common { - - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } - - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. - */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBoolean(int n, int* bools, const int* idata) +{ + // TODO +} - } +/** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ +__global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) +{ + // TODO } + +} // namespace Common +} // namespace StreamCompaction diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..f98243ed 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -16,117 +16,134 @@ /** * Check for CUDA errors; print and exit if there was a problem. */ -void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); +void checkCUDAErrorFn(const char* msg, const char* file = NULL, int line = -1); -inline int ilog2(int x) { +inline int ilog2(int x) +{ int lg = 0; - while (x >>= 1) { + while (x >>= 1) + { ++lg; } return lg; } -inline int ilog2ceil(int x) { +inline int ilog2ceil(int x) +{ return x == 1 ? 0 : ilog2(x - 1) + 1; } -namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); +namespace StreamCompaction +{ +namespace Common +{ +__global__ void kernMapToBoolean(int n, int* bools, const int* idata); - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); +__global__ void kernScatter( + int n, int* odata, const int* idata, const int* bools, const int* indices); - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer +/** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ +class PerformanceTimer +{ +public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; - }; + throw std::runtime_error("CPU timer already started"); + } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); } -} + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) + { + throw std::runtime_error("CPU timer not started"); + } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds + = static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) + { + throw std::runtime_error("GPU timer already started"); + } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) + { + throw std::runtime_error("GPU timer not started"); + } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() // noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() // noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + +private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; +}; +} // namespace Common +} // namespace StreamCompaction diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index ed8efb73..8e907cb3 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,48 +2,55 @@ #include "common.h" -namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } +namespace StreamCompaction +{ +namespace CPU +{ +using StreamCompaction::Common::PerformanceTimer; - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan + * in this function first. + */ +void scan(int n, int* odata, const int* idata) +{ + timer().startCpuTimer(); + // TODO + timer().endCpuTimer(); +} - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } +/** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithoutScan(int n, int* odata, const int* idata) +{ + timer().startCpuTimer(); + // TODO + timer().endCpuTimer(); + return -1; +} - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - } +/** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithScan(int n, int* odata, const int* idata) +{ + timer().startCpuTimer(); + // TODO + timer().endCpuTimer(); + return -1; } +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c0476..5055059a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -2,14 +2,16 @@ #include "common.h" -namespace StreamCompaction { - namespace CPU { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace StreamCompaction +{ +namespace CPU +{ +StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); +void scan(int n, int* odata, const int* idata); - int compactWithoutScan(int n, int *odata, const int *idata); +int compactWithoutScan(int n, int* odata, const int* idata); - int compactWithScan(int n, int *odata, const int *idata); - } -} +int compactWithScan(int n, int* odata, const int* idata); +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..9c7fc127 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,38 +3,43 @@ #include "common.h" #include "efficient.h" -namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } +namespace StreamCompaction +{ +namespace Efficient +{ +using StreamCompaction::Common::PerformanceTimer; - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) +{ + timer().startGpuTimer(); + // TODO + timer().endGpuTimer(); +} - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int* odata, const int* idata) +{ + timer().startGpuTimer(); + // TODO + timer().endGpuTimer(); + return -1; } +} // namespace Efficient +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..36513ab7 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,12 +2,14 @@ #include "common.h" -namespace StreamCompaction { - namespace Efficient { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace StreamCompaction +{ +namespace Efficient +{ +StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); +void scan(int n, int* odata, const int* idata); - int compact(int n, int *odata, const int *idata); - } -} +int compact(int n, int* odata, const int* idata); +} // namespace Efficient +} // namespace StreamCompaction diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..ded68955 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,28 @@ #include "common.h" #include "naive.h" -namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ +namespace StreamCompaction +{ +namespace Naive +{ +using StreamCompaction::Common::PerformanceTimer; - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - } +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; } + +// TODO: __global__ + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) +{ + timer().startGpuTimer(); + // TODO + timer().endGpuTimer(); +} +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb064..002f2903 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,10 +2,12 @@ #include "common.h" -namespace StreamCompaction { - namespace Naive { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace StreamCompaction +{ +namespace Naive +{ +StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); - } -} +void scan(int n, int* odata, const int* idata); +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index e2d50cee..30a1c3a1 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,26 +3,31 @@ #include #include #include -#include "common.h" +#include "common.h" #include "thrust.h" -namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } +namespace StreamCompaction +{ +namespace Thrust +{ +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int* odata, const int* idata) +{ + timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); } +} // namespace Thrust +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206b..517b07fa 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,10 +2,12 @@ #include "common.h" -namespace StreamCompaction { - namespace Thrust { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace StreamCompaction +{ +namespace Thrust +{ +StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); - } -} +void scan(int n, int* odata, const int* idata); +} // namespace Thrust +} // namespace StreamCompaction From d782050d577601f7e2a3f60b8cdcc9a1479d99aa Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 16:34:01 -0400 Subject: [PATCH 04/56] feat: finish cpu::scan --- src/main.cpp | 29 ++++++++++++++++++++++++++++- src/testing_helpers.hpp | 8 ++++++++ stream_compaction/cpu.cu | 10 ++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 9fc093c2..1470f0c9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,12 +13,18 @@ #include #include "testing_helpers.hpp" +#define SKIP_UNIMPLEMENTED \ + 1 // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point + const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int* a = new int[SIZE]; int* b = new int[SIZE]; int* c = new int[SIZE]; +int* consecutive = new int[SIZE]; // use to test without randomness +int* consecutiveOut = new int[SIZE]; + int main() { // Scan tests @@ -28,6 +34,18 @@ int main() printf("** SCAN TESTS **\n"); printf("****************\n"); + printDesc("consecutive array (input)"); + genConsecutiveArray(SIZE, consecutive); + printArray(SIZE, consecutive, true); + + zeroArray(SIZE, consecutiveOut); + printDesc("cpu scan, power-of-two, consecutive-valued array"); + StreamCompaction::CPU::scan(SIZE, consecutiveOut, consecutive); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(SIZE, consecutiveOut, true); + + printDesc("a array (input)"); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -42,6 +60,8 @@ int main() "(std::chrono Measured)"); printArray(SIZE, b, true); +#if !SKIP_UNIMPLEMENTED + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -161,8 +181,15 @@ int main() // printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - system("pause"); // stop Win32 console from closing on exit +#endif + +#if defined(_WIN32) || defined(_WIN64) // errors out on linux + system("pause"); // stop Win32 console from closing on exit +#endif + delete[] a; delete[] b; delete[] c; + delete[] consecutive; + delete[] consecutiveOut; } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 507f90b1..37536a6d 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -70,6 +70,14 @@ void genArray(int n, int* a, int maxval) } } +void genConsecutiveArray(int n, int* a) +{ + for (int i = 0; i < n; i++) + { + a[i] = i; + } +} + void printArray(int n, int* a, bool abridged = false) { printf(" [ "); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 8e907cb3..52d327a7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -24,6 +24,16 @@ void scan(int n, int* odata, const int* idata) { timer().startCpuTimer(); // TODO + + odata[0] = 0; // identity is 0 + + int prev_sum = idata[0]; // save prev sum for access ease + for (int j = 1; j < n + 1; j++) + { + odata[j] = prev_sum; + prev_sum += idata[j]; + } + timer().endCpuTimer(); } From 4492c3ee31e6be6104e71aae2959845634b2fc82 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 17:00:26 -0400 Subject: [PATCH 05/56] feat: implement cpu::compactWithoutScan --- run_linux.sh | 7 ++++--- src/main.cpp | 11 +++++++---- src/testing_helpers.hpp | 4 +++- stream_compaction/cpu.cu | 22 +++++++++++++++++----- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/run_linux.sh b/run_linux.sh index 74d129af..c4dc0ef3 100755 --- a/run_linux.sh +++ b/run_linux.sh @@ -9,9 +9,10 @@ while true; do [Yy]* ) cd $SCRIPT_DIR; # cd to where script is located - trash build; - mkdir build && cd build; - cmake .. -DCMAKE_BUILD_TYPE=Release; + # trash build; + # mkdir build; + cd build; + # cmake .. -DCMAKE_BUILD_TYPE=Release; make -j$(nproc --all); cd $CURRENT_DIR; # cd to stored original directory diff --git a/src/main.cpp b/src/main.cpp index 1470f0c9..b4c3a87a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,12 @@ #include #include "testing_helpers.hpp" -#define SKIP_UNIMPLEMENTED \ - 1 // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point +// use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point +#define SKIP_UNIMPLEMENTED 1 const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two + int* a = new int[SIZE]; int* b = new int[SIZE]; int* c = new int[SIZE]; @@ -60,8 +61,6 @@ int main() "(std::chrono Measured)"); printArray(SIZE, b, true); -#if !SKIP_UNIMPLEMENTED - zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -70,6 +69,8 @@ int main() printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#if !SKIP_UNIMPLEMENTED + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -124,6 +125,7 @@ int main() // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#endif printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -157,6 +159,7 @@ int main() printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); +#if !SKIP_UNIMPLEMENTED zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 37536a6d..a85b7e6d 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -90,7 +90,9 @@ void printArray(int n, int* a, bool abridged = false) } printf("%3d ", a[i]); } - printf("]\n"); + printf("] - count: "); + printf("%d\n", n); + } template diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 52d327a7..ae2c9363 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -25,15 +25,15 @@ void scan(int n, int* odata, const int* idata) timer().startCpuTimer(); // TODO - odata[0] = 0; // identity is 0 + odata[0] = 0; // identity is 0 - int prev_sum = idata[0]; // save prev sum for access ease + int prev_sum = idata[0]; // save prev sum for access ease for (int j = 1; j < n + 1; j++) { odata[j] = prev_sum; prev_sum += idata[j]; } - + timer().endCpuTimer(); } @@ -45,9 +45,21 @@ void scan(int n, int* odata, const int* idata) int compactWithoutScan(int n, int* odata, const int* idata) { timer().startCpuTimer(); - // TODO + + int outIndex = 0; // pointer to current progress in out array + + for (int i = 0; i < n; i++) + { + int inVal = idata[i]; + if (inVal != 0) + { + odata[outIndex] = inVal; + outIndex++; + } + } + timer().endCpuTimer(); - return -1; + return outIndex; } /** From 927bd438c3a0ed0979c9c198f3dfe913fae28152 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 20:16:26 -0400 Subject: [PATCH 06/56] feat: implement cpu::compactWithScan --- src/main.cpp | 11 ++++++++++- stream_compaction/common.h | 6 +++--- stream_compaction/cpu.cu | 40 +++++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b4c3a87a..cc47ff13 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -159,7 +159,6 @@ int main() printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -#if !SKIP_UNIMPLEMENTED zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); @@ -168,6 +167,16 @@ int main() printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + +#if !SKIP_UNIMPLEMENTED + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index f98243ed..feca8ba2 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -51,6 +51,9 @@ __global__ void kernScatter( class PerformanceTimer { public: + bool cpu_timer_started = false; + bool gpu_timer_started = false; + PerformanceTimer() { cudaEventCreate(&event_start); @@ -139,9 +142,6 @@ class PerformanceTimer time_point_t time_start_cpu; time_point_t time_end_cpu; - bool cpu_timer_started = false; - bool gpu_timer_started = false; - float prev_elapsed_time_cpu_milliseconds = 0.f; float prev_elapsed_time_gpu_milliseconds = 0.f; }; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index ae2c9363..baded8c1 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -22,19 +22,26 @@ PerformanceTimer& timer() */ void scan(int n, int* odata, const int* idata) { - timer().startCpuTimer(); - // TODO + bool usingTimer = false; + if (!timer().cpu_timer_started) // added in order to call `scan` from other functions. + { + timer().startCpuTimer(); + usingTimer = true; + } odata[0] = 0; // identity is 0 int prev_sum = idata[0]; // save prev sum for access ease - for (int j = 1; j < n + 1; j++) + for (int j = 1; j < n; j++) { odata[j] = prev_sum; prev_sum += idata[j]; } - timer().endCpuTimer(); + if (usingTimer) + { + timer().endCpuTimer(); + } } /** @@ -46,7 +53,7 @@ int compactWithoutScan(int n, int* odata, const int* idata) { timer().startCpuTimer(); - int outIndex = 0; // pointer to current progress in out array + int outIndex = 0; // pointer to current progress in out array for (int i = 0; i < n; i++) { @@ -70,9 +77,28 @@ int compactWithoutScan(int n, int* odata, const int* idata) int compactWithScan(int n, int* odata, const int* idata) { timer().startCpuTimer(); - // TODO + + int* isNotZero = new int[n]; + int* scan_isNotZero = new int[n]; + + for (int i = 0; i < n; i++) + { + isNotZero[i] = idata[i] != 0 ? 1 : 0; // val is 1 at i if idata[i] != 0, else 0 + } + + scan(n, scan_isNotZero, isNotZero); // scan result is index in final array + + for (int i = 0; i < n; i++) + { + if (isNotZero[i]) + { + odata[scan_isNotZero[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + return scan_isNotZero[n-1] + isNotZero[n-1]; // due to exclusive scan } } // namespace CPU } // namespace StreamCompaction From 8940b2ecc5b0e45d845671cba1083191932ac255 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 20:33:13 -0400 Subject: [PATCH 07/56] fix: solve all clangd warnings --- src/testing_helpers.hpp | 20 +++++++++----------- stream_compaction/common.h | 4 +--- stream_compaction/cpu.cu | 4 ++-- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index a85b7e6d..a900d67b 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -3,7 +3,6 @@ #include #include #include -#include #include template @@ -20,19 +19,19 @@ int cmpArrays(int n, T* a, T* b) return 0; } -void printDesc(const char* desc) +inline void printDesc(const char* desc) { printf("==== %s ====\n", desc); } template -void printCmpResult(int n, T* a, T* b) +inline void printCmpResult(int n, T* a, T* b) { printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } template -void printCmpLenResult(int n, int expN, T* a, T* b) +inline void printCmpLenResult(int n, int expN, T* a, T* b) { if (n != expN) { @@ -44,7 +43,7 @@ void printCmpLenResult(int n, int expN, T* a, T* b) : "passed"); } -void zeroArray(int n, int* a) +inline void zeroArray(int n, int* a) { for (int i = 0; i < n; i++) { @@ -52,7 +51,7 @@ void zeroArray(int n, int* a) } } -void onesArray(int n, int* a) +inline void onesArray(int n, int* a) { for (int i = 0; i < n; i++) { @@ -60,7 +59,7 @@ void onesArray(int n, int* a) } } -void genArray(int n, int* a, int maxval) +inline void genArray(int n, int* a, int maxval) { srand(time(nullptr)); @@ -70,7 +69,7 @@ void genArray(int n, int* a, int maxval) } } -void genConsecutiveArray(int n, int* a) +inline void genConsecutiveArray(int n, int* a) { for (int i = 0; i < n; i++) { @@ -78,7 +77,7 @@ void genConsecutiveArray(int n, int* a) } } -void printArray(int n, int* a, bool abridged = false) +inline void printArray(int n, int* a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) @@ -92,11 +91,10 @@ void printArray(int n, int* a, bool abridged = false) } printf("] - count: "); printf("%d\n", n); - } template -void printElapsedTime(T time, const char* note = "") +inline void printElapsedTime(T time, const char* note = "") { std::cout << " elapsed time: " << time << "ms " << note << std::endl; } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index feca8ba2..c4cc47ed 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -5,8 +5,6 @@ #include #include -#include -#include #include #include @@ -53,7 +51,7 @@ class PerformanceTimer public: bool cpu_timer_started = false; bool gpu_timer_started = false; - + PerformanceTimer() { cudaEventCreate(&event_start); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index baded8c1..28a60d34 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -23,7 +23,7 @@ PerformanceTimer& timer() void scan(int n, int* odata, const int* idata) { bool usingTimer = false; - if (!timer().cpu_timer_started) // added in order to call `scan` from other functions. + if (!timer().cpu_timer_started) // added in order to call `scan` from other functions. { timer().startCpuTimer(); usingTimer = true; @@ -98,7 +98,7 @@ int compactWithScan(int n, int* odata, const int* idata) timer().endCpuTimer(); - return scan_isNotZero[n-1] + isNotZero[n-1]; // due to exclusive scan + return scan_isNotZero[n - 1] + isNotZero[n - 1]; // due to exclusive scan } } // namespace CPU } // namespace StreamCompaction From c9c9c5dff9177f4fe61518a85ff80c8d46e48d3a Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 23:18:58 -0400 Subject: [PATCH 08/56] feat: implement naive::scan first draft --- src/main.cpp | 16 +++++----- src/testing_helpers.hpp | 4 +-- stream_compaction/CMakeLists.txt | 6 +++- stream_compaction/common.cu | 29 +++++++++++++++++ stream_compaction/common.h | 11 +++++++ stream_compaction/naive.cu | 53 ++++++++++++++++++++++++++++++-- 6 files changed, 106 insertions(+), 13 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index cc47ff13..ca0a5c0c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -69,30 +69,30 @@ int main() printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#if !SKIP_UNIMPLEMENTED - zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); +#if !SKIP_UNIMPLEMENTED + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -126,6 +126,8 @@ int main() printCmpResult(NPOT, b, c); #endif + +#if !SKIP_UNIMPLEMENTED printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -175,8 +177,6 @@ int main() printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -#if !SKIP_UNIMPLEMENTED - zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index a900d67b..c559e254 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -21,13 +21,13 @@ int cmpArrays(int n, T* a, T* b) inline void printDesc(const char* desc) { - printf("==== %s ====\n", desc); + printf("\033[1;35m==== %s ====\033[0m\n", desc); // make pink } template inline void printCmpResult(int n, T* a, T* b) { - printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + printf(" %s \033[0m\n", cmpArrays(n, a, b) ? "\033[1;31mFAIL VALUE" : "\033[1;32mpassed"); } template diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index c4c8c0df..1618686b 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -27,4 +27,8 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0") set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) else() set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) -endif() \ No newline at end of file +endif() + +target_compile_options(stream_compaction PRIVATE "$<$,$>:-G;-src-in-ptx>") + +target_compile_options(stream_compaction PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") \ No newline at end of file diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 3b9652bd..7137c8b9 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -40,5 +40,34 @@ __global__ void kernScatter(int n, int* odata, const int* idata, const int* bool // TODO } +__global__ void kernel_inclusiveToExclusive(int n, int identity, const int* iData, int* oData) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } else if (index == 0) + { + oData[index] = identity; + } else + { + oData[index] = iData[index - 1]; + } +} + +__global__ void kernel_copyData(int n, const int* iData, int* oData) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } else + { + oData[index] = iData[index]; + } +} + } // namespace Common } // namespace StreamCompaction diff --git a/stream_compaction/common.h b/stream_compaction/common.h index c4cc47ed..3dd26575 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,11 +11,18 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define BLOCK_SIZE 32 + /** * Check for CUDA errors; print and exit if there was a problem. */ void checkCUDAErrorFn(const char* msg, const char* file = NULL, int line = -1); +inline unsigned divup(unsigned size, unsigned div) +{ + return (size + div - 1) / div; +} + inline int ilog2(int x) { int lg = 0; @@ -40,6 +47,10 @@ __global__ void kernMapToBoolean(int n, int* bools, const int* idata); __global__ void kernScatter( int n, int* odata, const int* idata, const int* bools, const int* indices); +__global__ void kernel_inclusiveToExclusive(int n, int identity, const int* iData, int* oData); + +__global__ void kernel_copyData(int n, const int* iData, int* oData); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index ded68955..e861c71f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -15,16 +15,65 @@ PerformanceTimer& timer() return timer; } -// TODO: __global__ +// scanA is input and scanB is output for this iteration +__global__ void kernel_performNaiveScanIteration(const int n, + const int iter, + const int* scanA, + int* scanB) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + int iter_startIdx = exp2f(iter - 1); + if (index < iter_startIdx || index >= n) + { + return; + } + + scanB[index] = scanA[index - iter_startIdx] + scanA[index]; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int* odata, const int* idata) { + // create two device arrays to ping-pong between + int* dev_scanA; + int* dev_scanB; + + cudaMalloc((void**)&dev_scanA, sizeof(int) * n); + checkCUDAError("CUDA malloc for scan array A failed."); + + cudaMalloc((void**)&dev_scanB, sizeof(int) * n); + checkCUDAError("CUDA malloc for scan array B failed."); + + cudaMemcpy(dev_scanA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from input data to scan array A failed."); + cudaMemcpy(dev_scanB, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from output data to scan array B failed."); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int blocks = divup(n, BLOCK_SIZE); + + for (int i = 1; i <= ilog2ceil(n); i++) + { + kernel_performNaiveScanIteration<<>>(n, i, dev_scanA, dev_scanB); + checkCUDAError("Perform Naive Scan Iteration CUDA kernel failed."); + + Common::kernel_copyData<<>>(n, dev_scanB, dev_scanA); + } + + Common::kernel_inclusiveToExclusive<<>>(n, 0, dev_scanA, dev_scanB); + timer().endGpuTimer(); + + cudaMemcpy(odata, + dev_scanB, + sizeof(int) * n, + cudaMemcpyDeviceToHost); // result ends up in scanB } } // namespace Naive } // namespace StreamCompaction From 74241f894a05be42b23bbb9c342d2e9318c3bbd1 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Fri, 12 Sep 2025 23:27:11 -0400 Subject: [PATCH 09/56] fix: add checkCUDAError calls --- stream_compaction/naive.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index e861c71f..acb9dc83 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -64,9 +64,11 @@ void scan(int n, int* odata, const int* idata) checkCUDAError("Perform Naive Scan Iteration CUDA kernel failed."); Common::kernel_copyData<<>>(n, dev_scanB, dev_scanA); + checkCUDAError("Copy Data CUDA kernel failed."); } Common::kernel_inclusiveToExclusive<<>>(n, 0, dev_scanA, dev_scanB); + checkCUDAError("Inclusive to Exclusive CUDA kernel failed."); timer().endGpuTimer(); From 18feda5a61395ea5b33c3282e82bad496e8388d6 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Sat, 13 Sep 2025 16:26:04 -0400 Subject: [PATCH 10/56] ci: supress warnings from clangd for thrust external libraries --- .clangd | 9 ++++++++- .vscode/settings.json | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.clangd b/.clangd index 82404718..b0e6e082 100644 --- a/.clangd +++ b/.clangd @@ -29,5 +29,12 @@ If: CompileFlags: Add: - -xcuda - - -I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/ - -O3 + - -use_fast_math +--- +If: + PathMatch: stream_compaction/thrust\.cu +Diagnostics: + Suppress: # supress errors from thrust base code + - no_member + - typename_nested_not_found diff --git a/.vscode/settings.json b/.vscode/settings.json index 225c79d4..94952eea 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,7 +6,8 @@ "--background-index", "--compile-commands-dir=${workspaceFolder}/build", "--query-driver=/usr/bin/g++", - "--query-driver=/usr/bin/clang++" + "--query-driver=/usr/bin/clang++", + "--query-driver=/usr/local/cuda/bin/nvcc" ], "[cpp][cuda-cpp][glsl]": { "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd" From c62357514084a86fa38fb0aff599496da48f32f8 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Sat, 13 Sep 2025 16:28:20 -0400 Subject: [PATCH 11/56] feat: implement thrust scan --- src/main.cpp | 7 +++---- src/testing_helpers.hpp | 2 +- stream_compaction/thrust.cu | 14 +++++++++++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index ca0a5c0c..f7be25da 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -108,13 +108,14 @@ int main() "(CUDA Measured)"); // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#endif zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -122,11 +123,9 @@ int main() StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#endif - #if !SKIP_UNIMPLEMENTED printf("\n"); printf("*****************************\n"); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index c559e254..35a4bc0b 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -21,7 +21,7 @@ int cmpArrays(int n, T* a, T* b) inline void printDesc(const char* desc) { - printf("\033[1;35m==== %s ====\033[0m\n", desc); // make pink + printf("\033[1;35m==== %s ====\033[0m\n", desc); // make pink } template diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 30a1c3a1..7d9fec08 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -23,11 +23,19 @@ PerformanceTimer& timer() */ void scan(int n, int* odata, const int* idata) { + // Copy data from host to device + thrust::host_vector host_idata(idata, idata + n); // thrust hst vector + thrust::device_vector dev_idata = host_idata; // built-in assignment conversion + thrust::device_vector dev_odata(n); // for output + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + + // copy result back to host + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } // namespace Thrust } // namespace StreamCompaction From d1d297bb01007c6982b2e4f86e477ef243915ea1 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 17:56:08 -0400 Subject: [PATCH 12/56] refactor: naive impl does not require copying entire array --- stream_compaction/naive.cu | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index acb9dc83..85d4b98f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -24,12 +24,16 @@ __global__ void kernel_performNaiveScanIteration(const int n, int index = blockIdx.x * blockDim.x + threadIdx.x; int iter_startIdx = exp2f(iter - 1); - if (index < iter_startIdx || index >= n) + if (index >= n) { return; + } else if (index < iter_startIdx) + { + scanB[index] = scanA[index]; + return; + } else { + scanB[index] = scanA[index - iter_startIdx] + scanA[index]; } - - scanB[index] = scanA[index - iter_startIdx] + scanA[index]; } /** @@ -63,10 +67,13 @@ void scan(int n, int* odata, const int* idata) kernel_performNaiveScanIteration<<>>(n, i, dev_scanA, dev_scanB); checkCUDAError("Perform Naive Scan Iteration CUDA kernel failed."); - Common::kernel_copyData<<>>(n, dev_scanB, dev_scanA); - checkCUDAError("Copy Data CUDA kernel failed."); + // ping-pong + int* temp = dev_scanA; + dev_scanA = dev_scanB; + dev_scanB = temp; } + // result ends up in dev_scanA Common::kernel_inclusiveToExclusive<<>>(n, 0, dev_scanA, dev_scanB); checkCUDAError("Inclusive to Exclusive CUDA kernel failed."); From 30cc13b27c28c9570fda5406ee4e73b6a95609e8 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 20:30:09 -0400 Subject: [PATCH 13/56] feat: complete work efficient scan implementation --- src/main.cpp | 13 ++++-- stream_compaction/CMakeLists.txt | 4 +- stream_compaction/efficient.cu | 76 +++++++++++++++++++++++++++++++- stream_compaction/naive.cu | 15 +++++-- stream_compaction/thrust.cu | 4 +- 5 files changed, 102 insertions(+), 10 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index f7be25da..8461e44a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -91,14 +91,20 @@ int main() printArray(SIZE, c, true); printCmpResult(NPOT, b, c); -#if !SKIP_UNIMPLEMENTED + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two, consecutive-valued array"); + StreamCompaction::Efficient::scan(SIZE, c, consecutive); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, consecutiveOut, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -106,9 +112,8 @@ int main() StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#endif zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 1618686b..fb52b1ae 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -31,4 +31,6 @@ endif() target_compile_options(stream_compaction PRIVATE "$<$,$>:-G;-src-in-ptx>") -target_compile_options(stream_compaction PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") \ No newline at end of file +target_compile_options(stream_compaction PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") + +set(CMAKE_CUDA_FLAGS -I/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/) \ No newline at end of file diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 9c7fc127..fe23ddf2 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,14 +15,88 @@ PerformanceTimer& timer() return timer; } +__global__ void kernel_performEfficientScanUpSweepIteration(const int n, const int iter, int* scan) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } + int iterFactor = exp2f(iter); + + int iterTarget = exp2f(iter + 1); + if (index % iterTarget == 0) + { + scan[index + iterTarget - 1] += scan[index + iterFactor - 1]; + } +} + +__global__ void kernel_performEfficientScanDownSweepIteration(const int n, const int iter, int* scan) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } + + int iterFactor = exp2f(iter); + int iterTarget = exp2f(iter + 1); + + if (index % iterTarget == 0) + { + int leftChild = scan[index + iterFactor - 1]; + scan[index + iterFactor - 1] = scan[index + iterTarget - 1]; + + scan[index + iterTarget - 1] += leftChild; + } +} + +__global__ void kernel_setFirstZero(const int n, int* scan) +{ + scan[(int)exp2f(n)-1] = 0; // round up to nearest power of two +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int* odata, const int* idata) { + // create two device arrays + int* dev_scan; + + cudaMalloc((void**)&dev_scan, sizeof(int) * n); + checkCUDAError("CUDA malloc for scan array failed."); + + cudaMemcpy(dev_scan, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from input data to scan array failed."); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int blocks = divup(n, BLOCK_SIZE); + + for (int i = 0; i <= ilog2ceil(n) - 1; i++) + { + kernel_performEfficientScanUpSweepIteration<<>>(n, i, dev_scan); + checkCUDAError("Perform Work-Efficient Scan Up Sweep Iteration CUDA kernel failed."); + } + + kernel_setFirstZero<<<1, 1>>>(ilog2ceil(n), dev_scan); + + for (int i = ilog2ceil(n)-1; i >= 0; i--) + { + kernel_performEfficientScanDownSweepIteration<<>>(n, i, dev_scan); + checkCUDAError("Perform Work-Efficient Scan Down Sweep Iteration CUDA kernel failed."); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_scan, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_scan); // can't forget memory leaks! } /** diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 85d4b98f..6f095a40 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -27,13 +27,19 @@ __global__ void kernel_performNaiveScanIteration(const int n, if (index >= n) { return; - } else if (index < iter_startIdx) + } + + if (index < iter_startIdx) { scanB[index] = scanA[index]; - return; - } else { + } else + { scanB[index] = scanA[index - iter_startIdx] + scanA[index]; } + + // profile time efficiency + // scanB[index] = index < iter_startIdx ? scanA[index] + // : scanA[index - iter_startIdx] + scanA[index]; } /** @@ -83,6 +89,9 @@ void scan(int n, int* odata, const int* idata) dev_scanB, sizeof(int) * n, cudaMemcpyDeviceToHost); // result ends up in scanB + + cudaFree(dev_scanA); + cudaFree(dev_scanB); // can't forget memory leaks! } } // namespace Naive } // namespace StreamCompaction diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 7d9fec08..d25fd3de 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,6 +6,8 @@ #include "common.h" #include "thrust.h" +#include + namespace StreamCompaction { namespace Thrust @@ -24,7 +26,7 @@ PerformanceTimer& timer() void scan(int n, int* odata, const int* idata) { // Copy data from host to device - thrust::host_vector host_idata(idata, idata + n); // thrust hst vector + thrust::host_vector host_idata(idata, idata + n); // thrust host vector thrust::device_vector dev_idata = host_idata; // built-in assignment conversion thrust::device_vector dev_odata(n); // for output From 06da4f5dadc8040154b1d38781fcca8cb6873966 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 21:16:55 -0400 Subject: [PATCH 14/56] feat: implement work efficient compact scan --- src/main.cpp | 9 ++--- src/testing_helpers.hpp | 8 ++-- stream_compaction/common.cu | 21 +++++++++- stream_compaction/efficient.cu | 71 +++++++++++++++++++++++++++++++--- stream_compaction/naive.cu | 2 +- stream_compaction/thrust.cu | 2 - 6 files changed, 92 insertions(+), 21 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 8461e44a..aca5b1cf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point #define SKIP_UNIMPLEMENTED 1 -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 28; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int* a = new int[SIZE]; @@ -131,7 +131,6 @@ int main() printArray(NPOT, c, true); printCmpResult(NPOT, b, c); -#if !SKIP_UNIMPLEMENTED printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -186,7 +185,7 @@ int main() count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); @@ -194,11 +193,9 @@ int main() count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -#endif - #if defined(_WIN32) || defined(_WIN64) // errors out on linux system("pause"); // stop Win32 console from closing on exit #endif diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 35a4bc0b..9e070126 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -37,10 +37,10 @@ inline void printCmpLenResult(int n, int expN, T* a, T* b) { printf(" expected %d elements, got %d\n", expN, n); } - printf(" %s \n", - (n == -1 || n != expN) ? "FAIL COUNT" - : cmpArrays(n, a, b) ? "FAIL VALUE" - : "passed"); + printf(" %s \033[0m\n", + (n == -1 || n != expN) ? "\033[1;31mFAIL COUNT" + : cmpArrays(n, a, b) ? "\033[1;31mFAIL VALUE" + : "\033[1;32mpassed"); } inline void zeroArray(int n, int* a) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 7137c8b9..16a758aa 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -28,7 +28,14 @@ namespace Common */ __global__ void kernMapToBoolean(int n, int* bools, const int* idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } + + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -37,7 +44,17 @@ __global__ void kernMapToBoolean(int n, int* bools, const int* idata) */ __global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) + { + return; + } + + if (bools[index] == 1) + { + odata[indices[index]] = idata[index]; + } } __global__ void kernel_inclusiveToExclusive(int n, int identity, const int* iData, int* oData) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index fe23ddf2..180d9c19 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -55,7 +55,7 @@ __global__ void kernel_performEfficientScanDownSweepIteration(const int n, const __global__ void kernel_setFirstZero(const int n, int* scan) { - scan[(int)exp2f(n)-1] = 0; // round up to nearest power of two + scan[(int)exp2f(n) - 1] = 0; // round up to nearest power of two } /** @@ -74,7 +74,12 @@ void scan(int n, int* odata, const int* idata) cudaDeviceSynchronize(); - timer().startGpuTimer(); + bool usingTimer = false; + if (!timer().gpu_timer_started) // added in order to call `scan` from other functions. + { + timer().startGpuTimer(); + usingTimer = true; + } int blocks = divup(n, BLOCK_SIZE); @@ -86,13 +91,16 @@ void scan(int n, int* odata, const int* idata) kernel_setFirstZero<<<1, 1>>>(ilog2ceil(n), dev_scan); - for (int i = ilog2ceil(n)-1; i >= 0; i--) + for (int i = ilog2ceil(n) - 1; i >= 0; i--) { kernel_performEfficientScanDownSweepIteration<<>>(n, i, dev_scan); checkCUDAError("Perform Work-Efficient Scan Down Sweep Iteration CUDA kernel failed."); } - timer().endGpuTimer(); + if (usingTimer) + { + timer().endGpuTimer(); + } cudaMemcpy(odata, dev_scan, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -110,10 +118,61 @@ void scan(int n, int* odata, const int* idata) */ int compact(int n, int* odata, const int* idata) { + // create device arrays + int* dev_idata; + int* dev_odata; + + int* dev_bools; + int* dev_indices; + + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + checkCUDAError("CUDA malloc for idata array failed."); + + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + checkCUDAError("CUDA malloc for odata array failed."); + + cudaMalloc((void**)&dev_bools, sizeof(int) * n); + checkCUDAError("CUDA malloc for bools array failed."); + + cudaMalloc((void**)&dev_indices, sizeof(int) * n); + checkCUDAError("CUDA malloc for indices array failed."); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from input data to idata array failed."); + cudaMemcpy(dev_bools, odata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from output data to odata array failed."); + + cudaDeviceSynchronize(); + + int* indices = new int[n]; // create cpu side indices array + int* bools = new int[n]; + timer().startGpuTimer(); - // TODO + + int blocks = divup(n, BLOCK_SIZE); + + // reuse dev_idata for bools + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + + cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost); + + scan(n, indices, bools); + + cudaMemcpy(dev_indices, indices, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from indices to device indices array failed."); + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return indices[n-1] + bools[n-1]; } } // namespace Efficient } // namespace StreamCompaction diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 6f095a40..b6a078de 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -59,7 +59,7 @@ void scan(int n, int* odata, const int* idata) cudaMemcpy(dev_scanA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAError("Memory copy from input data to scan array A failed."); - cudaMemcpy(dev_scanB, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_scanB, odata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAError("Memory copy from output data to scan array B failed."); cudaDeviceSynchronize(); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d25fd3de..15abc779 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,8 +6,6 @@ #include "common.h" #include "thrust.h" -#include - namespace StreamCompaction { namespace Thrust From 31b1405da97aa2425020ac87b2f7cfe6ac787100 Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 22:19:43 -0400 Subject: [PATCH 15/56] refactor: naming and lint --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index aca5b1cf..110c5fdc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point #define SKIP_UNIMPLEMENTED 1 -const int SIZE = 1 << 28; // feel free to change the size of array +const int SIZE = 1 << 3; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int* a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 180d9c19..0b73f9be 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,7 +15,7 @@ PerformanceTimer& timer() return timer; } -__global__ void kernel_performEfficientScanUpSweepIteration(const int n, const int iter, int* scan) +__global__ void kernel_efficientUpSweep(const int n, const int iter, int* scan) { int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -32,7 +32,7 @@ __global__ void kernel_performEfficientScanUpSweepIteration(const int n, const i } } -__global__ void kernel_performEfficientScanDownSweepIteration(const int n, const int iter, int* scan) +__global__ void kernel_efficientDownSweep(const int n, const int iter, int* scan) { int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -85,7 +85,7 @@ void scan(int n, int* odata, const int* idata) for (int i = 0; i <= ilog2ceil(n) - 1; i++) { - kernel_performEfficientScanUpSweepIteration<<>>(n, i, dev_scan); + kernel_efficientUpSweep<<>>(n, i, dev_scan); checkCUDAError("Perform Work-Efficient Scan Up Sweep Iteration CUDA kernel failed."); } @@ -93,7 +93,7 @@ void scan(int n, int* odata, const int* idata) for (int i = ilog2ceil(n) - 1; i >= 0; i--) { - kernel_performEfficientScanDownSweepIteration<<>>(n, i, dev_scan); + kernel_efficientDownSweep<<>>(n, i, dev_scan); checkCUDAError("Perform Work-Efficient Scan Down Sweep Iteration CUDA kernel failed."); } @@ -155,6 +155,7 @@ int compact(int n, int* odata, const int* idata) Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memory copy from device bools to indices array failed."); scan(n, indices, bools); From 981693be17ccf2f99916bd8a5cfd6a1975fa829b Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 23:04:09 -0400 Subject: [PATCH 16/56] feat: remove modulo and impl efficient stride length --- src/main.cpp | 2 +- stream_compaction/common.cu | 12 ++------ stream_compaction/common.h | 2 +- stream_compaction/efficient.cu | 54 +++++++++++++++------------------- 4 files changed, 27 insertions(+), 43 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 110c5fdc..2478488e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point #define SKIP_UNIMPLEMENTED 1 -const int SIZE = 1 << 3; // feel free to change the size of array +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int* a = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 16a758aa..4adb99e2 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -73,17 +73,9 @@ __global__ void kernel_inclusiveToExclusive(int n, int identity, const int* iDat } } -__global__ void kernel_copyData(int n, const int* iData, int* oData) +__global__ void kernel_setDeviceArrayValue(int* arr, const int index, const int value) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= n) - { - return; - } else - { - oData[index] = iData[index]; - } + arr[index] = value; // round up to nearest power of two } } // namespace Common diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 3dd26575..a6b75ee2 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -49,7 +49,7 @@ __global__ void kernScatter( __global__ void kernel_inclusiveToExclusive(int n, int identity, const int* iData, int* oData); -__global__ void kernel_copyData(int n, const int* iData, int* oData); +__global__ void kernel_setDeviceArrayValue(int* arr, const int index, const int value); /** * This class is used for timing the performance diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 0b73f9be..bfb1f6c5 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -17,16 +17,13 @@ PerformanceTimer& timer() __global__ void kernel_efficientUpSweep(const int n, const int iter, int* scan) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + int iterTarget = 1 << (iter + 1); + int iterFactor = 1 << iter; - if (index >= n) - { - return; - } - int iterFactor = exp2f(iter); + int index = blockIdx.x * blockDim.x + threadIdx.x; + index *= iterTarget; - int iterTarget = exp2f(iter + 1); - if (index % iterTarget == 0) + if (index + iterTarget - 1 < n) { scan[index + iterTarget - 1] += scan[index + iterFactor - 1]; } @@ -34,42 +31,35 @@ __global__ void kernel_efficientUpSweep(const int n, const int iter, int* scan) __global__ void kernel_efficientDownSweep(const int n, const int iter, int* scan) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= n) - { - return; - } + int iterTarget = 1 << (iter + 1); + int iterFactor = 1 << iter; - int iterFactor = exp2f(iter); - int iterTarget = exp2f(iter + 1); + int index = blockIdx.x * blockDim.x + threadIdx.x; + index = index * iterTarget; - if (index % iterTarget == 0) + if (index + iterTarget - 1 < n) { int leftChild = scan[index + iterFactor - 1]; scan[index + iterFactor - 1] = scan[index + iterTarget - 1]; - scan[index + iterTarget - 1] += leftChild; } } -__global__ void kernel_setFirstZero(const int n, int* scan) -{ - scan[(int)exp2f(n) - 1] = 0; // round up to nearest power of two -} - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int* odata, const int* idata) { + int numLayers = ilog2ceil(n); + int paddedN = 1 << ilog2ceil(n); + // create two device arrays int* dev_scan; - cudaMalloc((void**)&dev_scan, sizeof(int) * n); + cudaMalloc((void**)&dev_scan, sizeof(int) * paddedN); checkCUDAError("CUDA malloc for scan array failed."); - cudaMemcpy(dev_scan, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_scan, idata, sizeof(int) * paddedN, cudaMemcpyHostToDevice); checkCUDAError("Memory copy from input data to scan array failed."); cudaDeviceSynchronize(); @@ -81,19 +71,21 @@ void scan(int n, int* odata, const int* idata) usingTimer = true; } - int blocks = divup(n, BLOCK_SIZE); + int blocks; - for (int i = 0; i <= ilog2ceil(n) - 1; i++) + for (int i = 0; i <= numLayers - 1; i++) { - kernel_efficientUpSweep<<>>(n, i, dev_scan); + blocks = divup(paddedN / (1 << (i + 1)), BLOCK_SIZE); + kernel_efficientUpSweep<<>>(paddedN, i, dev_scan); checkCUDAError("Perform Work-Efficient Scan Up Sweep Iteration CUDA kernel failed."); } - kernel_setFirstZero<<<1, 1>>>(ilog2ceil(n), dev_scan); + Common::kernel_setDeviceArrayValue<<<1, 1>>>(dev_scan, paddedN - 1, 0); - for (int i = ilog2ceil(n) - 1; i >= 0; i--) + for (int i = numLayers - 1; i >= 0; i--) { - kernel_efficientDownSweep<<>>(n, i, dev_scan); + blocks = divup(paddedN / (1 << (i + 1)), BLOCK_SIZE); + kernel_efficientDownSweep<<>>(paddedN, i, dev_scan); checkCUDAError("Perform Work-Efficient Scan Down Sweep Iteration CUDA kernel failed."); } From f09634e28ad53ea1ef2ffdfdaf7c407d7a079a6a Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 23:49:04 -0400 Subject: [PATCH 17/56] feat: complete first bout of data collection --- README.md | 56 ++++++++++++++++++++++++++++++++------ src/main.cpp | 2 +- stream_compaction/common.h | 2 +- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0e38ddb1..69c7ab2d 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,54 @@ -CUDA Stream Compaction +CUDA Parallel Scan Algorithm and Stream Compaction Implementation ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2 -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Amy Liu + - [Personal Website](https://amyliu.dev), [LinkedIn](https://linkedin.com/in/miyalana), [Github](https://github.com/mialana). +- Tested on: Fedora 42 KDE Plasma, Wayland Protocol, Optimus GPU (Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB) -### (TODO: Your README) +Array size: 262144 +Block Size: -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +8 +0.318709 +1.22598 +0.2816 +2.09114 + +16 +0.339354 +0.790464 +0.1536 +2.05978 + +32 +0.329103ms +0.580896ms +0.139808ms +1.20006ms + +128 +0.342237 +2.14592 +0.26032 +2.06634 + +256 +0.331082 +0.4792 +0.246752 +2.13478 + +512 +0.321641 +2.85651 +0.239296 +2.04694 + +1024 +0.335945 +0.504672 +0.26576 +2.13779 diff --git a/src/main.cpp b/src/main.cpp index 2478488e..3b852949 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ // use during development with `#if !SKIP_UNIMPLEMENTED` preprocessor at desired skip point #define SKIP_UNIMPLEMENTED 1 -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 18; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int* a = new int[SIZE]; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index a6b75ee2..a17d4407 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,7 +11,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) -#define BLOCK_SIZE 32 +#define BLOCK_SIZE 8 /** * Check for CUDA errors; print and exit if there was a problem. From 0b91a31713381579d9fbf823883fdb975584056c Mon Sep 17 00:00:00 2001 From: Amy Liu Date: Tue, 16 Sep 2025 23:55:37 -0400 Subject: [PATCH 18/56] feat: finish peformance analysis and README --- .gitignore | 2 + README.md | 308 ++++++++++++++++++++++++++----- img/compaction_performance.png | Bin 0 -> 323296 bytes img/scan_blocksize.png | Bin 0 -> 151150 bytes img/scan_non_power2.png | Bin 0 -> 320216 bytes img/scan_performance.png | Bin 0 -> 328585 bytes img/test_output.png | Bin 0 -> 145858 bytes matplotlib/graph_13.py | 73 ++++++++ matplotlib/graph_24.py | 82 ++++++++ run_linux.sh | 6 +- src/main.cpp | 2 +- stream_compaction/CMakeLists.txt | 2 +- stream_compaction/common.h | 2 +- 13 files changed, 427 insertions(+), 50 deletions(-) create mode 100644 img/compaction_performance.png create mode 100644 img/scan_blocksize.png create mode 100644 img/scan_non_power2.png create mode 100644 img/scan_performance.png create mode 100644 img/test_output.png create mode 100644 matplotlib/graph_13.py create mode 100644 matplotlib/graph_24.py diff --git a/.gitignore b/.gitignore index 7d50060c..a67ea05f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ cis565_getting_started_generated_kernel* *.xcodeproj build +.obsidian + .cache/** log.txt diff --git a/README.md b/README.md index 69c7ab2d..c89cad8d 100644 --- a/README.md +++ b/README.md @@ -7,48 +7,268 @@ University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project - [Personal Website](https://amyliu.dev), [LinkedIn](https://linkedin.com/in/miyalana), [Github](https://github.com/mialana). - Tested on: Fedora 42 KDE Plasma, Wayland Protocol, Optimus GPU (Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB) -Array size: 262144 -Block Size: - -8 -0.318709 -1.22598 -0.2816 -2.09114 - -16 -0.339354 -0.790464 -0.1536 -2.05978 - -32 -0.329103ms -0.580896ms -0.139808ms -1.20006ms - -128 -0.342237 -2.14592 -0.26032 -2.06634 - -256 -0.331082 -0.4792 -0.246752 -2.13478 - -512 -0.321641 -2.85651 -0.239296 -2.04694 - -1024 -0.335945 -0.504672 -0.26576 -2.13779 +## Project Description +This project implements **stream compaction** in CUDA, which is used to remove `0`s from an array of integers. In a path tracer, this translates to removing terminated paths from the active ray pool, making it an important performance optimization. + +Stream compaction relies on the **prefix sum (scan)** operation. In this project, I implemented: + +- A **CPU baseline implementation** of scan and compaction. +- A **Naive GPU scan** using repeated passes and global memory. +- A **Work-Efficient GPU scan** using an up-sweep/down-sweep balanced tree method. +- A **GPU stream compaction** method built on top of the work-efficient scan. +- A **Thrust-based scan** (wrapping `thrust::exclusive_scan`). +- **Extra Credit (Part 5)**: analysis of inefficiency in work-efficient scan and optimizations. + +## Implementation Breakdown + +### CPU Scan & Compaction +- **Scan**: A straightforward exclusive prefix sum using a sequential loop. +- **Compaction (without scan)**: Linearly traverses the array, writing only nonzero elements into the output. +- **Compaction (with scan)**: Converts the array to a boolean mask, performs a scan to compute write indices, and scatters nonzero values. Mirrors the GPU design but slower due to sequential execution. + +### Naive GPU Scan +- Performs `ilog2ceil(n)` iterations. +- In each iteration, every thread reads from the input array and writes to a separate output array with an increasing offset. +- Arrays are swapped between passes. +- **Downside**: Requires multiple full passes over global memory. Performance is sensitive to block configuration and can hit *performance cliffs* when register or memory usage reduces parallel occupancy. + +### Work-Efficient GPU Scan +- Uses the Blelloch scan: + 1. **Up-sweep (reduce)**: Build a binary tree of partial sums. + 2. **Down-sweep**: Propagate prefix sums back down the tree to form the exclusive scan. +- Can be performed in-place, avoiding extra arrays. +- Padding is used for non-power-of-two sizes. +- Much more efficient because each element is touched `O(log n)` times, rather than `O(n log n)` total work. + +### GPU Stream Compaction +- Built on the work-efficient scan: + 1. **Map** input values to booleans. + 2. **Scan** to compute indices. + 3. **Scatter** to compact values into a dense output. +- Fully parallel and scalable. + +### Thrust Scan +- A wrapper around `thrust::exclusive_scan`. +- Provides correctness but introduces extra overhead (allocation, dispatch, abstraction layers). +- Useful as a reference implementation, but slower for most array sizes. + +--- + +## Performance Results + +1. Elapsed Time to Block Size (N = 262,144) + +For a fixed array size of 262,144, I measured how block size affects scan performance. + +|Block Size|CPU (ms)|Naive (ms)|Work-Efficient (ms)|Thrust (ms)| +|---|---|---|---|---| +|8|0.318709|1.22598|0.2816|2.09114| +|16|0.339354|0.790464|0.1536|2.05978| +|32|0.329103|0.580896|0.139808|1.20006| +|128|0.342237|2.14592|0.26032|2.06634| +|256|0.331082|0.4792|0.246752|2.13478| +|512|0.321641|2.85651|0.239296|2.04694| +|1024|0.335945|0.504672|0.26576|2.13779| + +![scan blocksize](img/scan_blocksize.png) + +- Efficient scan was fastest at 32–512 threads, with an especially optimal spot near 32–256. +- **Note:** Naive scan was unstable due to *performance cliffs* (detailed below). +- Thrust performance stayed relatively flat and consistently higher overhead than custom efficient scan. +- CPU time was insensitive to “block size” since it’s single-threaded, included as a baseline. +--- + +2. Elapsed Time for Scan (Power-of-Two Sizes, Block Size of 16) + +|N|CPU (ms)|Naive (ms)|Work-Efficient (ms)|Thrust (ms)| +|---|---|---|---|---| +|64|0.000146|0.134752|0.102656|0.048384| +|256|0.000081|0.020032|0.035904|0.015776| +|1,024|0.000215|0.023872|0.044480|0.017056| +|4,096|0.000862|0.029248|0.052896|0.013088| +|16,384|0.003271|0.033248|0.063680|0.017504| +|65,536|0.013142|0.216288|0.078528|0.022432| +|262,144|0.132031|0.481088|0.203040|0.080640| +|1,048,576|1.756160|1.441090|0.383552|0.160512| +|4,194,304|3.965870|7.939580|0.653216|0.138400| +|16,777,216|10.041000|21.344200|2.688540|0.321024| +|67,108,864|35.037100|99.206900|11.000500|1.465890| + +![scan performance](img/scan_performance.png) + +- CPU dominates at tiny N but scales linearly and becomes slow by ~16M+. +- Naive GPU scan is competitive at ~1M–4M but grows less efficient due to repeated full passes over global memory. +- Work-Efficient scan consistently outperforms Naive for large N and shows best scalability. +- Thrust scan suffers heavy overhead and only “catches up” at huge sizes, still slower than custom efficient implementation. +--- +3. Elapsed Time for Scan (Non-Power-of-Two Sizes, Block Size of 16) + +|N|CPU (ms)|Naive (ms)|Work-Efficient (ms)|Thrust (ms)| +|---|---|---|---|---| +|61|0.000044|0.017664|0.029760|0.016928| +|253|0.000063|0.019936|0.036512|0.012896| +|1,021|0.000215|0.024704|0.044544|0.016256| +|4,093|0.000812|0.027968|0.074656|0.012768| +|16,381|0.003308|0.032448|0.060960|0.016160| +|65,533|0.012721|0.079456|0.078720|0.017376| +|262,141|0.196031|0.265952|0.205312|0.067072| +|1,048,573|1.340560|1.336480|0.283712|0.122976| +|4,194,301|3.897540|5.601060|0.649696|0.132608| +|16,777,213|7.871300|22.084500|2.671620|0.321632| +|67,108,861|29.286100|96.962200|14.517200|1.468770| + +![scan non power of 2](img/scan_non_power2.png) + +- Padding logic works — work-efficient scan correctly handles arbitrary sizes. +- Performance trend mirrors power-of-two results. +--- +4. Elapsed Time for Compaction (Power-of-Two Sizes, Block Size of 16) + +|N|CPU (No Scan) (ms)|CPU (With Scan) (ms)|Work-Eff GPU (ms)| +|---|---|---|---| +|64|0.000205|0.000428|0.079904| +|256|0.000573|0.000715|0.073536| +|1,024|0.001745|0.003360|0.084352| +|4,096|0.006417|0.012418|0.097120| +|16,384|0.024499|0.052908|0.183584| +|65,536|0.103375|0.264867|0.616352| +|262,144|0.412454|0.881458|1.587170| +|1,048,576|2.178310|5.862320|8.874690| +|4,194,304|7.599850|22.914500|21.235400| +|16,777,216|28.617200|85.529100|65.237300| +|67,108,864|113.149000|427.137000|253.671000| + +![compaction performance](img/compaction_performance.png) + +- CPU methods are very fast until ~1M elements. +- Work-efficient GPU compaction scales much better and overtakes CPU beyond ~1M. +- CPU compact-with-scan is always slower than CPU without-scan because of extra overhead, but matches GPU structure more closely. +- The longer GPU elapsed times are most likely due to many CPU–GPU data transfers. +--- + +## Discussion + +- **CPU vs GPU tradeoff**: + - CPU wins at small N (sub-65k). + - GPU (work-efficient) dominates at large N. +- **Naive scan inefficiency**: + - Requires multiple kernel passes with global memory reads/writes. + - Very sensitive to block size: some configurations unexpectedly slow down. + - This is an example of a **performance cliff** — increasing resource usage (registers, memory, or block occupancy) can suddenly reduce parallelism. + - Unless the extra resource usage hides global memory latency, performance drops sharply. +- **Work-Efficient scan**: + - Best performer overall, both in correctness and scaling. + - Handles non-powers via padding without performance collapse. +- **Thrust scan**: + - Significant overhead from allocations and abstraction layers. + - Competitive only at very large N, but still slower than custom efficient implementation. + - When profiling thrust::exclusive_scan with Nsight, I believe the timeline shows significant overhead from extra kernel launches and temporary memory allocations inside Thrust. These allocations (device_vector construction, dispatch setup) are not present in the custom implementations and explain why Thrust lags behind for small and medium array sizes, despite its optimized kernels. For very large arrays, the cost of setup is amortized, and Thrust performs competitively. +- **Compaction**: + - Mirrors scan performance — GPU methods need large N to be worthwhile. + - For small arrays, CPU is much cheaper due to launch overhead. +- **Extra Credit Part 5**: + - Confirmed GPU underutilization at deeper tree levels. + - Optimizations (adjusting thread launches) improved scaling. + - Demonstrates importance of occupancy tuning for GPU algorithms. + +**Q: What is the performance bottleneck?** +- For the naive scan: memory I/O and repeated kernel launches. +- For the efficient scan: thread underutilization in deeper tree levels. +- For Thrust: library overhead (allocations, internal dispatch). +- For CPU: bottleneck is purely computation, but with fast cache locality. + +**Q: Is memory I/O or computation dominant?** +- Memory I/O dominates in GPU scans due to repeated global memory accesses. +- Computation is relatively cheap compared to memory latency. + +--- + +### Code Testing Output Preview + +A quick preview of test results, running at array size = `2^28` and block size = `32`. +While developing the program, I added extra tests that allowed me to debug scan and compact implementation, which are labelled with a keyword "consecutive" below. +- **Note:** At very large N, CPU prefix sums overflow 32-bit integers; this does not affect relative performance trends. + +``` +**************** +** SCAN TESTS ** +**************** +==== consecutive array (input) ==== +   [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 67108862 67108863 ] - count: 67108864 +==== cpu scan, power-of-two, consecutive-valued array ==== +  elapsed time: 38.698ms    (std::chrono Measured) +   [   0   0   1   3   6  10  15  21  28  36  45  55  66 ... -167772157 -100663295 ] - count: 67108864 +==== a array (input) ==== +   [  25  44  21  17  33   5  26  24   0  12   3  41  22 ...  18   0 ] - count: 67108864 +==== cpu scan, power-of-two ==== +  elapsed time: 37.6763ms    (std::chrono Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +==== cpu scan, non-power-of-two ==== +  elapsed time: 40.7901ms    (std::chrono Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   +==== naive scan, power-of-two ==== +  elapsed time: 99.19ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== 1s array for finding bugs ==== +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +==== naive scan, non-power-of-two ==== +  elapsed time: 96.7852ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ...   0   0 ] - count: 67108864 +   passed   +==== work-efficient scan, power-of-two, consecutive-valued array ==== +  elapsed time: 11.1004ms    (CUDA Measured) +   [   0   0   1   3   6  10  15  21  28  36  45  55  66 ... -167772157 -100663295 ] - count: 67108864 +   passed   +==== work-efficient scan, power-of-two ==== +  elapsed time: 10.9568ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== work-efficient scan, non-power-of-two ==== +  elapsed time: 16.7434ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   +==== thrust scan, power-of-two ==== +  elapsed time: 1.48512ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== thrust scan, non-power-of-two ==== +  elapsed time: 1.48096ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +   [   1   3   2   0   3   0   3   3   1   2   3   2   0 ...   0   0 ] - count: 67108864 +==== cpu compact without scan, power-of-two ==== +  elapsed time: 113.965ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== cpu compact without scan, non-power-of-two ==== +  elapsed time: 114.697ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed   +==== cpu compact with scan ==== +  elapsed time: 376.529ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== cpu compact with scan, non-power-of-two ==== +  elapsed time: 445.05ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed   +==== work-efficient compact, power-of-two ==== +  elapsed time: 264.04ms    (CUDA Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== work-efficient compact, non-power-of-two ==== +  elapsed time: 262.06ms    (CUDA Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed +``` + +Also, I added some colors to the console output for more straight-forward debugging: + +![test output](img/test_output.png) \ No newline at end of file diff --git a/img/compaction_performance.png b/img/compaction_performance.png new file mode 100644 index 0000000000000000000000000000000000000000..553272dd55a6944a1980bed15b8b6ad6a7179d12 GIT binary patch literal 323296 zcmeFZc{G-7`#*XUr6ME|GNciO5+XyHN}(jmn5k6eAu^XSV;P?!V^Rr4W|280BuVC( zlv!r>=c?y<-tX`It^MELd+oLMUaPm)gYNsfuj@RI<1-!Sy>rS6G@Dp9kw_$()2C!r zNu>4CBof6ds`dCzBAIt2{!h$S?vm{Vi>tQw`qsuIC4E~js>+jIC|1TbLg? zBq(r5n2*!M*4ENSjGy1^-`_Z7VSSB1<)v2{US*@@smnGb5_>7}KQi+MJ4q55iF8`_ zgqlP2kH@ziw#hA46-&(R4)tgAH4wr_`@%OhxDT zWt9nO9okDLPRM#%MoDwhmC}%rQ-s`ix*Bq9an5I?b8IB$h(zdV2dVn9va>I~|IF!h z9Zh|dmd4|4e0e*?3A+FKlS*~bhFAK(ze^D>Bt&-dzrQP+;;~nn>MrO1`p3ksfr9FP zf88yQNL>HxFLF{PG@oGL{I3h3IPpfzgxvGLUX;^UeS?zwe_sGaz|FM0 z^<_S^7Up&vckvB>O4_XaSZq&Jr2ut8Lc;yO{_-VTxBf~^*n!;HFIxISEHY`y#nGZx z{2EoY(HR@$qC?Sy`esgVe&NU)?qI zTo|ukzkcb}6*BX;v+4rWO?-|FC59z;DKF((a*?{dm~`V9caoK6@V!V$Ij^G=*jE+Y z*Je>%QFwd$mFy& zLn8lP)P#RivWns2%*fXqi`e1u$6|J+;TpR4%FAU%*|vm*g>gzraWEayCXv{VSySY6 zxr%>`KIT!p^3$U3iNpy{CR~C-eT>AyxN-rrTEN&1GSN5Betcc!Xh4nvZ5JRO77OWCcO{XaYVrB zmuYp^W95C4i^JTv-%-76ov}?CNKRYk9@OQ=X zq_m|H`}1`V0}TyL?#R=q`D$8^5_d*k^(zZg0~w(lCefTbjyniQF8w&tKQxr65-Z8| zfAWPbmrg0lTvpXSS@))rKL ze9Q7*TOCP`?ad!eo6d8Nm2`P!Go-9-*L?BBr~90m9CKZ6Qd_rfogZ%(zR=@ccePI{ zMQ!Ft=$~~=MQdWD_@^{FR2aPOPR;4;E$*@`qsd(uj9Q6c*ssRFANUX} z#Zl%>Bh!{`7T+||kZq<#%_>aIUpPUI2ii9)7A0&(UM6sbdA(s}K+oE0U$9iKH> z8lPX-Co3x}o!*j&4aqEG&Zg@)vL`G&d||iBDLcEPovuq}XJaLI>g3zrsW!TcHMI2W zr=aVy1D|#--G(h}bm2RWT%~6A?xB_5;O@1VY08di)u!Y7b?hGJNBmm)t){|!I{DjL zbnJJE*^f~tHu??^o6HP1uCwoQ5kVOkzaUXyJ5s8b^XwTF>72H9z)-9k`^w_1jHxN# zE^h9|%xfygmnX_Z?Y@TvZ&zavl=mp{*f8Ct5-r*zJdQ5~Ejo`=?>OvfA%5@PJ!Uc6 z{e$(f!8Su5Z@*=?m)hZ7g4^X@IW3{%Xw>6|{iTrhTFa8jBjo5lcOJm}%_)?yAvRRwG$;jB6Uc4&iO~dN_ zueOucEJnYip2+){ zAm@L{ql7a*VpPhbrYPhQOIUBUx5EF2FFkkr zpfR(z3vQsqcY&K<_$1dpC7o&g*_M-u|Jn0iz=yTHpBNv1=VwE_N0p=H8lG%xz0_L! zwKo3C8J>au{?n&_eapJJV14=WWyx@|cOM@eJ!4zPr0a0v%c}K{_Po%aUs9u5ZW8;` z^!oMZ3YW3u2d86?-wx5qv);qXz|g1uw9&-KZFy25)@+2PXBPYC`_Z+ZIb9!zRFss$ zpJ=73|5{xdzt(&sOQ8)LBHfj@FqT!s+-Pm7YprHc?(ErsXO9=0#xF-02eTb3y=r9S za8TT`Tl&}hWS`$iUC%dp`|-}YTO*$;QrWGntW0%v+4WYFV_gjl4GlX6#~b3;IpN+0 zJ4!9|hw_yPNVZOV(Xuce{r>%UjZJFPopl?&>b`Zn9VYI!>KuId7`A?yEpXITQhAB$M9kJ^g4dzM?o8BJ6t7gejeUL|+H#TY z&cu+^+K%y}r9*2SMu+u(e0_aNSy}${>5anX?Y`f>X}<3sz0=&7SIL!k z_!dzi)YR1tP{p1-dq%3waOi)aAb@*A%^}%7I;(=UUA#QNkDs|$8fM(HGuVSD!F9KSa7R(OtrP*vPkE43|B9gZG%QMP~p?={r;5b%EDviBj;^X--~7 zW*zaa=%(vP1SUb*Hoz`8b@uEG&r*w?Tg5us<0PN5JJ`Vb^AkO|*2?P8eXp-K?fv@m z)+N5}?atG6`}gmE$EcxAe{ScayOw)SU?uHOO-tJnx>srI+v+Y96KXb5I;@#o+mWr= zW^LZs?TfoPex09G_{B zA-T)=Y|+QZA-Utb2Fvr4yDq2At#+4qysxP_dHwoRWztE+GS_U5|Bg=A&@zRJK?t)_U8 z@1C3vwVW9~|D&UTiKyJB!mMJqe)!_<>=F}WQ3zryi8!d$Z7XK=>&NAG^gW`h3$ZN3 zc3%4!O;($M_m^;-kq%~;z^9?b=AW9f{?VH0GxrmflLZiPezJ!~MW%WOwD9?55*bYF#c|7oQ7cWwh=CMD6@SL3j7|?CrUDVL=Lg$)xJEN#riq8c= zNR`s;hX-f~IP~fgUjj2%d5>jxgqC>;dU*X>DsI~HrbpNh%h>;-uFIl7Ziy4!AS@!H z*MnAaLR0S#7P|b%X}$N~{oT#eqO;4kNy)Evt=?^w+i}_+&DHec#qj+q7cVl38&*=E zFB&gi(^Zd|sIz^mTTgyPA%L0dwtu^+aHdFyZRqgNMIR2=zbM}Unaut`SCn1g*U2vr zVwZSSFLmq3r5wAhMfXyhMlO+l?6hy^dft+BCh{w*dG4M16xHWn&Dyf7YhI?OUmN=H zu*@8l*UZG^ew|^KW~;9Y9hhJ>6~f!5g%rQ9U7^z`_7GTqi(g^wM3SijU# zyc(`_`m}zQ=D`A^>aAaTEPbc0UAva~;K2juRkYX7sRvgV>fJKVL>`*Wy>Wedu+DUD ztmF3W81-A#%_Gl~V?tiCp)Fsl3S_Ms0$xgK3*Ub6;zd*3j93vat~)DJwfx4@EnmK5 z)JF2#x2eatt=`sEwzfXJ1TOx{q8P1Kw}e-Yil z?5*+b2Q|xwq7EDTY3LMMP4%C4t$oHN<2|pPd@bg7rCDc4$ks;Lx{r^=ji)7+eq=hw z{Ph>}FO)g5mOK8B&Az?S58f$0D7Z#6JG~r@blo?M+kfABvJ*aW41Yii`}4Fk?+){! z;?1=5>UZ^A7by3s#88j`#|ps<&2#Manb#Ao@r~v|Qa5_IE`fg|qe)Ac4VTL6NhY?YGqFRQhx%Oage7cMeyNk^)Z2{{N$uBsaifLslf|q8^OsjjxRQ5-5@c02=x^LaM>+_b{-f0Ut-MwZC<;pBZhbH(Zr;S-b*0nUq zORvlEYZeF^pLkiOD3Dde(YXOD;I-47InlW)Dk`2~mDF7gtwR^oZ7p5{##$Z%#%6MS z+j&#xWHtE1Z@L?E#(L!_fn; zT#ir~GN7r=cwb*%1tL)RrTU71!-voZ5AIkLEvz@0G1DMWPfd+toK=RymzS5>2W}gH zV=fPY;ddk-Mp-J$w;LsqP`xVL$<~n5;+>A8 z2bV|n)=pYky;iZVt*J3uLys<=j?K%dE!!ARlUo-5mU!J&8=zX03!h&<+Y0K&YmK}o z@{8In+`hijU+3nKEB-Cp%s*Ia^^(wo9E&bV;72kN_C3LsV$-U2ss&fB%asrWV2w3e zECWQEzew5}x=};-c2djXm8CfaBct8lzklzx6uloCJ1(eBkzfywRya5*qFi24Q9=Ml z*FJM{BdlwO#gXLK#Fs}cIx52Us3Nnd}6Kz+lEY|VL5#TKkxKJ{D1bWw6Yn#VAl*IHYPYeI0 zsIHC|o4KG5O+BhQ8ryJ)ZFY7xckt28iCX^R`Q5=~c8s(v9Je(2wy!!t@>mJkv59;# z5ID2+iom-YS>%DMJ=e+0BbN?FOHU8Dicos%(=jr7K@|Xv{4Jw_VD6@kLE)%!JH*e= zWOkvSmywZ?y@`Q=VI=}oN&@ZIs-WXBUAUqb>KdDvE!9MP-G&5vn~HlXk_!}b?KZ*n z3>=`w-x&8eFX2-!4?QjSB|us%D+|lr-t<2e+qz-L-^#P6h9_;TmqkC%9ZR`;$!2Dr zRMWSe=!R?8eRN(Gb00=*6JEU7gojlI>h6!-b>>&<(u)@__UdUlWCyd| zDf&5n?_7J6^o)6sD7KX^7F?`FES9Ombe+h9&`=MknG;%9tkyi+RE|CVcz<~R8@mez zviq0A|E6P_|LF1#dyX7os0!xTpq;IxO5P)L)cg#G1wKogh7?E6#baUL{jHb(-s^NVQ8{74Aaeh-r)7oo?NrTn+ zRbcQ}H=21GPmHF`XMC5r?wM^~^ULR?TVPO-y3xr$dVMLA3FSX^HQcJNax#{nV<6qU zd$n?P>HzRuc({+h1kyPnsB_mVeRk`Z)wpIls>Ldqk)vaN%eoT$g8@MaF)_`Vr-l4J zc|MP%o%_(xmVarOaIMl)q3{n#*=%@|L*Y61CO7wMa%ya}a&lF6qbhyON`^_}uHaCP ziFl*8aN?vbY&P}9&@I)32nZ*{Cu?LSbd213#y>lo#ONLX|r2n7P@PDDx*X!em%N2+z`J!)aANVw{K?$eL(6H&mybg z1}^P<+v=MSacz^ap`oD?F>8$>BVEN(Gma7sq9Zn-!h`D7`_c-h6rVNUh_jL_L$`Yy zQC(j@q~VV5pqcOwJ9?A5eX|9eM^RxSPc4Tm7qC`nShb}w@5E|DnvVFBCr>82!lPyP z{ocG26#u#*x`Hf$1P7iU?{Zs{C|X-}Djq=5InD08Q6uu@<;?XUZZrG)+VgB6b*Gvx zSQ@L+YbXN>N*5&3AHg`Zi+wn}@t;2|I9MzMjK%o(p*SZG@Fi z$(3#?Ub}M+>AG+4M@LhWEJfMo(R>Oc7Uuhc7wQeGf-cGiTb7#$`8lt>o$I`*?&7=$ z(8gmuO>Z%=-fr&bk=O|itar`F^T_I(kzmQiQ?G9g(4tz&Xlt`Ii0aR_UW=vVy9xfc zJQVTwMt$S_pNDpj5S$Y(ZiJ{*j_&MoNblS^cTmtt9|s#kB+$^dqHJ*ZSYa3}F{r*C z%D{Y&&%%Z~9r<=;o*Nm+pDCH>U*3+leb)BVm2HcU$=w6kF%nL{whfGY%i>)-CNAFA z$G*5W=e9P~^pK9gD{gzs0DO;s>tvmroLnBtyjH)SPl2jTAQxb6e%uXejYN%Z)(uA2 z#gTohOJL`5zU)0U7gQuF+*~0)O#&<}YG)tIsmn(zS*XXqN~;Q|AxJRxdA*y3(wy4Y zry8l61HxUvFRv|oO6jiekxy^w1DfkEmYV&1LDYHPQaU5Xp?9O}Y<*2Qugjz5UX#;p zh4e%jqu2e{YU^QuLWhNUFeD^|MB468Nf70+!f9o;;|}MtsnLAv>MF1r8q)Qowr$&X zYU-I9Pa$yA@*JVn;j^aqzw3DPh~>k>BO7?tUv{tk0@b`Qv)wK?p=som>-YKNaSqGh&kQ-Gjs;eq0dIcP(ci#*7^<8r5PC;>jNv0NCh21P&rxB3#a_r%3gpDtb#lAe!H1oakNF7 z!N6<{;I+NP!mzoa!5u)5kQG9li&qzUpPqeK%CSxI$~#h{g!u?m<;QEwk;TiDP+UU$ zThYub19o^kdi3b3*1{B_P+%RcV=+{@dUY>xL9H8g;NuWC&%VPhayKZXIe}^C$MK&` zG0sz`AzD}KwwUMiMe4bh;I_)O_56NlC+0o=hXu6-AgTi7JqRByF)oe-Sv}Tm^_b+^ zvW@M?CrZGAJrMqZ2F|If-i1cgi$=6-$NBT;x8{j-IUgZ(BuLjxChcPba1v+2dXjroo;lezIu`FsjFA7 zDjE@M1ABYfqXP7p``|$uZutOdkdpJdxK5D~E+#>uBos)XDk<*aLd^ErqY=bk;3P-aS?vJiSYv2!9K*f@?cJTZ{$y4wTXd9Jp&kP0#S9?0|M)*ncd@Md=j90nwXfp1J04VcIC;_r+(Q= zH|h$pyF>|NX<>j#NJ!`kjG{|#Oz6G0?DT_Fb^>MsWOB4dg%c;pitAe6-B}lT$jA-;6}`WrBOo)A$;rvh&orr&WbhGYQsK(4HfA9cI)Gqr z;1Tq+MA#EBKKcw3ExmU9(E+6XQwo^ij+CH#O{BZ6Y$ix04=*nn39q@l&idHm?5Gdr z=Iz~ZnnIP_z%x+H1LXX-#}^f`A3b_Bqpg&gdOcwiVX*|<&DwN`crk2cPgaXU`l0&R zDq5*!&gw+oJYjPFYO|QTPEHZHG&u*C z1u#qjDkB7DApQeR4&r43zP`TqmvxdyCt)gII3~d;Gez5i$zPiu)yFpua}(kLA&ljX&{ip1o{CJg#8MvllbNv53mkl zHW7vXFHtLl(Ddd{@Wy@%u23(TV_kmjAdKVHRd{&dD~TW|gxy3`JoISq3X1b^>~H=2 z=7lc1g@8iV*4CDjy>f9@)>i=AawjWUCzGXC?&-;W{>CinEV_kFlvtsLv)`LVt$Ha) zafIP0YRP%^!(UwAcv6mywC5ek$jp4LU$&m`;=X1WZ~ho76$~K)=5Yevo%rRnqQx8O zS}!Hw3H~MFButd;mwLtQ3kwS1Em|Mz?N!ZY^YE&o4$mim8F(B=vx zyeO@Wf7I#!`S+f9#{YL_c$5%i(q@P6^rNeogG5O4f!^AGT&#buKOxH?rwt2d6Ny+$ z)azy5+r=d}$e#LI>?Q@j`cwOj|IQq7?cKL;13J}~1-t8;`Tl*Of4ApCs7Y8Mm~REkIr`yg#Ruk|r@xoD8aTS;qckLKFB?@2RRnyOJ# zIn-IaT~t+zIsL*(VQ0UmXdWVdI%1cXbHJ_UT+~#cWvbhmv6M%dN|dS*i#!7x|M^vx zwJhz@tW}}a&Bc?k>9=J@E}&cw*;8(nVDAQ&QT zYHAufaA41ob30ZeruP-6-}caNijd9yA{FL%z$G=}^V8*nGCylXN?)%Y%u_nBCv@>e z#2Nfl%~5B}tS0vG=cmrx+M`_|OD_gZFWd=BDpI`t@2^(69gMV0l?q%dwOAgy**Q4% zd{B(xQTDrCe-bGF`TPGg3bvVV*M4|&{zYx3PzFJe&hBMc(@ANDcDi+o&pNcMt(V(X zUjn*9_gfu@g$TvD!9(yR6H#si(m%XG$Xs~CA-khdbwqr~*Vpo<48j5M)IYbhoU!%u^>tTH_J9jf?b+Rl z$OC%#nf}W1azvL$zc{ z7*iABsK$l{PQ||EHuIds?CkAyZw~lZl;#+_nVI}{`5(<{>~fF(WMKuBy|Cw*0IKrhqbxM=n)lqg@sSz+?QXM8lI8f&R1e&2 z9Q^1{N~!V4TS`{w?7J^{E=e`-XG&NA{mHdRfBKWG)-K`AH^PT^2hfi#zIZyHquNKM z!75x)+=172xoA^GtfE$sFMtp^hH4x&N~(64P@p;x8JN0%{yk`Sy05o+(!kdjyCG> zbDc{d{mIx(wrpIVtGl{bAYM7^4V(#}`#xG%e9X;ic0+DCRm&~LVNr6f`BZx9wOb;} z<{z1Rl#{OnaC^Fhs|wzoKA^ueW6xGHd(qlBJb*rJb-db}uXdNDB>NU0nS{`W;(^5% zUw$1gy$~>A7tQvaIbJ=*c!?*eioR5kcJA$S!IQ%m?gVi|Gwtf45SM>HdvR>J?Sfv# zLJoxC_JS)C!S|cEKcleITN9!vl=f98>{miBgu!cc?b?|sJ*S_Lguz?m5)w{^34^~N z972T7mgdHplJ@M~OK8!r+J0tO!-SxIs~Ze#4v6pjiHC=W@N)?~{w?cF*med654brb zQnqR9uIn=(@N(cbuogX#1@r%eWL6`Pt=h$l`Vb5fgaGIW9i5Q-_ifDQRtS}^)xz$4}PF8+=AEGYy$!VR7Jd96OVuR4oO~1W4z4g)t8WztjJH_I1bD3eBb{msR z!_@*n-xO81_a;HIPo^{*bQQUvZ89biDmuZ7U{&6ajNGT_@9$p*fuBrTeGKM_{^5s~ z5M96`{XaR*4Br8sLl%!*{MHY;BsqC`gHq3pBzK5TXT-|k;KqZn5I$=Vhtwg(QRtZj z0EXIPEru40!A-0Y(-qa`XmF42hb%Ha8eL-G@@b|GsGVC71E`hoDYpjEC zkI!qat*$iY*pjn$FN=e4w0&jp}}idy&kCI}|LWr8bq0*nc_gI95SkG#>;-wP+hyVV}#S%6J zOa7+)JGeJ(+C=C}6rw@zKYWk{VR~;MYi^#_VkL{!2|70*C~rA;)?qx1S;4d;|Ke*a z3MOv4ZqZV$0Q^1y%gcD=u_5qiPA$H;62y&uv1#*WMt}OeygVg#1(PgI)2e`x+D^c6aLJZO>$8;t>(z%u54=+dPeyeK>=- zb2rlOF~lb`53Qmnk+UdJY~{ALv2kG`Ud{lZ)Z9=OMnr!n<>+ls-So)M{GSD0j)v_% zL2;+I*8plDDNyQ=xb5(}WXi=#%G!*C%PM<1H6d{E)s~tF;m`V+Y$RZi=6j`?8uHLw zCG$CT**l}dCz4d-pFXV|HRVa>EfZ)Mon`N;>FwN>mXZ?sp0Oz0`1K`+hty@-w|J#J z=e(UVN%|A&+vh5e*z3`iGjbx~h<-6*8*S(Me=?=z|1S1E9vFrqwtV?_$UlWZ2=oX7hu1w6UtHZ&9DS6}O%{|s$f~7ioc6l#J^fio& z7U&rM735f{-&5`i26Zonp;PAYu3HJDkB+sr5ZvW%<*A;6=umf+*2%CS`qEpBtgNG7 zyb)JKj}?wm%C;C-jv7tzoEx*&Br6RJt-ES-se&rP7zztlK0no`O>}80J&Q|EE2-jx z<>lqq3k6-dto}J-WhIrKo}MgRaUh#^8-7+KpV*k_rXVL*GGbfv?F2LTOZAjzIy&aK zi`E^p7wcY=dxjDpxQ|L-ucEnGk2SxN>Z1y}D1S?rm8WuYpFK0x)d0(tmA-0k`9yk_ z!1c(UX1MSUbww9b85-PlV(-Di>@&){CRVqDJNM$#G?(yCeN-QlQTgHkE6|;#26Vqo zqcLUh>9u_O_ARcGDzB*Kv761D{Gq2CR|<-XbZy7)&K^%d$Qkisk~>t?IMn>DZ|`n7 zL&wz(63<84PI4OjutUyBT}|!P+RC_FtDoN{x&Hr0py9biHG!CoOPhDzRsL_t{eQun zK?zJ6;he3qqude_Ix`(tGkZ{PN$%*3uzT;QG~LMZ4G8co>ybu^MOGbQPf|Dh8)#41 zJZPnsVvms`!c&1fr<{C2El8gk!ad2o!#whRvxyK1p)9hr9k2K{M;HSR#i>3w8rW1@ zQ++X|xsa`8FiG_}zV$AH52EuEzQ{40@h`BORTg7g6Z%b$~plxSPSlDRV%XHy4uyiHPE1F+eZqUg4y#e>O-d`H3 zml$$xjRk(w&`kZ3LIXcD7V+nk58IDM)fTUNF#7!9(sv&$s$FY%#g=^G3U6yM$};ae z^YPKzZc98C)_M63-8Z=4lbg79?IH&N2C~IHfZ*odG~!uz)dO3p(FDECv%_2g zcB^a`9A5xjWGPQr>v3~)gT^Bpqc0qF`yo$w&5)HWZs!w8>=d+wKY;wf*G{9u@Sg!I zPWQ{hc0@TsJPkWB?5Y4WLdFe$;P`77v`JE2MuzGx5eN}CvL5Bc(*cSj-lTUSX|b!| zENT(ZJjoq|y!!2csmbe0=MdIW>>Ja&@cTb>bn3d8s45-s$dK{cng`~EX9yi)s}$l! zD12L4tyeo?1w-u&7M`)SK&%YLtHVR#C>oRp{5r|~#S5N$9FKk@?#rw2A)kL{XdO_qg;aMwRMn0nsIz<`ojvi)`ff5uH%b2)kYy% z^x6iVU_V*BAH7Sa#-EnAUYXB9l}R5-OW$M{0A0oT(YZgEma$C0ube0qT~-uB!GPx> zf+xQ|RpSX)U}d3R-$|=IuxX_LkKvOB<>4TA4#U@1X79vQ8Yu1D=OZRh3<4E+nsZN%f(xB2?& z`D!4^kRsVWw}L8na@;bAUO4LD4+B{j7nf5jtbzRN-c&M|D1}){z09{C4+6AKc=qg+ zS~lE;;ihC-+)KjnKdl4rf*j#>0Fcw`+y4s_KFYBw$85=F5|M)h=9j&`Zaua1AN1vq zgn|DjB1eOK5kjMv^KkzVxFeKtMMXu~>u{cjJKu^co;mXx$)od{XU|eYy;?_t{N>+Y zgl9m+ya_=bF^_|aXOxuW#yj6)6$j%|?tP+OzizM6BMQ>j*M>gjDoAD$1ST4}Xtx7i ziO6)mfB#+@iIOL2*t{Sa8%RGo-)c>*Gi!)D0f}xM;GzGz4O@Iag@lF*B8vRgRQNJ0 zARi-oHRNs?WUcq7T67f+!>&U}*d3Dlc};kS^1lBhsC z^P`mkOOd#q{Rg2KAl*qK0srZIN|O&{@d76#;#lAZx%*QQ+8b_3=YUwgN9?)~i?+*b zGszt^7>0I; z>-HjCgipPFeZYSku`{j*lLV}Ki3x~p+qd_^kxh652+Re8g@C!$)$~aa>Gj)ltw`<= zs}&8dBMl3h4Y*Fa9~zqIK-2~Jcr+=@@v6`99forO&yX(IFE0zR`Jg-)(S~i@YCIRe zR#0JafcQ{u;|6A0=SL6OXR~Gi!q=0`-2MHFTe(r`N)O3{7J53f^?WRjvSB-qH@hX+ zi(AER=&7yO%u+=kV}M#u_~4~H@^OTr=;--g$xI(B#RU1OG?;`Fcesmw8UIn7X%I%aX1 zE6pN2H^N5ai_WpU41CQS&ybT*ZkKdNJg?}<(InGX{rP7U5^1!%Hqm-1cYf-h*%U$J zGAW}su-NuA>~r<(x1z7rcE;iRTsk_KH1by8G+$Y?x5+wYvD(JA=&hP8z1*OsDLQ*G z`k&lpo@eE|4%PJy=Nt ze&pIqqyLJ$AJ{|5kM0R0C(KqnHX=1azhlSq_fb*Nv9}Or|4j}YY?=`N#b68I zhR+W_K6{J3T6kj!;^269G*f?#)G6r~C@L%8x+xQ3Z1v--Aad|Jw~}@ZbwMmn5caAy zrSw%#kpq{+yvm1S(UQh!!2~o+D9pLNe!Q|~5&+SjuFq7TKVswdlw{%H;Aq%H=U;Ib z@QxE4kgs-e_Le<@<-~G?n|j8S_8~LZXOTmPJ|?;$!H$IpS=!6(`CS=iZY+JdpWm&u z$a*bB3x8dV528nVBcxEgH{Yi}=#`P4KK@id_p;E@qm0P$Nt61h($Y~bLUrU))dJ|U z4kju@8>bKe}M^kBy`YPZ^j z(5{w3#m~ii7}92GlXMmJQubsWaH~C60FCvf+Zpo#O>6L!gyC!TH-6&ZY8 z$&umVToMwGIWFB4VrjLKHa6Y^n(y=Zvs$aI(CU@LEP;h*^(FJHJn6=K(%9O+s9Fn9 z_nkL4GJ4V`1dMk(j;B1d9a@>IUBrEOHrRr3h~k>p+DIw)cW3VkO*|m1_FOGv&g;zd z?iQoNc}k&~J@6?OIGXNsvf`8R)Q)^CSveIU^U=F#p1-}Fb<7pSM3McOb?&$OO_5Pw z?`OvIi&^o0ZSN3wGadHZ$C^KF;owWp7PB1X^lX=p_}Z@u_^pgVHh9#c%9hbKXsnO^uIfTppMSv7qYn zx24@3=u*0uSu$avxA@{*l`q5}?OaPTs8cfww>|0Hs_3=#l8gc@f2J(&RM_9BvB1LJ zzpRH#AAX|x{P?ent3lkJ`;P`$Rsen|T&SwXn!8od@zdo_sW2ZeZy!fKFC({TN8Zo$ zbvqR1KRpfeqX+XLK7yVQOJJK5(|ggoG9Qa|_C0yD4udIFH$5{au`0pV@0MJ-Usxee za0%JzcW-IJqN6{oDUH!6{-e}W-K9E8sNzRUleA;5a{P8SOmqo#`R?5U8@vBV)4Re5 zIqr+)(h$1Bv*z!E>OQ}-S0a{9i82jte@B3c_pv$2z$xo5m!w*9MIcv#IWNy>Jx^Je z^=kjzQ40anZy{7Ii$1FSN$JfY-@=NGpXn^fi+(BIlXD=+$oqxP4(jz7(yINbn}4T& zR^jT+-P)rZJa6w>?c9S`;l6LJm=kU}`}`dJKWp_^n;NI_^K{jif7UAXMLjt)Xz%~A z0Lq$^?^2d!@U^uo(xt}!U6M(}=q76VqS_dW637k2@>5DmG#L^22oxuTD8J|Y$K?r< zj@`(6dcH8|G@Wa&tr#?9QgYJ{^~3AJdn}H*O)WhVVHP$!?Rj%W=6U3&mK&$dYPt@8 zPIKmFgCEcqrg~VWISuQK*o;wiPcP}Z8D-1k`r<$FGPj&Rc{T1Z`GI6xt&JM7n;8GG zK>jyhfn1S%O_iOw6IhL#CL`xCnu2*&fp! z@86#SEQS2zThsdU)@o*1kGo|_2oPPX>7a3*WB{Vr=+=C-HDhvV>M5A}ICW^>5kLb# z66NHkRBeqFPs_vFwJ+3oa;&{(A*iJJXKL44F92M}si#PM32!>EYuCvPvwP>#I?R~{ zHkW%BeQu7T@i^F`gK@2nZ>JL%xV6{6xNyOA*iv-2Bjso()ylasvSfa1FR_g)YROHM zMlu#+c?BYO%Ap&8tQqQf-S%4T{qB>t5H?|vv?B1=>d%kLp!w}1{RibK49W)K_I%@? zZ(~0O;~X`bh}r`BP%{gVfQ*3&@DQJ+riKqAyARV-z@&i_7Ny^(e+d~3u9{l-3R9v% zB=@ghFOdkETfSY}6zn3Btv#*LWL`4w#S6tL&j$v< z-qu&ptP_~*3GxImyT{tlz~G$#H6iSnb(jd9Om8`3`hEfPT?Fot_}0+$_BBR;KoqLk z5P-mV8c8TfL#NLlY)C%wK^FL?oU-y3D2;oDz6er#Af_rk*hw{N2r8MSDQ7%jy5Cd- zCbsQO85lTFUjvv0$u(2e!}iXMBldqj>TdAr!8cq!4YU%i4UgTN{6Ng8A_!B#%rq2(xkY1Zpr<-J7w&=wk&MYxvOsQ6GnrKN4M+dB zA$ly|cQR2eBYUMCvai94qHy_(wS+=zK+)Wqn-oHS|&0YZwp)R z7l>F3_pfKOo|B*F{QQ{TO4wEeu`0HZjP3qjIqA`$XCf4|5qTTZ(=6xYtC=BUNG#?+ zIGmkoF3xj~c5}*%*9PvXh-D@#asQ?hFkvL4vikE(0PIZ#sF24V!yX&+dAyIcWDAe*vDIx{D$#BNW*Bmj{PNm#T& z6zJ?Dp^cVO=M5G%^Lc$w% z2}dILLJR<+UCCajq@-MyNg|9LgBfC;7}5=!lq(xpA5)UmT!pFdnHgwt|5`fMOLVIU z76PZo$&Mj%Lco>hH!&&uEyscu)ePtoGZWfjF-WPA_Ve)PQ_(EBgES5d)^b2NS}}naq2?|6}6Z_NTw_^eh?6K3?s<0<4;e z?cBLO{XScEhO|cw2GQTYKH6y@OAu!y$bS=Oteq`;3_U0G%`kb2WC|zG=s|hQc~)*u zTBqfE^jI$JhRDe8UQfePKH4v~{((Pzv^kPlCyHHH&S`1!Hdbo|V8E0LPSKEk0LdK* zoV-AKVw`}8?fr=k`hD^gJgJ5*7u^{X4ng&Xvy*cr5pz z3FSAtc{FFmUubTM7LOX2jlx?ARKM-Q0Bd zwG6G-4!~snHvZtm1ASB~#YYKct3zhv@gM5$XnHl-cA8{U8|w0i??tJ*&%Yus*V1cd z*)h@k%@IAe1;YHMJ`kH+)0Bn_9#DeT6v6_>(6T z#YlsnQg_F8o8li7YT3a_gj0?|+H~HqUp7_!%&IPs!3O;nj?mKX4-zjQc<3{q9vXAh z(RkQi?zJzE?)XGVQ*D#oK>w!EK243h3~kr3J1>%NZkk?9V2Dh|xg}lB(==bwbo|j; z@g@oR`7C&GLj52PdyrZgQ9;Bk`sGKg)SheMUeM6W%gc#P{g;3~`B=fYTEWza;c?AnP_QoBp*OAyolAD7V8YU)mF===bUJ;=Y!4=0$ z8>L9QrN<8s*)49%=h9wA>neqh*3m{K@N1}>G@TpWb_(`>ZL7|H5d?QInzskRiPc{@ z#gn)5rf}RvDFly&Wg<%(!LN6ydS940R)RAoYuw*mdt1ToX0~5L!`>RP(?DXp1f5>y z{CRrp4}AeT)4%NQ|PE-9yJ@RE8qkT&8s z+O6lSn{K~!b!(Xb`#}CG72_^#o&7%h@*u2-_`Ei8b|8DWxS2l9*1Ez(XlkxB9IY}X zIa(JIe4@5J!wr)JLW=}-Jiz;*{tt7EUUSThPj)Jkpt}zq2`J&rc0ZFVIaFKc7_+xk z+fzDs8ZU>8c=0uFALbJJXEYrhtRI!}cIN4A+JfZl-}3jdIu?k;`_Z?1xKVU>ojl3N z4JXQBCwlHQQ)qR+dUm6U?H_XZS|7N^$$j{F`i&swp3}(@hlA(_=U`MVSO6lc1XyNe zMbjL6jIOWs7_xI!5s!8y{YkJzOCh`Hb~fVO!aoW3yD}U=C1H92tM}yos-23JaMA`o zBD68N&Ewb(>)T=M3h#fi(cKuPvrVWd6$lvSGTv|e<&v4GWBhujdbQqRFP6`j%v@`! zX1?74A4?bL@?+#}fJE4ir2afc@vPvd6RF-?1v_zs4a#0w@!E<7amK?i416Nt9}BDI zYql9H7As+0>krg~OGeUePrHSV!a>N177kkBS4m8Uf8zaXzstcr{gk^BQBUBsI1I=8Eb|Qg)Wp~y+ogyK9s3Sq#3|j8ILQKI z{Zy?hH#gzxh!Pcd!eat+w_J#Ig8D40tD*~5;Rp*N$x8@`^O(tS31L-Sn`N{RBl>ja z)jD!aM650hc{rvHhU|DzZP}d|6R|rUeoy$=Cg_x{yWOV z3>{kLUpY1~Q4z%eXQtG<6@Pz`Vp(^zB@SB;2c%SBCY?w;ywrWmGTM^v(=;NB*0+1< zq&2TJZhtUh6O(Q^t)|FOltP_*jfsUgt3gL}KWNU~(+4lTdiQe^o&9_Uc{vkNurR{) zF!s5!ib@p{=XWv8#VsgE59mnrAJ^4|YcDmk{OUE%pYO4H8cjtLJ&t+A%?Q0-ki81N zLF871DxV@Zab_wv4GCprj@dCaVacmuPI<~^;@vvhZVbR~AR+#@g*dxMK7g5!9ice_ z(CH(yLDVxMX+mV?iGk_N5mr-(SpmdAZQzpCW{o>^V;hrQqyC%#fWJ+2$zAro-2H}f z`-yq!l2)GOfvC5l)+bzBBg(1`>Q=YDY}LE_CVOw8m5tlXNVjXXp=c(~f^wbWMywe{ zI`GG89&xgP46TGGOwZ%1^L-&QRh|e#03s>Orz}h zho|O36#&0wU!VxiS;Cm$mC4?Uzr^j+etejRX$)@Z2Ys)%!@kRGyUV?X3CyUIlW>Q4 z#20!L5fK7fdSyMJh=8ktP#%*xhEKXNFhF9VEoei%H`-+Z(EB?Rk8HL+MqAAz*>biM zxgMOX#mUc4i@ED;1BUCr#fd-8uxC)M;p5GXJg-4}U%s?;qRUe~oWpjoqHcWL5^q}b zU~^XPNLSnIk#na*(^J)4^`>LQ>^2dn72K(GS)ZAi*}=~KzA;&adAzZj=37x<*wm>3GkjwTSkoVZS%}x3DSf z@9WD3g*a$n*<%g$ zZ+!U|e<$)De+e4OOScF;JWBw<1_uFbu}Jce#BG?BA!b-fB)BVan;a#j2{R3g`HK zNXT6rqSTGieRexZjDrvd6SP~`9wdU=1_lOkEu)P%yhI!Exp@pX^-UXLf79za^iUG# zul#YlaRkXbaEn(sL;>!=-Foq>ADA)Q0qcs4gfVkoY#0IRvbAYr-lH0>^F13@yL$+f zBN-0Su>=R&Y!F(!i0SpN7DiRPG{h7m^!y##&-Vv z#D*YFUm+r8n0SXdw0qw^LxkAoaiGpUOiY}qA;29*aq$q=JI;;KAesd^rN7^Z;ATkm z-%(h@Ap*VQM{*1dv4Q@QbP)qMifh2vrB1Eh+d&1|Z)3%kGLS%7rSfe!W3=H!!d|D=Hbw`lJXH5bOL0|Ogd z7Mtezmn+?nWQL5lohVSb@b`)O&zK1kz>xsUc-47K4AH^gsbY7cCQeIu_T%pZxlB6?cSxMt*m5y-{|M`_RiYJ1dOWmtKOIW=M|9id3#7#?IDs||G?)ga$@r}A_lR`rth7%sU4 zQ$vW;;Y^+F-53YkIJ{WAJhj<8e`GI$!2YYQ7}SVHiPCePq-1v-<{}P_Alqs1v}t6% zegiGLERL=r|HKeDw}ycp-tGLld^^tSKx8MTPjK)ThA{TGs`FroHMY?D;2hWs@g5}v zdLd?Fp!De4-5z{+Z33UuZxzuVPn-`%Lc4m0(&~-fn}4tiSw^h7L{J7~;Ak8YBL8hX zoW6s1a4KIZjkt2yQsK{V@hy7KNQVzH0dA%Cbh(O?YVXBHL`?kx_xE5)R$B`{MQVATFYhbno~3-97U zu3k*lRAO>uc#<@_yzK0ZBbi_elpd)+?tvjR45Jfg3|uO75X5{dall14&P#a*TqSFq z=hI1?Ie>|0;_x*@?cU*lBzSo96J@j@a$ATA4_b*CIyX`-FM(Iw*p*iS)rl~ z$qtC@fZK2l+i>m{ zLlWEZb`>TPg#Fmx-|yA`w~WfJ@8G`f(gMtp!<{2H=lGE$LN4uke|q{=Qc5$2-=BpM z+$BnP;#@}4pptX+#DELzQrZhHDk+Syp z_4eGym(R(yD;G=P#RcbuvT3*5_P0?*;LVDoLtErGo7u}t3 z@(WF;f|>y@5Xmae>)~VQKR3-MGsdrzpgnJ_65(1@#x9t>NrZz!UMpzUsVSZhfgdyC zRCNg9VWP(XAStP`=lU}V+vBIvENd7#nSlXSu1SqVSBN1SgO{h*U}B>ER>YSU4!4d{ zoDyX6q2LbtLwegqE(v%tZYz57q9j?H2wV&1tpOeO9bH|7!I)3ZjVr1}Em81n^NWfu zMQ^`=tU#AR(rM&QP8uT6PW<;)By(TGRGo~01^L6Q(_%@F0vR_5#)3=DFZs5dSwZf| zz~v%^Ctny3HfGE`V11ON7FUQ;uL6V<7+o-o#ps)u!SJUM;?hbnT`%+(B-vP6si}Ac z1G)gqblipn!%7Lgb?Z{NsE(SpP|)fvOAxbOmlu8wjyf1m_;ru0fY}_pl;IS=T!HQu zecn^1`fy{BJRzPL=e*XLP?g$5{f59M>(b?#+r7z6hH9MjD_o1Fvc%z{G->w*PC(o%&J2qy*`Q4uU!y%nYOy zsXC%V2z4Ya;iv@6-z#%wX zCUUnG#q}P*;XIXc^n=i(G`UWE?ow=no~ya%Bo~oRVKvI5hW`y|*LL zFf^>MKDeoT~#AIe3vmcV<0`sHn(yl*sp3ur8x_7cNHdFRn zvNBOues0k?iSyC024=f`o;FgEF+e9;q0pl89$E^q4g)2xcR8%prDtA^ z+HbGZs-@dr7aE_1iSoC#V>UH1i}+?S;%AEGb0PDJ@fB+t>sU5P7DkhSzQ2XuZqC;X zZzi(SIKlAAY39=Oc)Sae&5BbhhO>GaA`rXv2jj#gl^-X`&adq4xFn&Pp87Y_oUH$& z+znswyJ;Yj!qB;n{kImQ-_Z5JVL6-$wm&>IWjcDB;UW99?^}z#`w{+jh6%})OSSpF zm0g5OPIga_mwMJA7w&?x?wLJ-oC0G{FgPt7DhOtU*>2nXjVva5JPdc>J0`q`Dew))^2j4KY657Sz@sBdDRFtVRmq=b|u`t9r2$@3cuQ=yEz884v?P)3Z z$EDaL%Rm0;vI$wXGb%ulclg$yg35&vXk)nEc&PV5lvrEt9Y#w&;t9wCsb--dDonJ} z5I%12GCjI&myD;5{^f@i}-ZYU|3;ilp_p*88FaZqju3UFxVvj z^0?gIX{7#{!phcuuKPuwyt(-llLBg8L|vBR6}uCMgeOrQQ5o;<6kEA8@BJa%b-b?q zlB5U6#AgDqpOG(zHDQn+`PoH?1^C&XH)v!h1-ktkANWfk7_b*M!vt;ET?^V{-XBSX zqC6WFHKTO@BV;TPZh);VGLi_O<4&xB?B)ml7cg5~P6mxA)g+7yX-OcA>2y+3 z62~NRzFEX(MC(PL>afva2+pI!`Y57%?YY4Ki8-;$UZ-r)-d@J@z_{g z@QxN`cjUrVNu5dTJ%pHGf@4c?ex(87?ves!mz@vS%h?}Eo}D6RFd>sg41(I)gjV(q zjTiDQ{0a)q35;#Rsg+XZ+GRr8Kw|cFC`g~m{{G~P$C@9@%T*{a0#_zPf zZc2Y_qh#uNCp>%+s-{m+pj%Pv(@{vwYt&R4HZTyTL^N*aqaoUT8VVYrGY~!OYE(o7 z>Ai7NlR|TyxOhaZ1m5;&Rj*ArA?OHY9kr43^pb(?e`#T(rmKQ6{v3(dI{q;YL8QNF zU=jZ%%=B9aqjYq1>@D%Km-H#p3nBvDRYb*6lb2mc?h0bX*S4%^?@Wtc5E(9jjYJ5; z2XCcuegdn62!itVyPgxJsI)Wy9u`0rl>VrO+$C?n7H%Dg^fQ-O0iXnS)CW%-Pz7tR z_H7~>lXDO&hUqhx_mHp?;uD#UAZiepGpy?^k|u9L1ONOK`< z8~SpB5utb)dd{+gvtgj)6~w1_XK1^Ai}Y|R^$C22Gvm83^U5-}c&Mc9wH<>~(&!gR zEQ1%B=(|U`>Mdme(};!3Rp+=PI{*o5u9fGE*kLeCC?nlJ*qft>6$Z@oil{3^BRvU7%IZf`K=Z~IJSy$|BR491;(z|+|0QrvWE+o5|H+8dvSy8 z037ZZyu8Dpn$LagY7J-Gla1(TNI8kenQiv`c~D=TW2i>Tb8rn;l#!v4$mY43ZpC)P z`KgF!JQZ^3i3thCMa+sppaQBmeDvE&w1;%?N2nGC1{l?G*xo*JX3<``H-1s<{0NwK z=;&tLsr;;O>nyGF4rDMVVI2JGE+yWs^w3RY|TAgXIM>)mvD&Qm=E8Au}+3 zlzh+)E1sL~2{b6UsO`lfAOh1TGmA0?I~_v+2)nVI+mT$?W{c(?P9l1TZ#Mf?D<@i? z0&U`wb0>|lNa7lOwfRf-2%Swln{4Ex-k&_E;AAhK>>|7zT(SE?Fke$#O%9Ghf#2 z%N)*q3Md6rVuMOqo5m?8vy3#3iV5L7eP>Na$8bGeT?Ps<;D%nm-jNCf8R?rbkMPAf zB)w|emMurFl{9g+J$;IS{%DwR;?$Clx=&zMaZLdG>++4HEGbBr9#xm)KQz*OCgR}z z5Cvrn4oihXaIwA)JUsNo7D=q!`(4fNV4&um)k#b=A9J_9Eax;QIA&w4*JinnI|vQp zo375pfu|Q#?|Y-wLd(MI+`*a+ng`LTFqFa;8G$eR&FRC{il;+09ufUyE{(f3RK#q9Q@{ zKl>$xwpBk)Os}taHMd}-EstP-+tKQ=wp`$p3gY2-9ihZ=ft|b=H8Nhbqd)9welD?E z_a&k)zQgj{WsQYkRK}0>hE~vC|8A<~=ZC3wWqoCI2#S4t3)T%QvTG|Eaz?uR4*jeR8i(kQd*;)ht#r6%m}4(yI$no>}JpX#{KF*yguY^?c>N zGCSWx?%^a=PAp(*KaMn;_-@lHudA?>X~#>P*YVJkKX|7%c$)gM2$itfPizZrHSd*^ zN6TXJ{Ct1>^_1A;Ied^;IR&nl4Ft6f$sycxeqJGsLwAy*<)Wp{RJ1i_TPB@cCfx^L zUah8KDwBqv5$~6Z|2ElD(sP`MMfc5QT4OM6Y}csE8TFSNbZ_idQMqtJb;7>j?$q=( zoe85ESVJ!L1eTaZAcWpM&x3F*PInb}qd}ZmzipB#B4?9(&c_2I^leKRg)D7*J_}KX z;`T`UQu`V4=?~M~cRdOvAH!0yW9Rx^TI6P)SCV2K#+f#N^hf0;srbfOe|%v| z|9-qRjwjuq)UjO7kU`ZjjwqxPdWlfTP?TOvyL{`~Kc1g|++8x#;uUweNy8)k-iB>6 zd*<4kon9WyJnl!d8ssTnF8M0xP&?b`D_s6ub};MXnX#J_NeQ;=hV#`DOokhlAC>6Y zAuBsrk8-Et{LzD~9I+QFdnRL}^S~AinHSs6a(pq`3iz|yo#fk|Jx}r%wksdoo^!f2 z7_J}G^1MXxCOkqWY4)0;Iz8$GpC)k_eEk?q>^;EWUqBsxAG3JO8D`eLgUtysq#Mb1 zU#qLDcPP5viWyZ!W^!&cBF|`UW^w}B=LlAZm;g?p5Y9+XYF#o6m2ep|C)`6YDPQas$acwiLa=XKJ&%1^bwka z*^Gvqk|wm;m=f`|mhj}WGu&p%eI|VRX|r1Ufm2ycn~p{2BYjI2&cuv_reW+?WGVCA z)o}^1)clS10o&*w%d={DNjsVmLg7Wy7UE++-{!}Wt>I;xx=~!*zsdS1Czl+b2`3$# zRR=y-^1zHslrB&HdJLW?UOsG2Ba<=n?RjR%Ls>`o$(pfkVDbH#4S`A^N9L~)D{EJB zKVO5vb!V8u9O8mNX|JwI~!(Qk| zom}60q8+)t!&UoGHD7vBFcr~zsmfYgwR6Zs7_V9D@0ekFR2|?KHS^i&;gQ=-_!o&28UPb*wXV8CF;wQ4=a$1oiWn*oJVm z5t<4=DSxLcdyeV-q~AHI9FlsFv$`rxc4{^uL~+vn{WOZv@60ZkQih=GntXubknHqv zf7{0lS#O4G7+)~>Z9i@rJ}$ywZ#MT12B@G87TM6J!hqlgDN`ZB5cs#eMI03Mo5Q84 zmSW{5Q1YzHF%xu}+1a4KN?tjGk*KMJv6$9#*KKt1{L#lRY(GCpO#F~AwTLc; zbA&WSQ9><=(Z8^Ls6D;-&V8NPS$k1`5`xQ$qwEN2 zxLIp9?)doT4a2u(p+-NC#NL4OA0M-IQ)b2vImys}ZXfY{Be8Pb*mkj+*~@R#(&7U2_2yi_X#Q^3}LfV>8(OT}8&TuU!+;{=)|kTBDtssaEqT zpTy7r3{TTY)GxrruUa5^hBePYI3eOQ%@w^~VtlaX`!Wb36j5#{03sM858+{`8Fz?e zo`K!(7V#km9`s3O4QAJ1XgO_H_wW{6k!=P9sAPT?;Jo&h8=e@EBx5hYzBX`<6OkZ53HR z)}FM*eahkBRvmE^KvCA2i{T}*9_VmsK>_An61XSBDiPR}V1HhhH$$0l_tWlmci(|8 zfu3>`gtq*eB_|Ep&j1>=dG!^fQ$re#?D+p6y0ZD6=1&bRzN7I$hv;=^s?YqaIh6eC zMY!)!B^D$gXhAt%Orl-=Ur=DU;hDzl25AgpliQ>9~Ang zXkp2OfGn~Kba&j!Lv9hA!*GEIam%xjn-b>GVDRgB=BH}%NS0$R_Pa$=j_}q!J2mpO z$V>!h^Gfs?ud{}7F9Y@hLjgPV=pumnCcm#EQs6;(AfrgnYJOXcKv6gh-Mq<9tWBoU z+&)4};7VFf4IYXKm%oz!8b*9=%IX#>K{FL(; zECNd&@>`XkkvVOjZXrr`OH`z@S7Dg8z3a#BE}SEi`);MANQHC~w*g|)5K3U>U?;*B zLF!@T=>`WBC-94k8RM8z(!IQL4Bp+jq4_yxvU?6Akl_%|nF~LE;Duybe2JYX_9PMP z@#-_W(|4N3Y{?cdvb2k0gck!D9FZ+53g>39QRw`SDjifp-mqV2{4tIif|f#jqp0~x zMee%!z^h6e0ZUhkktfp*r6`g#`9o6~UoG_Nvz<39K5>RtOz)OJPP^+k?FO^?HT8NY zqyBlF7yNcJJymsV(4l8EkFU0jDD|9Uhs{n}*N1@$L{uzNq156ynb04^fWQ)2N3z9f zA7=Qbq8m2oB0+WY&ftkb@x&8JE~d*HCZF)-%w_dnInY0T;nYlZdCSxU2{abE5%^&P ztR!=}&QlS0?tDnpOVbcc!rViQl8V18RF(0dI5)+Tn6@L>Y?tqWmkgC*X_W>-rTxT_ zl(G|}ziIffo+GCr=md|%Gx$&S;Ie)po;Q=Cmy_;B4z7mnr2qfE@p$GJE3)rkbP~Is z*1WpE-7 z@XvF2BCXpmB@N`CeDNGR&Qe?$B7t(`Yo`{v?tWz5iSwe}^)@w<<=Sk7=&n%lNM%{|4w==4P1}|oo)%U~V6%z7cTh4t z(`K8S7jXe%5s2i0xj z((!;M17|?iR+%GFkJ&#GUS8XVvX_&EQLuJw(Ay*|Ob63-5q7x*KtImM8i5m1vBW2a zxfdDD0S-I;por|gM6Lv{Iu2qw!xAs}PEAY%F^EkD72XXx@-=(>?)i9oEf{f!nxhq$ zKUh!1ai#X7;*B!82Z(QBx0>1|O=sL60XX7_8HIeq_wL_^nTjOCiNZoapv}(R(Ozl0 zTws^9MI3?H#nYz;vm|j^gGmREBw1jHV8=rD^ghZT1xYseqt;1r;N}8}Yl>+pVTgx9 zXh2?(eDv^qeSIB>iJ|O7!b>@HYsB|Q1R6sA#R%9)di5vh@?=Izyd@B|S7^;r7r@4X zwHvBZg*Uz;6>cjp)#(~-)%TLLQ5X1W900tj(B;09n&oXs0~i?WLEmz*i`J0p6g zk4K`X%ACte>3Um8k)2-Us3Zhj2<|*6L|m0Rk#Jd2iIFZ2+vE0KO6v$P7OsVwwW|mf z81Ge@4Hhrz5I|WPR_4B^7H7`g)CbtW7-Xo|OkSPyScc^G;Uh$Jau_pVHZPr&rMhO& z9&eY9uxBvOXU~k5r$}ulyIROM>k)Cb6dMYH$;2nbi8RogH;mb9I5jy>JVPjkD7iix0d$$xxKFoV##oQdxp9LaguZKw#oHMu zGm%;Z%c=LvJwm|JD=+W9j*W;$Qjwq_d5&KR#k3WbDb^GOmdI1!-TW8$N9`~cVqUsL zO_}T8AOP75pCU$T@P4wy_55&|dtpJ)f_NgSD7bjnTubXJOrG!6e%|A-6@Q#`*~Qz5 zg`;D zCrwLHM{d5Wcm2ENz+f79Sfo#Bo>f9a_NFf5?4xfn-3a6T3&gXs^GZm_9+V3eoUM&J zUL&o`OOw(_*@7T3aC5jc)Hw(NXit~Y`aePh)EZu3*S;ACs z?Ss^>6gJ5V8=1&NO@8>?O!-xN6BfD-LkXpfk0Y`j6U<(nohdHnjg#vIoA zca_XpXr|I=n4p@jw*dKW`Ynk8-~Ys%5h$wpI|XVsJ5Ank2n+<`73Ao{;x`BeC{;^1KpQF6JMBGMr{mDV7HdP1-fU?>&c(k?d1nv@IAax2qGGuqTP!9 z@UbAf0wA5fp-^{AvnA)MRf{hg6B1Ei3#cn_QI5ViQC^ShCD?-Ss zMhXRbYiO-4!pg)>jY`4}Fpz`C{tF&7Z^|D2him?F*bt(OdV#RIm23?u{Qv97XsNor z{r`pelYo^9chI54kb;ebUC#Tr*!TDk@ml*Xg5w4{m;{|*A(&=9X9PVAES z$#7oYdx<=~M^!(nE1?k~i0;roAbvbP5tp3CDQWh01w{#h(AwN`L_=*PO%RC#L=7g0 zBiL^nT_9-sa1l6k>o>@`V2aq*B4R%B@9Y+Jn^KTG4pp$5)DizHc>31ro>qBn_h z=Q!42`^)x{0Ms7@bBg~TjZ|3B&JGjWM|u#UNdJlvo?373Mpfc+L+kr}8)%_e;M-Ud z02zxA*vUFqP-S+XF60ytsnH8U6h6Vyu)WNFvN$Z+mGaJLe$E5Xfx>WGgR zUoXsl*G8Q2Sup8v(OYmA%yu1=`kgH`u8{C55}tJB??~3Ump!t2(1Z~E!4QQ^hld8< zku}cN`NaX4n~sdi-I59(9@A5qVADBUkUZD#r_4fiNMHZmkfZpF-D2?y>a|X1{Fav`uxywh`KihnUoon<+oW1IT|!@U4^b$x zAAY^+%R)}f1Pxo!WO3p#LT0S$o)u@W>shjuaFH?Mqw5kfpqW~UD*23P-(bT(epDM< zIG=s-DyOOGmbRBx5KBX-32iY0ey~$g*;3^a>xwl>l3zXTEk3O2>Q$9Bt)XQ68eXLe=~6thD; zl$bc4p5}ZO0hNn+oik-a7R^SrFNMG@v0=)&ev8ZIZ;%i^wdhShbD!c=!$Ow$ey&sA zUH=e$645ykV&-jSLw(m!Bzi`xhw|!f!;aHyy!G;iw@$`7WGx{f1#JKOWMX{uhT>zn z+y4pCl7zDwKB<+$Vf$E}H$FKqsduqH+{V`wl#|NNW*rcLLb~tK{!`t@@Lap1#g^fs z3;6|ApI@eL%HL>6(V$NrrHUDgxlPL&Ji)6q-$Qd(+hbi*BC;4hi~d^q^jEd&%gw<| zH6KZJoiBWCT_Pv_!27$Dl61wn;Z~*?bJ5Ug0o&m{UhL2A=$x*ouwt{X75&uRJI%_) z!BMEDpZg!S2c#cLrk-f~=ykwNLT)LYfBo^0b2Zt)y7!F)FBp4?s?LslJjVFtHhr%J zRnFpoGxRU20!RBT=km`APW*@8%vP>HKS`Xjt@-N1RVD{zCl7GT7sv3lw*^?KjPcF; z>WYTfgbynw1O}!C%9zlc3#qFg3|}_G;Z~gjf94l8p5CEJoUj;lb_CRzg6_ z{6WHy3pkq(GVFVg_Qqcq`4`Sc=U^0wI|-igyz^BCw7JnVx9c|?*q_Um-g&lGou|;6 zQ0{>;jrr&s?n}5CEJ2G|7KlTIOaH~yG5>UXj%VUrHJ_eN#%E7D+aDHQ)wSHWUhOS3 zre3?g+?+DS<|GqHyboBh9_9UV>325+La+dEVIA76^q^IN`9 zi4M|K&#~Lq7f0UTMt#dhh+WF`1<_!8{f{k)gh)k zW5}rfh1#u$m#GsCsmE?LbKhd!Nk}Rymx8){=9Llc^pM|$|Lq!xcUrGRfi^$l{29N{wRB`!GN?cUDBp@z|{! z9yGS^8@!so;O;*5E5QnD^AgNx?hV2FJ&%CnqOcy%f?a=xKgdFdd-mtDqUZZ<<@jX{ zv6TUuDhxK?hL#l#pT;~gmAgoSEd$2^%+L7iH?v6&5}EkNBZgpmhN}sQMufeOn?6+C zq85NP^Zgi zky2-7@9W7w!1SCIf0QJW!N9L-JLM8D8XI z>?oQ^z=W731+};Y_qvCx-d{Y?4BA=7<%runAU456Nw~P^bEQ|qhYs^Dpz5MmPfJA_ zJ=Z-kjh;d4r2#^A*m>YXlak#%)nQHYO}BhqO$J?LGogBQOqvg zoQ8IsqU7lKs-xLX=Hh+gW+3aYCflkA1oHcz-qeR)rnl)pxYQH=3%yvZ{k#kUtT;JQR>&|~=VqK^9ur$CKP z#&?^5b#;Ap_Aad4UOJt;IR^m4nS3;gfa@7 z_wPanU%E5OVpzpg!kAMvbvf%fg*#jK%M&sMrZ)KeQWSSxQC;)9(WhBR{%IlI+as2- zQ;!(ONss@w_k>g=W;ErB74}s+=%#>^{(KaKYW-8I0vI#AEMFZHijEWET0)a@n=ymR ze;Ze?!`7qj{G1cjRaKK_Dv2)~PN`VOZ0c?)r1Fo?{jrWM7s{#ogPn0Uo zQBlxEn;!}axO|z0vVC@}@_g@;N4rkL--U7CAr9%YK^O(5@tCk4>dOpaSCw9WD+hEu#G@@YNRHeQ44DQiZsEEu^x=cRj}_@>-h02 z8}(8HKI6G3D5(B^OR$&%lNk9FxDgH_lmj?&cAviUKiT0;5GlU*64O#5@t?~&u005e z5R>eKSer@cXUDLbicp6C<3f;;2slni!$_osp31MXl__kW}OUi|k>AeTM^ zH{lDM09iqL0{&C=yK|*;rLYp-m{H>T#c=b&X2*8qDet{RhM8D*kr>eK{-1Y%tdD{v z8tA&C8TiH6*Ifupy_UYdm|EYb`}8j3{m3URL87ZDfe}lWHwAvWuX;rpp$prY2|GZd z^cS{y5D^e4GOHzs5R>&TC+y995-3UrATKu+o4z|Bm zee83k^&$y6B-fLZiQ#Q|92Pe$1zsOKDHU|@>&S9tINvRFTw@q7On2k-ewLc~Gb?zucKnp1zG7tx)!LYK4buG; z5Eii>=ljQ1ezuah99n{De?g?4H7PH)^4^UFSoV+pME-x5{u@c^$mn={o#jQ4)7_K# z-sru$t!~{V6)GF{X(D;Ol+@=W**ktet}Z=fFG`q)ctK3Ciw^d_?ODOfDiAsAc-$<% z3~v=|iy}K=FTb*=5d>-o@&#I9Rpd;_6M@KaE5I7WkZ%e9)4(N69`-qWGMB~GpVC@m=(!uerY0>5ldg|Wyorix**Q(`cf59d1>-{0FtOP| z{FieYeI62Ol%;EyeMRxT8zcJ5 z5OK0;rU!a=%D$VMp|h)_gCD~ch#Pq9iKh?WEJpTh%a;@ABpz#zP^+|t-BKAY!te{j zw?y$mrkHZJ>z$p2|3`)CWebvgE7|1;7XXgjWNlR zm`j$hXnIT_nf4VL&uc(TM(|JCxW5s(^1g~-l8DEM$YmrW6dKw(k{|}RSakH2ofqh! zOCGwu$?ZkL%a_+9R-bej*d^ON&{Q0H70nJ!}JKHR7xCDh#5al)SY z3~PQ=6iONrzePG`(oWF-k!%=lkEofj7qQ959s6rPB`~7hBsLCgUfetGv~tP`Zl1>n z+NAWTd`jqzFb!748klcG2zQ`1Giz{>(Dq`O4ZXZsc?N!AJ_v_aN}xJ#Ejv){>XL=U zXKP>;LV$wpxWIB;DSY%UV>Z^xM$pEu0=L=Z)v=s~;-j(1Stl9`QSkYy zs|SJjn3y!`#7#KuNXGw1jAY~5aF$rlyRiLz);SL9ncD*3|2J0b+PSzfIiK%Ou;B>$ z<}~}7o4qEx=A=1pEuCKbLp(&?@5e3X&@ZoSl(65Vq19u0Nnk*L63uGchoj3j^lmWv zgX`)Eb)U-Ku|F4Hfizk2FGN|^I?dz-NSPqcSb`8L4@6a%OS9shcD#_MIPzK-lKVpT z8QXU8R=>d@^TAoCoWywbLE~J#jOsHnQ{2*wj$WJC0<4)i?+HmY9@o)GOjbyA3>Pi$ z>`v_+aX4UXVCaAvOBb7($tLBpXIm-PBO>mD-9U6^ za!3S)qMK^DiUf=UHI$BwIIa#o2lvvoetA@A38P=i(^y7^S&lL*E1!lJx*tH99xc7^MbB;2(S$ zsghuq;6z-57Yp8m*S;KKGBhOuMOY3P28}L#3D;ylQwX~OhW*+cS6LEa#&@mOtj78^ zNj#&n>~6|js=TIq)r*e`mx+FgcN^Iv9V(#WcCGKMVVqGOzYd?xIhOGr=yY%fyUAzR zQGU`$URN^sZe_ z`ss0{MPr_hemXCZb<6#d$LH=>R8_OzulwYI%}W23d6wRAT8QwecTOxRr{1mSb(`5+ zMUl#fLZ`|&c8hLVF8w#TZ24Dz_(mAGEXJMHnr8^0Svq!S5Jx=OJBBlyBrD-+z5)ka zJKn}5C)A|jh1;7K-nI5ToIiwG1>34_j>{HMd+quX(oAqXBec%2k0*!ZP>F@ZE#`m3 zX)VDX24dPE%Vb^q*9F|z4|i{Mk|APbuu-h-L1j<^nXA=d>vP=5@NXW~W=i9ZaNVq8 zEuW@8hsF{mMb!K>AS$~CUJ5~8Vg>&u(DN+5nMCrc3&2AR`40pNf<*%)i~ts*^qOKS zK{uJ{(ex}N&n5w+RXsiOK7s8|xAd;^k{;UBTq;Yq#YV?dMXFUPHQ9{&-_j;qr@lP1 z#IMuu{JFjta@x-7GED_rlY1J4EenM$8`D{nPUc){9?3EL&h{35Rob?taHNa!QPh~< zK6=3vGfa=a9xr!QcCWJVdSa&G3%cZO&&Rj0als~cQuW;A1QLJ46)McJkf?}R>BPqa z0(kFYF97T8M!dlB#qJ=d6*-sC(dXaWeR-Hb#bUMRNa07qWiv-N5nc%1v}P^IcjV6s z!)T8s^2*upqP)K+&bpc``=;a*OFgV3Wo*XQj&u}~(ixy-NaI@oBOt-Hny}+gRPJiX zx5JL=t1W3eHg7(hQl-DYz3$ZAP7%f5D{1$S5*Qx2gd~oIZ{p#qm&-O1$12%}K7)18 zWRGqpYU;)fzc!u2sQ*&^*^&IW8ToB@ADp(zq)Kz6dO5)O#?rI;kJP&j!KE?^beEgb zRPr>dZ@=!pd&+dAwwUvihws-ZmiY@GC<4kp!^>Da$6fzz7JT7Z99Qr>o!btY!5 zHQp5Y#E6=a5O=UFnu>xzzv-qcTj<^UnRO#Z>Zx}hBp#o$>l@*Zd3`!!Ju#bN)na5* z*FIaOv0A-kBPzH@Kbt$_2{LcW+=7uf`Hr@Kf4BMec1(j3j=fTboCczc`d~kav_cam zFN5sydJ9mw3!pGc+B9$1XhtjFddGE;ib4!aBzW+wI&z3X>~Mv5IIOALEOi30q%!!QI&{>+rt(Kl%pW?`VE_EzF&Vb~aQZ)tw&Wd49? z<4fMd-beeWm@7-ewpx^P0@yfH_pW$h>I5JGb*uC!<*IEhHSS^XtDP%FY8n0BW=x@r3SnL5W zQq#k8?;|&to)~WLt%@h#2MLg$cXchQtXytvY^)nMOV&;M! zm|^i^Z~aYsu&bJYojwR1f#O*NdeBWbh>s<3QOia6J@5JQWce;Z*6$}huxF7FZpcB1 z{}G4U%8kc-Q3Ji$+LGK zpP!$fA{DvJ+uIwZZ3(h4ww-T1FGs<=rm~hz0MQ2u!mElBvwP-G9pHa|6GfU-_u$9K z&Hl0>Sy|i8`m5k;>HaD%t@V$7k z5Z{D>fdN@H1k6+r3e{0M<>hQ_g3GSv-++||a zxNUxL##U?i!o79b&Z7sieE29@_Y(@DL|R_<2^sh}hEf<}>+V(dfb_p8TI1Ai>uo0v zeaR~|s9QY~^F)WrQ*k7TTd!}WT;3Mv^_L3aJ*7zNr<`zG|6)z7_-*A2i3j|kEu%Rp zotZ|0mSfi-j@$saz`r8MjIkaaUTdVof&bqW259m;!WL?JFxVie6ij8(osmUUvNp|R zc?sEU30tud1Zhn4V%9Ao#LkBr(U$;R-aNSpSzJ;sfdR8}I`ZB+K*1KnDu(u16xj{X zUxq9z+D+W#( zer>v`wB*2;`E8&gQSxy< zp>MN#t|8E+=xSDbED_~4Y??;F3d;qPHcAcc(tkDk;d43b4+eosgH_|#m)C3SJF3es z)v?Ok{DUns@!`w$u5YhpsGZk;KKs7zq3^YxXBn4juMel6nLchZ`zW4%Jddvb{EEjq zTo21;y(hPMeT$&nU-RDY<&)Y*-7qH)-SW=S^1YkXQgtJ=&UHRDeS2`>j#NIl0@LFd z)H$lg1m1SLIiQrPge;^{^N;TcIDurdsP(=;pe~!~Q@hp>BBRGAvt$J&{EzX64Bq zvo0@E$ohJOv}x9;2Ib==ehH&AkP#@j3+O25(M$n1n}D5v7*ITu(T*}O^yq;}Bx}gK z1o{AZT1LHt1YW;_qO`}>R)W9{iTzhbW@S>&`0B={CW0wKe=R5E(igb2)t7NO_TQsCq9sR>{N^ilBc|k~ra*5UM=vOt3%{6A&`oCKq^E zqGW9d&29MuvL}{o{X*&1m4{hV)U(R#2^HP}nC{(rU)~(IGs3w0zT$lvzOnhr%1W%L zI;nLEOw1QtWGut)WfWW$ud@bSV_O@*`NPAu7sg@8X+y!2ZN|*IJ%C`JP*4?MQi7ni zFZ?*IejEl~x?&fPC6wSqJaLZWs-(m8?92@5wJ7<|ZmDGX6yBf#P-a z#YwUR8T>e9LQ8ix%CnS8lizAH4ol+tUH3hX%?uRo7+AG>=N@EDjM( z&697qt|b+{MTa$tjaSFYGc0NT=@Rdam0quHGW?Y3sP=2#+TN!7 zmx2G~@r>S^_A5Hr915eo4Vb?$*g9O(UV8P|qJT5rmDM@37FT^8m2AZudT$()IFOMi zk@-A^?Ly`qI4^ zt#uIKPYth}M?WiZ>H{|%o2^I%o}cWDkaQZypkaVj>Efi@m&XSY zR}vIfzv%_H-*#lJ%f17 z;tOcM)zWV)n=zQ%8+Qms?`G44Zyy)@t8l{aGBV;v&29(&qk5Fgl`<|qzK>$6K_Pik zd(6$(L8FTFpC|9*hP0$3W_S-vz!o<1qsGwo>{(NV5Ddwblqk(EeH-<&&M^XQQw)sI zv+6*w9`6(oD=+0df-EEOQ-8&GqY9A`Z~789&l@)tJr(f66Igk91V616Fx$<0`CuU= z?8(MV$EA1f+@U*-lcWT5yrtN<8MI&8Ee!+9VloLp4y_Gqp~&%661>Gd;xKR-cnOem z3S&6`NjGp+i0_^3*@l3!V3?FNy|_;vfI}7T_5U~|A3hi>0ea!IPMIqQ`r1o3q5)J0fwi2Ph)KlGYEzZ^R1;Qr*D3o!RYqo z)hCk&6k0{r;m-=K_uUJtXVsWmo1Z?vwaT`ydSlm6Obq`{Gp_rH0RHGbLUKPO&h>o~ zkjSqUE*lF5FSNF{s+LcK;B4Do!br&{*W1Z0t2Y zK{@YLxOIf~-{2Pzpu$Uy6o!N})jUdq6C{)cXe;Da>fGFidzz3qGGUpCfhm3&flU#1 zB>IlM8X70hamBQKVB7DsJsb28l4c>@8_b8q+fLU5h6n4I%MMhcgujT|?AG<&h7UW} zp@fLc+jwQEpRzX@Vy<6b#_#2QQArIr#{Bw8DhlM7*JyFTr8&hyk#w5(zdt-YoR6oB z(T7i(45dgw4C0IcHozrP-YgZ*_QF_~)F|fZEuFX>UK7wDh13VgiCrPt5e%B!8v%8O zh0h1Fxq}pwM{--k!Ii65zoqS(82<62n0(Wi|9^$Plnh8m@8r{F1=n4Z4>`0<-o4X%F`gZ#-c>-dd#*nZpSB~_opZ%6;ogL;|Ao7262 z^oquAt}fDWxXx7EYr;mcqjWHcQF~LQxzkikdj)M=%gW0)r+Xb04igX)TZS4$0g@R$ zH9~2`d|+RNi`DLL3KK|?dtLPM%5dEPPG>o-;5R8ZuxNPa=4P@yk&(%t!P97Y!S#rG z=lu}Ols|s_*w5A1KYp`v6{Ax`W4av;9FfY<%RX0D+QF#W6~I3}xMnK^wrTH+``>RI z$h4IPU+`Uvhlj_?@0A6Y5Wd9QRqAdf)fIg2T>a-I9hn)TTZ)Q`TEA`FV-ILfgbSxR8G;COxbaQogpL6Vdw({%F8_GcRdZ^heXBQV%=@W~(PEJ~Mq9|E5xt~#j=^@A z(R#Qjjb`}U&PQ~Ll$hX@x%1{0T6M0>k_BQX=uv13uDsmV)3f+_x>?DH_P5@^>@J6A@57F<$!^^+zF!Ab9VwU9SDX;T3)WZ!aRbH7E`5gV#^hO5Q?XvCG)b z!NK9aB{rnZZ!-wtp-9H*!1l1fG!+OSp_}tAE?==-#_Q7{b8>Kz+P-}| zvCyo;ULNPt&HKd=P(Ng?DPqMk&`Qg#v!Ts5E`Oew>PgHREF_G5EaAf-_EOzGhp(>} zW8jU=uwh}Wh&KVg|I(`A#(E5ik5ZIy_ut&M+p-fS^#me|1Rt~pY2$wPl_l+A2aJqj z>Y4=lz2em3q@1&PbWO`wx2$Ub(Z2i^*_n!R`BRR)^KBjC2xAtoJz8(;496 z8+tlvnd{rdR-%bpAKv!EJ3i87@o}BQ_1AZpmFWv@Ppq8-cUig2p;~{pB}!M6xcc*q zeZ9>tSpC-tO4g!NdLj10^>b!V5h(Pg_%)c5im73m1y+S>w87=~DTvK;W6c zSNm=g(|%6+J+;iYQ7|>E)8n{YEJLpp)HgajG1RD)n)q<+64F_F*P3kF;fYjMTUQrZ z;Y+=pipAwqEEP%i>F1@jG=0I%gCc|Ly6X{sllcixje+j2J~gjrbgP7plhG!%?M zJZ69BkmHi9??Lsr5UqmTs=ms_E%}0AD#5}*pIy!Ee2VF8!8VKOrP>q1nY@Ry(t=Yv({FnXTGH-o; z*uQ>%*M>H`;&ItiQEX}tUvzCSIb7I|GOT_|yqmv%(1rSOyECiJm-zf|HB`^7oYI$= z9j3`#=G76jE7Uakf#7J9mQ7 zqv1cITDUB)RwhRgM3;|HG+D@mwk#iGkXs#d%zKQ>e|8TJ24m+{`~@`9u>B)b!rmln?;4LHzE4-qGznuWazV$ zNV!3j`7=CL0GNWE&Mz%}y{Dz5>z`H0c||`ypY}E#q7qSTf0!{%{bHVe?51Ued93A` z?u_Wr0N=%4K?4%i9kYDDpQ}$&ADILjS@99zT%4KK+9yVrP0clX*I+PY^;M3sgss#$ z50A$Bv5(!Rn{Z0awa&;hJ6>#3G~V!8N0^LP&TcQqO?>d^(e>)nTvXE&oE9I8AAOI^ z6fNS${Q&!nk_x|&j7;^(RhH&F&*~1CTxtHW%uDz4(fBGUX3M&*hgTG5C-C8r0+_=1 z?e+EFuyZFnJ6oN1WAR0I^$I);R;&9;DyUd2+Tt(sP}YlzZjOq$$aDWD!iGe6j&nFOM=R}F70S(fi)vC?I2zw%g$L;w6~*nkm<9BZ=q}M8m1T%`Lv&?VYcMNTg6#m`FpgQEgY4Xja=32WE zY5~->E#>#HT&Cda<^#L&(vHkas0%-Yvo4v@! zO)~uz!>c|z3bE01J3@^xJF;p#S_V<+*Kk{)KvNBBZ91}xUJ33hlA|U4KX-`PM{8k* zM*0Ig_wHr*E}OLxqc=JVDeDoAkSWWyZKMAF6V<(X7gUqfgAFAI|M=rIZS*vVK3_5y z{~pECQWwO%(WIJ_UGk>WdHt)Foc#O#-d>3ZVppxZROlT~FJaLw#=TK)!_7wPgiL{K z$8@7nhKm?ucpF{P_@tzjR8qz;$mnn>Mex4GFuVwo%q}g?-8?5wj_PkJzJNg(vfqN~ z3W)2NoZ}!}$h&BUhemx8n2lxiVq1dX^6hny5lOikUyh^wE*%GF2x=_zoBWuYS*A4N zV~f0ew@2#0dmt#EyZ*Ee>wz$7GIz4?9n-CwS?`>2bPLmnnLOu2ud#>%bxNv0Krd!! z00R-3h%KtZVa8y4QoOJiFaB+ENCGS{rqruTN)11ukyH}As)QYMh$y`~Z~ksXM|s@) z#liS0XjGKdgNuz@4oYJP*){Z0q1~D5&9Z(;7)zbMBq}=Y*7+R z8IsJ>-%#57xa$3!kgCo_S|tYB!vVYUpV?4Fy4x5YmJ&DWn)2+FUXHeI1auW@M&=|_ z6B99Bns5xyFz;_eV46*zFH-#??k8%-(k6m%5f1T2TTlcxG=;Lnj~d-+O1#F^y}qzZ zTwP4N@fTlWvuor2)2G)H#xC)YVES@O;V$H$?<=PaA9xp=ffV#F67#ysRSV7une}8(^T6W(m1i0KFo)ET9L|;WbW&&>a5YHp~TZ zK>Gc$p%_#vQ!NejGZ>@2#=l%ZFH9IBL^fQovgzm$&lYKSS1ehCX_`9k`Ym7~KGV-& z!#WBX1om0j+5JTBoTtWVpn~}Za5z_imRFpgS55 z$5@sw>=dM{g7JuqO>+=92BfzkBN8`vckhd>Cio)B8hU!SaK4{9I;W1`seb&ee#D<3sn6!OqmGg zm-vJbEqB0@F7w(pc5ZI+C6F~&aLJZr}c-1}s02`UZy#~v%!%k4jC$o%MWr^1c->65SXCZpsL zB#nW4?zNju*IXZ}Cdc8{oxwryWgVhC#@}VsA^`Kzg7~oV2c{e`z>m~E&a8#cGkCSU z{evU!EG#Ta?wBrK*Lc+a2J!!;or8ncDbIEZ#u83Im`lYZtY_9zn!xaf*}z68{2dZ$ zP?=MIt)#}*+5f}Zn}B1zul=KsA(Xi}vovQWk-10%8l@GEndh1F`L_02d!O^Z=lY-bU1wkWy7pRot)Aice24q~Om_sPhPloHI@=b1 z@>9a=urSUEaxT^$)6>&r=8~>7zPYfl5Rap72VzL6l|H!p!BmEpzRvyf?5ISoj=Bf50eO)Oc)n-G>1vSB?%sp#9Lx%d2!A zDvDTO6y=T8&`G=;H3!lz6;H2$FSvQ-oo$qFxHk-JT(WiuOIWP>>z<>n+uFybn)ow+ z$#EcG{BBk4?mIfjOP$?0C$i&59m9*Ez`XFKCoVp*ljj?b;H+**HM#mYT2FeCdRLnl z6 zU@O(SDaymach)JZdu5o{XRSY3@*s-h&Cx0SLO#7SLrO%p6<33pd7~r=B1NX88VFF_ z_ZXPEjF*>}Y!*U%5c(Ays_gj7LNuEMeyV?M!s@)VU>C07#DVB6R1V~zCF8^Zyc{rX=fny46FyddV(GzoWl#}WNHf^HCN%R3zWs105j1MZBqypF+ANanG@Yx6;zk>?^TawJ= zBZDa_vv)7)uE@*FBXi~y7RFD^BXyqWbWiS31sP4=Pj0Ho3M77n)*eCKbSC@_D*z-f zz9zxu4oslZSduisWNLWE-~6Qxos0*M5y^uS<<0=-wTgy=}i%?<;`(}gM%13s06kH zl^ibt;PFzBofDw}`V6*}E8VGR==8yHlAwJw(dwT_02$Gs!twe8INW}qWw_v&Sy@@F z2b(sar3v0`!Si~~<05iS$Rr+ooom===t!cp5>62upT}`nnancDv>gcDH|yqyq#x$h zs^}bPaiS;doHFVm+@FN@SJ~p1Aqg)_ulSV7vV-A!J$_w9e(%RL?c4HQp6P3}xlep4 z)cQW8p6mH4VY~UK$En7zK!upOSfy`|Hr{jP)kY~`D~*f$e?F_D*{VQ&Kk1Ua)R(>8 z%gW~IdL-3e+vzD8_pt4)m*1UFj_|ikY^uX7fQAu!rjVoXN zo5w_Y{3>uAL^ae7GR-Vaj^ElYP~*&w6R-;qg*WCm;hvO0$EMc*X$7dxFW*YfoS zh9XUYJm+~o-z*?}+zqnB*G5Z@j&OMl4Ym7a?9aKCy{DySTQ1kz#n)0v5+x6FD?D#` zo^{vctM~9?W&1{By1TP=X*aDR18rf1&)#T#VG9>rqvd*?8hodVu-{2adVanVabsvC zvizG%kPZy%abu2|@!-uJq~npj%r z0ieoj^9ywbzM9qwH)ktD({+CBQ~SJ2tM;toSBpL06HXY-`pffN%H^79PEMK1Ht2iL zb3!ux*ZalYk(#l2oHre5b{;ITtsBiywW!!!83H-bfeK#mTy0;QN@^a zPXN2l%+B5?93+u_;NHZuWAT}a(%AD5a=@f>g>Ti`T6RkC?ocH4OFI#m7=SPA)BRv5=IFTS#2=HI`o~!FSEtj6E%DuN*ZoF?Z}v66;T+wLeoT_HSBQP*=CEZRTqIKzw3i7gycKIxjO$(K;{d?|prJGJ8@= zGEy`OK8=(Q?&Fx0maOxNEzXePHF&{l&?%rk5gk9GQDmoXd}5Aa_}-1f3VVfvbmy;@ z`R?O)q`j&B)yZ^Vy1Tk`EiYBfP!$JnL2X^t^u<17rtfO&zprWby3f(-JM#Me`c}#A z?j0s`3@7%;7;+Z`3Ow{{3rjFNr&gH`Q<=<#TjT7-9j#}1tIU%U5;}U{^RzlzJ(BNZ zs83#N^^ME)6)R`9=G7IvR6N<@yYeoqTk`CAu$!-n2eralb>o zK0pU95R=q#>^%_m`(wl`w_JV`8X`F1kX!men#1`tr|kAg55`8SD)X!Ayj20uMVYK$liQ+(INRo^)I}PB{Af8H*e4>bKEW@)e9YIvmY1XNLOkow zq1#vnrGE@%8%%|~<+bJTE$(!Ok@4E6+Hc=P7e15UdiQD8K)iFq(EbCwz^66t4<+Tz zu7H;>ruW*&@-!~w>UVN03O8^9u_P0s=e`{^%h&dlSeIm(#Q3+acOoK>-;nHN&=udW7hc)CK{FL;NCb z4nHPQ461Czw67hISY7ak=*^Cz1gHgVDpM6TmN{nJfFSSvTzeB@F#pe8WRMpMASQq; zHH?r8sNr8mcp(n1fdOAQ>ZmCw3QI>s#>I%+3q;-v<)eR4+7*e^iQ$HU*^ zM2wzap@c#F{l;CpYWREy1g1OjvIPYNbOUjh4)H24tre>+J=Vv-5$&XM@!}Q`DH~|m0IN$mktG~N%oDoYZq2HEB90haB)i>x7cLou;SSZyy3#`2JvXKfl6><&UQ866RXhlf} zHj+awM?CPbV*it|H)-5_<1`2IoG*Xk=CdA;NDPX}3&Ua(vvVcWLWSQJ%kJnEQ(fv! zHlb%Q=X4CdIo5It7lrngfDrSY-0Gq{B95|3^|m!twyu>_G#$>D@hiMrWBf88#@1D` z>ugQ^je=7*%gW0y*s`Txs-}Y=0jpxYoV_T3gYvvbs>fCX;+ zaPwwHCO#MI>8u4wja-_iNZ*J%5-C;r`IxNI!$X_L{MDax^YzoeNw%UtfZhVqVy2cH zk2SJNy_2}dPv_Ejrn^B`8q!Qf+>j7;eKZ#LkerWbN^_xlUej9(VtWbei|}uTo9u51?ZK zL{j?pv&I45g7obc78X9TN&vH%-A_vPk3c<}ACjO`#lm)_uJp?!JFG86v|jS6zC-us z97oQi0@bDVGnxyHQXWyESv(KzbYD9i@Qa5AiI$dDvQf>{MrZ!|$U3txE%zMOJC{({ z`q}Sk)8)CdC#nTcu$z64a+ynIuTmQ>*Fu#5&|$((-**VHVUWymbYA<$0!DIW_lNM< zEE-R8gBGv{MS;oTI%dRWn!N6}wyX9!>3@?1X47Q;RCkSU?j^bsxN|s!l4KelCSLG9 zcgAZcxZ*u;(%ycuZ>jB@D0_9NL&-<%AJ^}yxz~?SzVlL@+}xBF?c;W5i01g`Y#L9q zrJB15_FEUZ!-cx(j4Aq>^;|it7gy#q=iI2iRr&crpW>CVJyz;(3pe+Id9hq^pTYG{ z5ZzMfW6s}zdYa|772ZoLr%jeAZQy1VoS?)yu`uPd#_#w!zGwd7$f`We1rCmg+W|a3 z*0`^Eu#=5_W&6oo9qLzXlxq>m{BHUJ<0(73h3(8&giL-)eiJSun zP^HbTtzoHem{NB3S^mw=_8&s5H&VB!3isa}Yuq@d`5tYdU@q4&KQ{4!M+Qe^qRl^O zl{|cH3EuaFNOa%jLt&|Lf*}c>C2PL7hMOPS7V;%D!CPawUqX7Mwur`>;klyxcDu|1 zzf;_9YOLS6=1tA!k{+HfD4OFf-nu<3U8!QxV<<9WPrr>|bQ1|p| zU0*(ZRnYRE7M&%`chUV>iPIy;uN!+IvXw`UvY41)m*#JN0H%L?~Jz1 zV?Q3XGyJPV``y#N*~t3uM(b~;r{H^5+JXlXP=MrA?Eotv5 zYeeL+#r3y8{+q zq&7cPH)(i;dI&;z8h{@KqyJ>#9Zp}G-7sx-li|s)y;mDu>0K{HQj5Rk(>in6CHEE2 zlI)VAW8|cK+G@KxHIvpar*~anLr2S;tP{)Tu>6cBfY(y7E+v(RJDU$3$}}pis89pQ zgk->^^J?(ZzT6QM5y6~ziV>+4Ffq_yIr#hMh!*p|5k{;Xl_lWh5cZ1a>ZUbvEk*Jhnpgvu)6t>cK*GM zUdbjlGZn<)LeB1x+wzR)SO3cy2L)@m*{53Z_pe?S{k`*26OiEuhCd1!3CaIx@rw^6 zouF|MS~i(m#_T-pN$tl)Ddc6Bn=Ytx_SN*LIId7cGsn}$>}-9dhWVZP%(4pB8;^HS zfBpTDYCCO0#JV>9-oo%O0)TT@JbC#g(TC`K37p3cSBaF``nN(PK%+tDcKsupOvAab z*BZ+=SD&#(<}Vb#OmB3APV4C!x7;D;)w3XhU+jBhbH)3&Z%s?gg!pzX5e5C)Y<*7S z(uv83o2?!V|NAAZ)}(B2zjkM`HILfSWu2un=hpZ;y{qa=E{E*x?fMAK@T0@`wr|}#Na6o4yN){OFuki{ z5G8bl=5gEMleFctD-(05xnJGhA6HOvkhdewNAVgpFQu~j-PTjSXKy`Tl;?iM7<~ik zqW`kxl%`p8S)}vUN{{gGE-&YNv7DDbCV!QfVTqz%-izG37Jg1Xj#>&^HFg9VURru) zpVk`Vf9C>A-L1-bJ}S2V=l)gVoA;TON;VSnxAF{I`)yV5l*F%R%1X*FjcT52cle}< zB*!?=Z#x=3u0Z89JowS>YjaZez6m}_i;vty_jF*BJD`OF`JW&CmRezM(jTpjIrn={ zXtzPY4sQ_t3#Ku%)bUmAB1jnGI)L|PKLqi{;v3MdBJ+ZIbO8JukX03kbz@v0Yz(%} z8i1KA`E;CoM`d__xABSez)L@EF9^M=$4~gJs%OU?73%Zn&jSiSmLxaeytvhnt;k6AISWOXU8vzi>Mxh-{3uHdMXN0<&mjk0W&Vs?et`d z1ci8!Y0K1!wHbH!7tul5F=I>+^bUU_}L(-y%^P*S=n zxuSbSqfKdpN%Q&3my#h260T!q($QcpAx?CHeuoYgem|rj6qk?2%?wjX+Er4Xym|34+j}qOX4f{|%r#!U!NGg0N{AHO0DWi#iNp`%ffjm@?#j z&3^kzFSqgAuYaF5CNVXX>_{40g%8Zm_RDA(ZE4t#?2(&hcM;%uZTF>{*%^1o$o1*Z z)7Pr}HccxhFaLc9!!!dD7$K$+!mqPlamaMZ9N2kqpXLn~=)uhc4J?``i5iYO9CNTQ zdT>~CE> zIP$l^WDhF62u2f3n`s{!^1~Wmfrq&dQ$8Vgw}fMtxOhm{`}FvBFj*s2hee`+Y`pBp zOp*%y@b@1+7$9O5L_IYHMxOlsFFQ?6z>!8|)mXUuu{*)C{005L2b5q5@`Ow`#tQ&y>N@wx}&N)q3^W64)BVkJ#b}B_V}E zGW~TkuByTBxDu2}(m^i~nRzZV6 zq!Z{riNy}a64r1<7asWg)+Jw<#L;(zga%Vw0aqD8oKkj`6Y+e1vN>ch3GqQf3+n;u zE$_~oEz&Sih}i$5%m4J8)qN^-W|yy8Ro_-*X?nB5KTGChQs%t$OS#;Ys=qdBhC+Pw zhu&c}OCbvPYHp5m>%x9tOetF|4ySHUrSVisc84c7IQNPakNo&HmL6^?24{-v{i=4$ z76Unl-eYCWvg?9AiR=n_H7O2y`l3kzCIE-5T1|jN&U# zwv`81{3sPTGD@d-JAK1lZo8p;bQo@@Rd@_m^?Tjpp)Q~v+5IDBIDhY1;df?Qi{W>< ztpBHOP;rG6~Lj^{ks42r%&WHfOUQBx=t*xr-IhN z@Ow%L^aQlHWhS_?Ab9nFZwxiYsh6h<(juG;Y@sm>k(gKA_@H1}{kTcs{rp2mj+EfR zdjVV_nvN|2(RiTb{2*SFrkooACyI3%;H-nH1hLV*RJZn00ht0)OG9S-yguwrJf@-p z@8A`Hk3`C>-W9JjJu_4A`7%j1jy-*srb#>ixb)~?(PXlZJt}{U+%^P53B?ALd4KSg|f_aq>x$)3}NZuUs!8@Y{ z8NpLGLi{-KOu})5l8v0;ME*vYfmrG%&}`#&{& zI5_su;YVV-g+Yg>ujx>8OSA)<4ju8V6fkYb5FegT={XjYm=b^1_Z6MUV|#Wu!<8~{1EYJCFe$?kfMwi`S(;ed90VVRvwiO;0QpAFl2S0{n>dT`A{Sn@< zl)?(-j-QKbwd#haz?Y++KW6}buMAUkx>>u-s&7o-kCFDU1yw`Eg9vc0knNECi&n=m zW*GYnPX)qEi7*S2$l+1(^QJg}!CTOw=qylL^Z+m4h6{|64><}RG`CKqSbwuZx?ZfX zBx2HV(b93ulWF9fLSvA#kRd9>4ChgC7_PWKv)i~ZJ$e@+>;3QrLm$~d$4CTSd>_e( zT->U}UrIP{$aECQ5slm{G3is97-)`#O{UzEKZ*i1I57cHH$+Z>>zGLa(#E}zl#q`K z0h@C4QOi%T!DXL*dwp}y7|2VK6^AWZ1Q*)~SXMH&Yl{u>-CQgWy@k+wO&x4}%(5DR zaqI`owd$0=TV}pAzPsX!%+5_;+GA}ZnaXC@`hQ<4e7yP7mCq^NWgPvVn@hrYzDiN= z+^Ef`QXqdZLMeG2&gV+A2M0C{PflJG?pUj{MfI5dSZA0Ny`t&mv|Z#c!@crqzX@48 zUl}WY|6bH(Y~WXvVhnHi*Cp_0YG=-bD7S#NC7+tYwqe7iqjt*5RER(E!oCod-M|7b zEM!CTFpra=Ta-2;!$;9r{>yQoaTn=&pm{l+Z7ZZ1vF8?yla5Ho%VFj4EiElo#!z7h zirP__5_>X0jkTX;+~I0liQkW|gaa1An~;9Wp;NksQ8>h2iF;H6`3u9S8pUSt)~1%1 zq;kVbIIvjUNk-Ue)o4UY5X(22Kn!^!#LkAuWGS;V+=&k+2fYjZ-6v!UGqI(SbP^On z!_9WB(MKW`key5N9WcT$Z>FFd2?Y5lg_89PsELXa3DVQ}LD<;YsVKiqo{{Lmh@ggi z!i#0WR1^?8?YZ|JpDX?&v2xwF7=ztjo3mFoYNt$eW#23bNuDeGmaCkec0R7!0A0kN z9toHE5qik2h&8V)TxJ!83@RX_qA`EuVpln;6U3I0xjk&0oIa2y6L%`A3pea6l$V&w z=tD|vP@@UjP*?|SeGnY%{b;8)@ju}tV_YXJncYWAAsH)ZmdLmkKs5WVUl%9dmg2P? zKYoy!ZzQnh@*5&bdh9rQ@ORV-^g~$4#D&xWh3u`p(XQBqB+L@N<=Y>7XT&b1Ly>o8>;t%polecfcf$ zczj*a8#s7MN>{B~RrJ)$)AJ~rr7ho);wH9b2rDbNq9Ri9$Qu}msg*#U9>WSdHU7tq zUI(bd;^A8NS;9xVW=zV5>yVF{rM`moLShCXhC;v69A~a;Ho}Yc)b1Y6ZCt}@fLq-8IJ?$-SVBm70E1-(&FT^VOBK$5pKWxOn09=hx|;`Mlul0 z(J{)ll+)^$Bt!Rqjxorn`8K6@=FCBx!KRQ>I~dqJtY^oNo|f>-_&Z&zo#`FOL?pUc zc@@m8ZWxiAp8YKd8c~OePm|0<61$=2G@+$A3oUAhwVj>a4t{>O0Nk|l>W^WZ-61OK zpXE3szO@xf-lEPq9E37Ii#w|8>yE8ft*NU!feXcWl*F z#0-Be`r*86Lw!n3fp>JdpTg`mHC8+patovDUZ+6w}kVxDJ2}L|F$WmRPg^XnDj0zKl*yT}jRqlowc*zYQo#*4Akn*FC#& z(w4QN^vbzmHp7#LVR#fJRBB|Iw9K7>M(7Tr45%r%7-d`L?wS(sJrl1&_}C1lysVEr zcKovtZKiR`HJV|=CCpL2TpfK%|Ku6wg~;HavK3g70<%RtaVcurFED76c`$W_`8N5hFN z$WDL^wX=z4ucF}i@UW<~LKh&)G7pKoHhK;8PnLD(>k`XyeU9?OvG&S&>2O za0VmJ0%mP_OU&dUDi17)UkWx`YMouuufqfn^(KV^O`sQg=YrWND5~B{-@x^D(a*iF zCSe_TViM>DQAP(Wu|LY~a8QOlRQ5YT$-LgRVs>iCLi^pB6R`k%C@9A-NDVS-|w^ozrA>7aAPoj&L6msc2}?LQJPxDG3&v4nY@=XgE|#@XV}PY1L6p-aVcZa)kX8H*Wt`mzGOye#?|>*ms)`7v*>QQ9@>9wFhbv(qdN=f&AA# zwHu%BUsdF=@U!PEcswssbw<}uz}n&^j&sY|PeSw7dsw~1??zQGP06WjTx*~ntmts) zE*Ude@Ynhx@OptOE{|^Cd7LfX1*szoyprRby|{)T3nyV#SbETx&`{PZIA#`jq+-u^ zTjcG2-GGhC1>Lk4ssdCSOw=Motg}|+pvh|_xDYb|kdh)4-Xu#7{bfkmc|E=R^Ye}^ zl92Nur#fP{5LE5UDVaOs<+`{vbA z_Oo_zyCFV|fOW^?xpe%*U`z+D_MVg_fI?bc$X?}QmOb#belfXbck<_u6|TXlM z!M;JC-M4eywnxMb=PA`T9Ta(<@AEEhV}kcm-l2%cg^EE}+j$Qpzmr~bzDaSt(>|w& zS0;x&2+inD1vxXg1BgE{YSSJPX(|aSe|V`$0vU!9z*$WuW|5Ixn2ore+hrLAvD6nA zrEH8@&=D`0yBP-=84VU}<{f93h17Z)?*DRF@!O!AmYUh(49f|Ls|fpLx4AbaoDzj{ z+5MBSuyA@p_}hV??v_JrK1;G+zp|^{N?FdppmgJg46;|s^zJRdbw@;@UZ)aNA)|{3o3^YVI!^8LQM=i~+u`7A*44)JI?(z(I%OO9d zUVX2V=br8;eBr6(UGCSV@h+j`+-WJt4b)w3Eb;H+?|c?6sERxu_3HIth>B)B$JFAG z#W@&moQ+&pUzca{U|Ad~$l(mbGhA)51$kPrmJ-qK6*+8do8rR*Wy1t5(? zuWuPvrr!&ctH+dM@)$IJsux9SFYC0J2 zLxyF*I(u2hEAJF_mov>wT+RQRPca&Ug~(x5 zL*5iggC^0qLPCiLtzVNZ4;HNZ$%W_(Ep;6V%Em2q)q7;p(@Th?;n`S<-`hRV#l4Ox znx5VnO2Vr)t%Q!>42D8o*>5;Nqbv`?`?pjAR#zW{?D{xX2BU66Q}+F?Xb7HNJm#Z4Wn6RH-Go8cet>Q|Q3ROizs290S9dg`G0hl#+uu0TZB=e*Xj#AOsC|@(a7IlO=>=C-QL43XynUV<_yixyCA&}d3*GfHe7eb z0Y?-rjk?JTjiLRN`f0&RdQ{tUiD%zdb#iZBt=f)s;|tU$2wT#zWPSk%uRXp36$R5C zOOR~oA2iik=$mw83ivUIB95jjR21T>rjQ$t1Zi0~q0t}%B`IwwB|NXMWIREl1~uzm z;#LN7;5~K&wK?G%k+wy}>{Px5EVF5H0rV6RC-eX?j{NvQ0yM`qs2-zpY6SgIoU}-c z*$$Q1Y1pbGjr5ex&dy--xWq-6mfK--4$nVr^iSkutQK=)CtR04P_*w`F^-2Z3Mw$s zTCX|DD*vp+BIKRH1XN$QkI-WeWXqO&|o)jPmK~ArA>?i1K zbJm$9$`XzWlEzg@(rFBA55s?BVDs&=pj(bB8vv>9=85Q+!91E;Yvjlyn85- z9!5F^v3w&VvWP?oBHd9C?KA(tQ@bpfm`HXZ`Lj%eN+}<-sa?@VEiTSkAQg}qvtm!b zbCxF@G*PLn-*tI4eDj1ELvq#GEbHYMSwk`>gGV1fP7k*;k;fLF1_L(I8{_u`oCnb* z0Es350>Rxe0)|9n=PfPX>|P&_p~CC?e!e zOmT};J(rMkyKCh0gR>9I!W0s|3+Bj;UzpCEVbDoRIH5=1n@LYqe@pX7^z+oUEj*LC z;Ri&Sm%B>xqkT~Vo*H5#_Oc5{H1WU!_eB)uFvh{?gV;nLPZGLEz;EzFMFMZXDz**W zaYCP7CMA{ZW0j}Hk%!7&X}IHd=+0akhmJ9^$c`DKitnpw9~|zSw*D5^%U$VMQO&vQ z^236nLBs-vIGMsKatlCa=#1Sv=Y`4_T02{EATKF;AfP4+S2f%-G(cwFgCdfdxmfYI zJbRytDLfC$AY;Il`vZq48Hh!eV1n>UDSV>T8n^dYvjI+BOp8562F|P0~F4hlr)(