From 50277cbc0a3b49aa17d992c11c6c8fdd36113457 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Thu, 11 Sep 2025 16:37:20 -0400 Subject: [PATCH 1/9] part 1 + 2 --- src/main.cpp | 2 +- stream_compaction/cpu.cu | 52 ++++++++++++++++++++++++++++++++++---- stream_compaction/naive.cu | 31 +++++++++++++++++++++++ 3 files changed, 79 insertions(+), 6 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..b29384c1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..f8b35528 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,11 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +34,16 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int oIdx = 0; + + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[oIdx] = idata[i]; + oIdx++; + } + } timer().endCpuTimer(); - return -1; + return oIdx; } /** @@ -42,9 +53,40 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* tempData = new int[n]; + int* scanData = new int[n]; + + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + tempData[i] = 1; + } + else { + tempData[i] = 0; + } + } + + scanData[0] = 0; + for (int i = 1; i < n; ++i) { + scanData[i] = tempData[i - 1] + scanData[i - 1]; + } + + for (int i = 0; i < n; ++i) { + if (tempData[i] != 0) { + odata[scanData[i]] = idata[i]; + } + } + + int scanResult = tempData[n - 1] + scanData[n - 1]; + + delete[] tempData; + delete[] scanData; + + timer().endCpuTimer(); - return -1; + return scanResult; } + + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..9b4b901e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,6 +12,17 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void naiveScan(int n, int *odata, const int *idata, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) return; + if (idx >= (1 << (d - 1))) { + odata[idx] = idata[idx - (1 << (d - 1))] + idata[idx]; + } + else { + odata[idx] = idata[idx]; + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. @@ -19,6 +30,26 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + + int* tempIn; + int* tempOut; + cudaMalloc((void**)&tempIn, n * sizeof(int)); + cudaMalloc((void**)&tempOut, n * sizeof(int)); + cudaMemset(tempIn, 0, sizeof(int)); + cudaMemcpy(tempIn + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); + + for (int d = 1; d <= ilog2ceil(n); ++d) { + naiveScan << < 1, std::min(1024, n) >> > (n, tempOut, tempIn, d); // Check that this is the right blocksize!! + std::swap(tempOut, tempIn); + } + + std::swap(tempOut, tempIn); + + cudaMemcpy(odata, tempOut, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(tempIn); + cudaFree(tempOut); + timer().endGpuTimer(); } } From 3ffd4d022f915ab22a8f1f5ef5fc54c7b00f8a92 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Sun, 14 Sep 2025 23:04:25 -0400 Subject: [PATCH 2/9] part 3 --- src/main.cpp | 78 +++++++++---------- src/testing_helpers.hpp | 3 +- stream_compaction/common.cu | 13 +++- stream_compaction/common.h | 24 ++++++ stream_compaction/efficient.cu | 133 +++++++++++++++++++++++++++++++-- stream_compaction/efficient.h | 2 +- stream_compaction/naive.cu | 24 +++++- stream_compaction/thrust.cu | 3 - 8 files changed, 224 insertions(+), 56 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b29384c1..69eecb76 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,11 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +const int ARRAY_SIZE = 1 << 8; // feel free to change the size of array +const int NPOT = ARRAY_SIZE - 3; // Non-Power-Of-Two +int *a = new int[ARRAY_SIZE]; +int *b = new int[ARRAY_SIZE]; +int *c = new int[ARRAY_SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -27,32 +27,32 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(ARRAY_SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[ARRAY_SIZE - 1] = 0; + printArray(ARRAY_SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); + zeroArray(ARRAY_SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + StreamCompaction::CPU::scan(ARRAY_SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(ARRAY_SIZE, b, true); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); + StreamCompaction::Naive::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printArray(ARRAY_SIZE, c, true); + printCmpResult(ARRAY_SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); @@ -60,35 +60,35 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::scan(ARRAY_SIZE, c, a, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printCmpResult(ARRAY_SIZE, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); + StreamCompaction::Efficient::scan(NPOT, c, a, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); + StreamCompaction::Thrust::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printCmpResult(ARRAY_SIZE, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); @@ -102,23 +102,23 @@ int main(int argc, char* argv[]) { // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(ARRAY_SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[ARRAY_SIZE - 1] = 0; + printArray(ARRAY_SIZE, a, true); int count, expectedCount, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); + zeroArray(ARRAY_SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + count = StreamCompaction::CPU::compactWithoutScan(ARRAY_SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); @@ -126,25 +126,25 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + count = StreamCompaction::CPU::compactWithScan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + count = StreamCompaction::Efficient::compact(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94aa..d66efb0d 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -53,7 +53,8 @@ void genArray(int n, int *a, int maxval) { srand(time(nullptr)); for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; + // a[i] = rand() % maxval; + a[i] = i; } } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..9e178f03 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n) return; + + bools[idx] = (idata[idx] != 0) ? 1 : 0; } /** @@ -32,7 +36,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) return; + + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..bd0779e3 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -66,6 +67,29 @@ namespace StreamCompaction { time_start_cpu = std::chrono::high_resolution_clock::now(); } + void pauseGpuTimer() + { + if (gpu_timer_started) + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + float elapsed = 0.f; + + cudaEventElapsedTime(&elapsed, event_start, event_end); + prev_elapsed_time_gpu_milliseconds += elapsed; + } + + } + + void continueGpuTimer() + { + if (gpu_timer_started) + { + cudaEventRecord(event_start); + } + + } + void endCpuTimer() { time_end_cpu = std::chrono::high_resolution_clock::now(); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..d54971f0 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include + +#define blockSize 128 namespace StreamCompaction { namespace Efficient { @@ -15,10 +18,88 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + __global__ void kernelUpSweep(int n, int* odata, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int k = (1 << (d + 1)) * idx; + + if (k >= n) return; + odata[k + (1 << (d + 1)) - 1] += odata[k + (1 << d) - 1]; + } + + __global__ void kernelDownSweep(int n, int* odata, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int k = (1 << (d + 1)) * idx; + + if (k >= n) return; + int t = odata[k + (1 << d) - 1]; + odata[k + (1 << d) - 1] = odata[k + (1 << (d + 1)) - 1]; + odata[k + (1 << (d + 1)) - 1] += t; + + } + + void scan(int n, int *odata, const int *idata, bool isCompact) { + if (!isCompact) { + timer().startGpuTimer(); + } + + int logn = ilog2ceil(n); + int nPadded = 1 << logn; + + int *dev_data; + cudaMalloc((void**)&dev_data, nPadded * sizeof(int)); + + cudaMemset(dev_data, 0, nPadded * sizeof(int)); + + if (isCompact) { + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + else { + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + } + + + for (int d = 0; d < logn; ++d) { + // n / (2 ^ (d + 1)) + dim3 blocksPerGrid((nPadded / (1 << (d + 1)) + blockSize - 1) / blockSize); + + kernelUpSweep << < blocksPerGrid, blockSize >> > (nPadded, dev_data, d); + } + + cudaMemset(dev_data + (nPadded - 1), 0, sizeof(int)); + + for (int d = logn - 1; d >= 0; --d) { + // n / (2 ^ (d + 1)) + dim3 blocksPerGrid((nPadded / (1 << (d + 1)) + blockSize - 1) / blockSize); + + kernelDownSweep << < blocksPerGrid, blockSize >> > (nPadded, dev_data, d); + + } + + if (isCompact) { + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + else { + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + } + + cudaFree(dev_data); + + if (!isCompact) { + timer().endGpuTimer(); + } + } + + void printArray(int n, int* a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); } /** @@ -32,9 +113,49 @@ namespace StreamCompaction { */ int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + const size_t bytes = n * sizeof(int); + cudaError_t cpyRes; + + // mark em + int* dev_Bools; + cudaMalloc((void**)&dev_Bools, bytes); + + int* dev_idata; + cudaMalloc((void**)&dev_idata, bytes); + cpyRes = cudaMemcpy(dev_idata, idata, bytes, cudaMemcpyHostToDevice); + if (cpyRes != CUDA_SUCCESS) { + std::cout << "Copy idata failed." << std::endl; + return -1; + } + + int* dev_odata; + cudaMalloc((void**)&dev_odata, bytes); + + int gridSize = (n + blockSize - 1) / blockSize; + Common::kernMapToBoolean << < gridSize, blockSize >> > (n, dev_Bools, dev_idata); + + // scan em + int* scanData; + cudaMalloc((void**)&scanData, bytes); + + scan(n, scanData, dev_Bools, true); + + // scatter em + Common::kernScatter << < gridSize, blockSize >> > (n, dev_odata, dev_idata, dev_Bools, scanData); + + cudaMemcpy(odata, dev_odata, bytes, cudaMemcpyDeviceToHost); + + int lastBool, lastScan; + + cudaMemcpy(&lastBool, dev_Bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastScan, scanData + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + timer().endGpuTimer(); - return -1; + + cudaFree(dev_Bools); + cudaFree(scanData); + + return lastBool + lastScan; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..e6525cd0 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool isCompact); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9b4b901e..d6d14240 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -24,6 +24,19 @@ namespace StreamCompaction { } + __global__ void exclusiveScan(int n, int* odata, const int* idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n) return; + + if (idx > 0) { + odata[idx] = idata[idx - 1]; + } + else { + odata[idx] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -35,15 +48,18 @@ namespace StreamCompaction { int* tempOut; cudaMalloc((void**)&tempIn, n * sizeof(int)); cudaMalloc((void**)&tempOut, n * sizeof(int)); - cudaMemset(tempIn, 0, sizeof(int)); - cudaMemcpy(tempIn + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); + + cudaMemcpy(tempIn, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 128; + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); for (int d = 1; d <= ilog2ceil(n); ++d) { - naiveScan << < 1, std::min(1024, n) >> > (n, tempOut, tempIn, d); // Check that this is the right blocksize!! + naiveScan << < blocksPerGrid, blockSize >> > (n, tempOut, tempIn, d); std::swap(tempOut, tempIn); } - std::swap(tempOut, tempIn); + exclusiveScan << < blocksPerGrid, blockSize >> > (n, tempOut, tempIn); cudaMemcpy(odata, tempOut, n * sizeof(int), cudaMemcpyDeviceToHost); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..0f79000e 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,8 +1,5 @@ #include #include -#include -#include -#include #include "common.h" #include "thrust.h" From 5dc34edbea1858185b6decc926eaf1929b92c104 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Mon, 15 Sep 2025 12:29:33 -0400 Subject: [PATCH 3/9] part 4 + move timers to exclude intial/final memory operations --- CMakeLists.txt | 20 +++++++++++------- src/main.cpp | 2 +- stream_compaction/cpu.cu | 4 ++-- stream_compaction/efficient.cu | 37 +++++++++++++--------------------- stream_compaction/naive.cu | 9 ++++----- stream_compaction/thrust.cu | 11 ++++++++++ 6 files changed, 45 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 610c27d4..28355738 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,21 +7,20 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -# Enable C++11 for host code +# Enable C++17 for host and device code set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) - # Set the possible values of build type for cmake-gui - SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() if(UNIX) include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") -endif(UNIX) +endif() add_subdirectory(stream_compaction) @@ -29,11 +28,11 @@ include_directories(.) set(headers "src/testing_helpers.hpp" - ) +) set(sources "src/main.cpp" - ) +) list(SORT headers) list(SORT sources) @@ -41,6 +40,13 @@ list(SORT sources) source_group(Headers FILES ${headers}) source_group(Sources FILES ${sources}) +find_package(CUDAToolkit REQUIRED) + add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers}) target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction) + +target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} +) + set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) diff --git a/src/main.cpp b/src/main.cpp index 69eecb76..933b4126 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int ARRAY_SIZE = 1 << 8; // feel free to change the size of array +const int ARRAY_SIZE = 1 << 24; // feel free to change the size of array const int NPOT = ARRAY_SIZE - 3; // Non-Power-Of-Two int *a = new int[ARRAY_SIZE]; int *b = new int[ARRAY_SIZE]; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index f8b35528..cdc6da5a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -77,13 +77,13 @@ namespace StreamCompaction { } } + timer().endCpuTimer(); + int scanResult = tempData[n - 1] + scanData[n - 1]; delete[] tempData; delete[] scanData; - - timer().endCpuTimer(); return scanResult; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index d54971f0..4e4029e9 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -39,9 +39,7 @@ namespace StreamCompaction { } void scan(int n, int *odata, const int *idata, bool isCompact) { - if (!isCompact) { - timer().startGpuTimer(); - } + int logn = ilog2ceil(n); int nPadded = 1 << logn; @@ -57,7 +55,10 @@ namespace StreamCompaction { else { cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); } - + + if (!isCompact) { + timer().startGpuTimer(); + } for (int d = 0; d < logn; ++d) { // n / (2 ^ (d + 1)) @@ -76,6 +77,10 @@ namespace StreamCompaction { } + if (!isCompact) { + timer().endGpuTimer(); + } + if (isCompact) { cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToDevice); } @@ -84,22 +89,6 @@ namespace StreamCompaction { } cudaFree(dev_data); - - if (!isCompact) { - timer().endGpuTimer(); - } - } - - void printArray(int n, int* a, bool abridged = false) { - printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; - printf("... "); - } - printf("%3d ", a[i]); - } - printf("]\n"); } /** @@ -112,7 +101,7 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + const size_t bytes = n * sizeof(int); cudaError_t cpyRes; @@ -131,6 +120,8 @@ namespace StreamCompaction { int* dev_odata; cudaMalloc((void**)&dev_odata, bytes); + timer().startGpuTimer(); + int gridSize = (n + blockSize - 1) / blockSize; Common::kernMapToBoolean << < gridSize, blockSize >> > (n, dev_Bools, dev_idata); @@ -142,6 +133,8 @@ namespace StreamCompaction { // scatter em Common::kernScatter << < gridSize, blockSize >> > (n, dev_odata, dev_idata, dev_Bools, scanData); + + timer().endGpuTimer(); cudaMemcpy(odata, dev_odata, bytes, cudaMemcpyDeviceToHost); @@ -150,8 +143,6 @@ namespace StreamCompaction { cudaMemcpy(&lastBool, dev_Bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&lastScan, scanData + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); - timer().endGpuTimer(); - cudaFree(dev_Bools); cudaFree(scanData); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index d6d14240..d7f772d8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -41,9 +41,6 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - int* tempIn; int* tempOut; cudaMalloc((void**)&tempIn, n * sizeof(int)); @@ -51,6 +48,8 @@ namespace StreamCompaction { cudaMemcpy(tempIn, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + int blockSize = 128; dim3 blocksPerGrid((n + blockSize - 1) / blockSize); @@ -61,12 +60,12 @@ namespace StreamCompaction { exclusiveScan << < blocksPerGrid, blockSize >> > (n, tempOut, tempIn); + timer().endGpuTimer(); + cudaMemcpy(odata, tempOut, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(tempIn); cudaFree(tempOut); - - timer().endGpuTimer(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 0f79000e..0a45fbcb 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "thrust.h" +#include +#include +#include namespace StreamCompaction { namespace Thrust { @@ -19,6 +22,14 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector d_in(idata, idata + n); + thrust::device_vector d_out(n); + + thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin()); + + thrust::copy(d_out.begin(), d_out.end(), odata); + + timer().endGpuTimer(); } } From 21553de299fa8d9f5cfd6cb67c3db570a7875dc2 Mon Sep 17 00:00:00 2001 From: jyguan18 Date: Mon, 15 Sep 2025 13:51:06 -0400 Subject: [PATCH 4/9] Basic README.md update (no graphs or analysis --- README.md | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0e38ddb1..ed5f3979 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,48 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jacqueline Guan + * [LinkedIn](https://www.linkedin.com/in/jackie-guan/) + * [Personal website](https://jyguan18.github.io/) +* Tested on my personal laptop: + * Windows 11 Pro 26100.4946 + * Processor AMD Ryzen 9 7945HX with Radeon Graphics + * 32 GB RAM + * Nvidia GeForce RTX 4080 -### (TODO: Your README) +Introduction +============= +In this project, I implemented a GPU stream compaction in CUDA, focusing on the core algorithms of parallel prefix sum (scan) and stream compaction. The primary goal was to remove zeros from an array of integers, which is a fundamental step for more complex GPU algorithms like accelerating a path tracer by removing terminated paths. The purpose of this project was to help think about data parallelism and the unique challenges and opportunities of programming for GPUs -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Implementation Details +============= +### Part 1: CPU Scan and Stream Compaction +I first implemented a CPU version of the scan and stream compaction algorithms in stream_compaction/cpu.cu. The CPU scan computes an exclusive prefix sum using a simple for loop. The stream compaction methods include: +* compactWithoutScan: A direct implementation that iterates through the array and copies non-zero elements +* compactWithScan: An implementation that first maps the input to an array of 0s and 1s, then uses a scan to determine the output positions, and finally uses a scatter operation to produce the compacted array. +### Part 2: Naive GPU Scan Algorithm +I implemented the naive GPU scan in stream_compaction/naive.cu. I used global memory and multiple kernel invocations to perform the scan. A challenge was managing race conditions using two separate device arrays and swapping them at each iteration to ensure correct reads and writes. The implementation also required careful handling of ilog2ceil(n) separate kernel launches. + +### Part 3: Work Efficient GPU Scan and Stream Compaction +I implemented the work-efficient algorithm in stream_compaction/efficient.cu. +* Scan: The scan algorithm operates on a binary tree structure and is implemented in-place, which avoids the race conditions of the naive approach. I ensured the implementation correctly handles non-power-of-two sized arrays by padding. +* Stream Compaction: The stream compaction uses the scan function and a scatter algorithm. I implemented the helper kernels kernMapToBoolean and kernScatter in stream_compaction/common.cu. + +### Part 4: Using Thrust's Implementation +For this part, I used the Thurst library to implement both scan and stream compaction in stream_compaction/thrust.cu. The scan implementation is a simple wrapper around thrust::exclusive_scan, which demonstrates how to leverage highly optimized, existing GPU libraries. + + +Performance Analysis +============= +This section presents the results of my performance analysis, comparing the different scan implementations. All measurements were taken with Release mode builds and without debugging. + +### Performance Plots +CPU Scan vs. GPU Scan: + +This graph compares the run times of the serial CPU scan against the Naive, Work-Efficient, and Thrust GPU scan implementations. + +### Performance Bottlenecks + +Extra Credit +============ From ad1442ff98b8fee1b177273c13c62f4e07ec3eb5 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Tue, 16 Sep 2025 02:59:46 -0400 Subject: [PATCH 5/9] change thrust timer position + add graph --- img/general.png | Bin 0 -> 18092 bytes stream_compaction/thrust.cu | 7 +++---- 2 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 img/general.png diff --git a/img/general.png b/img/general.png new file mode 100644 index 0000000000000000000000000000000000000000..912db014ab0e07780ad37b4988577bf3ab406fcd GIT binary patch literal 18092 zcmch9byU>dyYFBN0wSd#pdg~8NH>bKbmxG8$k5$jAxKI~i{#MV14uWB3^jx_3=Bx; z40n(3@7(h{cdfJ5{o~&A{^7e8d(Gbasm~MpdA^@jm1T(uX$T<@2(i4}3v~$Of-VF? z;C$^0_{;o~dj$j{<1GK;nWl#^X8Nj!rWWqpF>N}{0-3g1SBEn^w<|5OFmg0%pShXo z{-XW;HC?sRj0-)C_gLQ(%55Rs*ASm(yh$^o_z^zm&IA1~9|}Z&X-@f);MtVABAQTp z_#mM+!CYKu#d}Lg1srGs-|eFX|95b^25teNVlp9sKy*XjUIt6t7pNeRx9$HQ7aY45 zw!aG8XXw1L^zLoI*Gj8j53PnXKTrAW_V3IEGW7h+kR`&L@1~vKh4~-%LNk?9gv+eQ zik_b`J+||f{zi`qB4g@HQpA2$L=}IpKi#Zn2LHMEcZA+KsqIe|;;B%=oF6uw)73%s zv&3ZPii`hH@mi0vAFo5tSyO~vSzYRRp-sHL$}2x*8Tjn;ZuWKd`fL<O^yrH$94+aj6Z=JRNJ*-=YmnjWVxN% z88Ttd=NN0mA}eer)Rza-80yY9>$eoJC;R)I_bQvT^0m`Xv$Q77JnQ^eNAgXUc8>|C zIk|GYeJG!0cHXv1LiQ|pLoTtuKxRuB->DlhLT8`E%dC$US}b-&22{bn?Bh4PL~!Nq!V#ljreghw} zz9)M>Gwwzn1hUzA>*Wr`oGkh)OlzJGUlXs25I>P?diS-O4R#`>lpvI%d`HZyV#=|T zPWq(Me%AOC30DD!+V$PY>f+Qau>=jw3%dHMSTW;eJhP&k2Z$iMdETYEY4vh?a{G z8^-Xs7F~$*{K$Rg<~TU!(`5B4`=kCE{Xob};7eQjGhCqK~X$k%y-%IRm1p!3VJ-}2s#-RR)R6<(bH>&B_Wr>c+ zmx;Xc{zZv0s`d_OrFzDYz2uQ3yCXMBcFl23CI;4*6o*%`vUQQA8tF`ON`-v(Glprl zdWHIY69{kjH&E&536Vb@OSk*HY~9M&=i_6f z`i!QA2W&}9+fr_71!yKMHEP|SF zb?GhAE2ZCb1^-e5oO}pb3HX;?ruhN{QgWMs2m)z-b&37|@dC49b5B>-7wclgXoFr< zbo5fg?f}~eIAq|(^Q@rhdUwSvg;>Tr3E~6{U|T(63a0jTPB$-ujNNjEahS!%Gt;lhLlGv9+cBIFa|3WaeAS zftB5Ei+H2gGdCa*RaRz}FhSS4w^=f!MRjP_-Pa%tQhzLD)drU%SYJ-EAJzE4YRCa? zG}N!MvDyS&RArTpmbnl_QKDH>w>u!R)I!7+gGS~nzJ8ru3)m|Or%<8+BnYzoKWFp* zged=S&RF8)8ML3F(7^|b-wcJO82i0YS<||Ki{lGCZ}1yP)}n%x@^l5+x<`rm&A0M=C54VbfW2_m(w>r ze*v$sF(LMr!uun?zeLE&p`*_uLs!;=XV-Y$Lx}JkzUgXxNUQNdNtiA=Mp$L_?UH_J z+Z2c0P5d=9mxRxTGvk#ajBSrgK9K0>VP^Q;L;ecprj}989gPpuO-5{RteR zk(zUL&2zk^sFlBp&oOXJU)xPVs$7?QFiZuB8MLxwg0(~tf3R; zHQ&N?&}e)>n<id=ht{Jc-h+k5D4TJqbkadiF({~<1SeQLC zJ^j3}ym|dE*L!T{^{YvA!u(3)-HLL)mH6uVJCA*QfftsA6MHKQvZ?zQf%Ou7@i(%=0* zALJZm!gS1F4z16MkDh{M3F5yN5!cBw413GXH|`wHtsaZrr;GMN(G&or((8+v-;tYZIpAaKn>PJx4FNV1%h9aR zm9?sc8WsEA&9hpy@tHX(4f_{=m+vb_bSe6V!GGY0!mK0@vonEIqbaK5Wm+y5gofUz zJ(#raPqIK&+5XKrE>j!c1Ce5Nfl>YF)Md!qnlbNGK zrXqqCXwIESiM;dUNF6B0#j7Ntw@XG=we)7h( z*y{kvIseNo_<;cXliexS@3`OIkG!h7WO314UP0mePvD9;I44)3?yIg>^4+>)?}SvL zZBZRz!vUDbDiV)qSFrkv&hw@4L-%~(LayIaZ?4K{V~e8SsHQMQMQ84RKDAM)#j*;j1M z0yW0%?&EFe`eJ@}$A}4vjHEm54h2--!}JvW9P9WRZ?F%7+C@Z#4UuTOXmY(D9?VhG z@D!lR+Y2Q#g0y%WxFz{I6^cH5l6@t|{z=>?jVanuN{Xr+8pWQu4ZOw5K`<ejNXlHM0JhaZ!ULel?uBXn-@-~;>0fLf z0N7k!YXTdJPh_;J1l);!WK;dm?+?~)11y~$FMaaHf(KUxAXQbbj)>UetQLuv9)9Abcd zIesr65=VVCG4(bhL}dj1-7ldExCxt~)`vMefQLksRmD*9gF7YvTqVkA@i7`cZNEZF&(0a&d-fRMC;Lmo_0hRJN0p?-aiZ)iT7|KBwZIz8 zK2h4pj=W$tQ35fyUESJ4tC@z{z9a#a;o;#nj9YJaG>gT_-kR6x{uq>t{Nc+d{;a^| zT$JVUQO72U(v@g$Zc|O>3{Z^#->7BkJ!d8LLRRyFcIzi%W+9=8hT1O~vJnahuYG(R zDtYG^%VQO`0s3%D0~;IPk*OSP!y z)0-uun_Pfdqqj3luv0;><`Yn_;#la6pib(47;otGnbRvx(D6Ho9$?Lehc9C>DLAz$ zP0Ir|^vK5E9H4ks38F!o&&0@ME?|D&%Qd}TI7Br?Z*|)PuLrs&M^3<*)u!3Ja;8kB zsR*NgZh9(2fqa|wUUO>(k@f7um7Ol8h*euaew;!qYl7DW$lJ2<`1$BOqus^s;$8EF z^cIZ|a{-u5;&nod)G@drq4HJn{Sl3*#KgSnX*ZN@*=yqXm8+27FFk!23$K6}2;s-! zWz@Y9&Hw_Dl*t75^yC+O`1oa=*m!<;4jnM#=9oBY?N->!WmAmJJsJ-EK^!B2w^MHO z!|Q3H$o8)^Q(l>WM!s>{_`^;8Oi%x`oT?UmrzshsoJLa2f!tIllK@=P@#@Y98y48R z_-=?Y@uUnfKd>w-Jb_fm%vQc)5N8c=5rOsi)5&&=9uLDU4 z%gc!Z7a?z-PWtt+Xf+P}vR!dZ^Q<;^Kk5RCy=A#-wo5I$LCqSMMoZI$S^kgd$ z3bP>d)otGJGi-?wt?oDK4edGVQB{e(Z;>Nc0p-!942uGN`JdSp{9hv(G%K!1`z zd`t|*_9;stds_^>kAU+)kOlqX@mDEG$&$Tyc!hCe;g z?;NxWtyAyHJx(k#wAN7{H+~oC0)gb3{Nrf2`2?K;tT>V(O(^C&T>Rfk5Enuenqeck zkq=l{k_4!oHpaQ((56OD9ss<>r#Es)bnHp0Lb#p>gxS8(HLV}>(ZA37KKD|=XL6Pv zyp60EgEA@C+S=O5%NCH{^@QHx**j>$V(g#VO$qMiT!4HT!aE&@ePA#`$;K|QFD(b- zH)G#s3Em|!S~RYNY1N|h^7qPw@QEY7a&>aJReT7fdg0&PjtfjRy+n-v1qV>Xw`FTj zk$8X8cRy0O(Y;VF5;~6idi1pBhPpI9DgJgrw-E(Ubz3R*ZZEqqpKfl6@0r}Lo#x+Q zX`qxn2}*-Ac}c8nOzjC9;(ul0XX>p%h%$rV@!okOU;ZCo^Z5zSqZxA308G+Yl6%Gl zdn4y_7Dx@0XK#9@zaB#`QB-w)13&ZHM|bC{WwfpA2E=u)P1jdX zIS-2c^AoNP5#XFWTox6nDOqjOBu%~9Kf{{qHp+URkJS@iVRtQg>!lA)fq1`UKdMZ(Pg4ZE^q@G9^_u6W@#HL0Jxki3_6@P?)ZB?KdKT$lYG6 z^@E?LfT(sou1n(sNN1!#wnN)jul57zmW73dd=kIyVzECq-w=S56Q*)Z0Ti^EnV4Fq zC?Rh@ZsQ%?pK`~@oGY;#T%j!o$()$}`kNPl8iUxtw45L0Wc!WP@-oJYe?8gS@^laZ z9)o-26{?^xUu%#mjZ*nPdRw;C%(xP64{Oj~Mdy#rKXp@!KE> zaZ6t1-6AO^UbhE zus1;$5pzH>Rvx#j5Pu>>t&4ItzMK&-Gt#^`Is(_o0)iUp6h>}&q)XFN zU54D78Bg{zvd?Nk@*r%YR@iZ3Pu6#W zw(7QqUy)YtBbx|~2Xvsi^n3sF4a1EnRwyl>PBCfOI!(bm1<*g#09e zxXemfpxLbcO!0D2)Z%3#lz|w_yoi!zTQY!$-xS|G^}jrR4v}m~X#A0pa{&a_8XpWI zs3e$gXA+_ECEid}*`v&7!UQFU2_V03A!CI|iW(pW*~0vu5KgT7Sw>5XDEK2}$9l2=7EZH(tDC9GKL?s6De82^NwXMg zl&+S4C@cJcFUKWA8`KwKK$(Fi02AGr=*|@Wg84c={M&xLC7;%C7NXW=Txp#eYJeG~ zCg%ugxZku$T_vGn>YW^-aO){`g)9sNWfv&FcN93Ss{+QdVK>USvNq>z$ya2M|}j2{jF}bL%l58 z$C9EIqDIAFd;oBc7!I%2H=Qz&8|;4|Si)=E2*6?lj6`#fLnnfrQGPpJ$7 z01|RXeuO}}s+WKa`=PQWuicaS#yvjmzBApqatMM*0z7Hg2g$p{E>ckzFdcQ6)ZgtD zJ$<5CQ{r3hDKLMVfDP*8^wd)Lg5(9^sLklHD>U&?k37ZH8^%s2u9pjI3nGb#>@j<5`r3CBZ$i zmwo8bdO=kJmy$7ow#1ECPR9kKhK1G zj!Hni?H)F{kqz|zP!9VzCIX|UIpvFa_nXAX* zFuET<*{0PiL%3l`kQ5)J{Sbbi6OGCVBrD3(rdm0hSf0M-WEtcxrA1dHO2kF!zw^X7 zqUKq1kNa-Fz&iXHWlY%k?>R4!CvH8viS8-=v&nJMJqh{HdY=zr7z6M{dXd|u+ z|LvbcQNT3r_g<Rx4oX`kuVZ$teyoae+?|!3if)OF^wiZl= z948-+eq|?kw~67;d`+i(D8F9fdMS{-(vI#KkH2!bdE;>zSyAFB#a5)$GUv5Fmc^8V zWlgs=-m}@=@}F^Yj{hi$Cp7c1{N?p);^#ow8A8;{2vx1GzpnDp4$KdwMu!}JbfwM= zWx?Fyl&j?;CN9TAj~rs8hMhA6U|Aj!cVqS}VXF?VXpBx}B z>46;<9T{-~{;i1YWOz^+nt#mi4|~y2yUck#0(gyF2j$ntODU#+s|Tqox7;bT>$#on z6{0HiBhU`9*d-Rm)YVas_Pc3{J|g9Tlx0CR&NoX-M3S^(OS0=B{ zJ4zPXTQ%ug<&+tdGWZ9##|emLWpniTQ!jHNTd``!%@)T!QI6h7NQF1~+$ z5Kf*FMIHvU$AQ!69vn$OG{<@vSj33iIc_X%Nvqcr?JX8)LGCn#NLXgn61~Kq>1_Aw zez9nDQ@BatCVv8=_AGn(uCt{A4A~=WxP%@4)gUr=tno0&^5>Ea0$CR*ZOGDZbdvxa z>oAb(ddXk!cNEL+7@3cCkWrLM?Cff!@8l)tmohnmoe?56J)aSHmq*D82J{@JS zO2ZyQg8!-VU4>owd;-8H5+n4eB}p7LYJ5&uHwz>Q8w;|D?kxQFsUFq515F;Q((@uM zUB-HN#9y-85w6zUdL63Iw{9Cx*V$YBP1OEP)0u;7nqamH@0PMnxYWFnZ#7vF74$JK zw04>P(Pg|UOqnkpFGX{M)7;YLD7-1g$U+}ftB^7oY8A4FdzSarlP8{!Jl$r;mIGBl zyKSQmzSe(>!86CH^m!rEc~)gPLIQCSM+ZI>ShxLj4Jm$tE*za=n-_h(`M&(`v9C9^BSw0lviQohp8ub_ z5Aq=Zh?2eXYyT67uJ^i^Bmt{2`IukWe}Q&mnX?3+lsJs;3BQkixqeN!2QjIzo6-~0 zr#-ml5c{(r)kEd3fV=!l@j#Di#8?0AA)*V<@eugJ^Bmz5+09e z;5{CVMI<^2g{jm~?d)2`=R20gWd8*)sbAnVUKN(7>w9ch>?70-=c>PHDC;m@wqqna z)aSkUEmJ~6Dv(XfI}28F^O_Ee%xW*!szW92F`0W^Ezx&;>0YoJHmOM)qq&!QapDWpXl%%C>w{ zK#{7iR!Jp|NqlMkbX1%(NnpVad|f{a>HRzqUj=fV&i}a5O|{iAdECCyL_S;U2W;Rf zULEB?G9pvxq8Q@WfgJmQ0VS8G4yB4DSt+QWc@F5{4Rp^2tD%ManJ%;k7qq*qisDh>z_-VTTkz z4HwL}F`Aby{}o9LT6v~8+y;s!&1+X>I_Q&}LUhNvsctV^f zp34^=yd}}8m2mmFFY|f8eNlFsy{7A$P7>{B59fX()dBon?v3dA$zN?4c1_fPxpC#@ zt6LKIiB6^$w`}JAF1HD<vT$W7M(qw4e^*V2t2>ah8)4XSQ(snkh(NR zI|n+Z9}J zG1uqyTx!+qWA}oB)-!8~SRnw++Ma0fWR2o9_%!q4Sa0Xy-{`8>l=b|Z*>K}D1*|WS zdTS0^m1RXu3O0V*m~-_p{k4!lJNtC&5>;-1E%!Qd%TEF8QXYzAdEuP+^0d|yS)ifr zUEsyfvHe849xLwq2|yWe9hM{ydH)@Shks37q{RE1S8Z`j#%tWD$NP<>21O5tW@tCR z+@TeVS_qm;&vOt@YpjYT)Isa(pc8ej_dCipRlRS_Lf=CNmC>D9F!a+VbEJ3@koo`3oWLpfl z@Ca~0^9fwY?@oCS2gfNjr?&k`hSCX_OZU5&advG9IuOO`tEdL%d#QkD3-W}mamc^W z`AUmr9N6Fqegg=ozVuD4Mjpxt?o~jaXd8I7E z9BdR@=KS3HHxvfG7t64XIet{a(dduc##6Hr#QFkS_KOK$Xn*JRL=)b!U;I$SqMg3j ztO`;78Gcz>CWO3;%W(uK3VKa5dIG zTO@0=`m(DQsQ5*b$_ni)nW=Wjrq6!yOZ z;%7JR1N{;jcWz-ptkmyGOSG=%88ev2U$>r$B4g@B>UPJ+vEJbl;BV3Tol%#TduGMc zR(?d7F$2T+G7gw6a|UtabvueY=gdnlG5(Yauy+*2DHT_@s4L?`l^_2Jz~vil6%f5? z4}RX#1aR5%hi>Cx>SgNNb1>`j<}3BJM6YfFHDPz@S&n}!Z|CWqo1vNuJNlr+RnxHG zG5;y=9tcaLW-78tbn*@qkS5HTl3#Z0yzigSr&C{guTAYt+J+ZVGINfBIZivmN4Vp@ zR%={792C4MLIf`w zr*$Rz8n1~WA0wL{U(?d6UzV>gCDE&yW*X3i7InNr*J)err;f}i7;PyS9kP-gmyZQm zPC!E+|DY5Ik+28U%;TL@j|obxcrSS&>WUb^it0PdU#!-}qHHa5S#P;YySZApQ5Kbn z_+%E3rv{y|d|Lkp{o&2;*sc-wuOVG{F)ZC zW1-PwD?WPQWWjSTV$`H%8y`V0yblT7^fG{|Qf!9+B|Fx9DV#VLt>uk)>Q`yNRbXRW zvE;VJ<45$o1AqCH`Bdw0C7)ql<}?5V-B+sASk9)>#CcRm z`JEw|qsMCZwZ^8eml%d2@BU&@fW@AIIGT2Hut9{sRykW#+>CdP@^s>lV z$`OeVb0Z=x$o`bA4o<^1IL{X5m5+?wTj{93(fL>?(ffIndR2FnnYG2q59y~KoiEl9 z7BMb{yU{)+1xXdglRgz^yCzMWN^&WKrM-Tcew(xDnP@MLDlbX+*Yfm~Tw9X0pE>1> zyg}mKKJo0<;rAK&F1qL@+vL4TUc`|i*Rwhx?ZLXox1(XteLwu@pZGYqBQ%@?du!)k%PO2PhAn*G_gU7!!azV5}dFu z{3qLK#pmVetOaF(=aIJV&mIYL4_}j(^}3V(#{cxaYDKAx-=n&*Ytnhr*Mu#nGH_Y{S#06cWVJmoy zk&CiL^R}6Oak8hf{LHuTsF@>Z)Wp1-;{frcuxPU#sh=vx>A7aQkMe9U#Z(^Brb`EW zKU;dmZmWpnlAxG2z8PGFh+znx86=f9YI*+CkZEYi=1KP6-N63Ay=153DPeX@-&FC3 zp1uzu`@BhniqoTqWA9`FlBiew3R;IhETnPVdar=WQ?Y$a{JOEQY36Eg(@$vqVBF8A z=AGtradGr2!x3{Hzqic=%K%ApKx{E=e8f4~3vnBJT|rBBVoEtwqT0XQmj6)L!0J?# zPJQ_Y(sKxqF26^ajK##R5gqw<1Vv&^eoit{N@}9EVK@qXEvC$Zn)YkbL|mLSuz@O$ z#-3}^Z1%)N(p98JJR!8TUg9!4{!se4s#Vg-*~9W#By=m9a<*H?+HZ8$vaQJKE5M1X<9DWE6_vO2Pm3)M*Hk86wTex{ql9p&^il6@bK&9!0&vQ zi|PDrh(&}OlgCDiJSP)XIwA3f@XSNoN3~emphBod_c}J%u2yXJdPP+f=tXwQ!&tXh zyXz|ej~{Hi7M4%T_lz<#3+em4%dF34yTZKjAjZ7XQ?qQ=U)5=HnyofvE55;)^mE}L z;uXUmF2@_ST6dZ3DOPIM+xXUCjw+{!8p}I6mn2T9Jc#>`qwR_&+=IMljOG&$HGi8S zn>$x76eb9+G_n=d+FoQ3xvH<@$VtP#N&JiXG@mqVMQdHn(O&p)UnVDdspsd91v%w1 zk{3S`A5gyRz$6$wqJTdxt?I2q-`ho>4Z4&Fp{%O^91I<;?2UO_ZFcTVwve2h`j-2> z#o@>guz5M>BMYclPf3Z_)e+@SUR@_(Bk3bPb5!+*6xbDWp>zG`1})ahHsgmF8p>^d z)V=$Y{i*$fzqu9>Zj5?H;as*qO(M8L!k@y*cugjf3Q9RgT)ffGx8BoCyBRy5==4Fb z6W8t`MekGZ_hDe&EpXAT#=uFuNw(>oWrIUtrodTz=2)J2gOK)VwWGw))6^!Tr%AJ{ z3B_kpWwVHxHw@=WPQu1HNtpWl_5*BM+vqy0wsu6<>VV&e}YEz2%=vP+!uZtaM6x#Ers(ibgX7$N1Hi->J4r{gK`SmpdHqc*7lEo*0?ljo#Fu?OY&Q z@G!5e{oxVHHU0W-;K@1bEyy6y1Mla>JhwUU4a%IGLauV>sYQwtG$(ZQjm~*EEY51T zYiKsCn#w=x{1f8OdSJS$If;NaFJ;AhDD?7^eX&{6OHcQ2Yf2K~Flv{V&|4K=8>eL? z>nrms!7#^&x0F{uVUpDssne(s2ddwYG&%x9=u7$>(}^lRhWbiAyd#q@z2qiUYO7UG z9VaESwkl9kwq1MvT`>t`)D>`nt{um*?mRXB*jOnXC)>&NqjmoV;{(^z`VEZFhYc(D zkd5ObhNzhxDkgT`I*lgzZ0*{dE?Q%~BpNNywj7>pqc0mSP{a{PI~s@Gx9&|9uC$OX zoGh!u?UvP;_A=VaXT+7AOlX&tAv?<&Z9z|))m5Pr3)$|~ovcMa)WGd`guJJJNZYt& zD&5n75QA|KFaOFo zw#@geMkF_#e}4bnW_L{ZSJkS@ywQ%pHhb}dR4vjFOC^r&0y~PD*_TsnQ1U}G=>=5d zjf%~A<1v*fd8WP9Nbdz7$3jEaaHwT+)kIDxIcmr+$JP_;B|jhlO|7zUhvmrLEN?n{ zA$od9b&|BW?=5xyE!SUPvqU$t;XXz16&r@)53+x5R2}4d2jEJ1vWJZ0VH)}r$J6EC z#kkj)vHM8ZukUAff5pzOcmCvqL-EEZzOPHoV35zylz8EWI13-&wPX@tRJqo%A0t|H zk?*DBS2F3oa{W4Z`DI%~7Bo!xJsvqJ!m6DV9ciC%p1nNj!^)lX!PpQvjT{IT1Sg`i z;!Bz=Uwm4iD4PUnA(zB{aXKo(H?PYb%Md+yRpLJM29eL*lM)~9c>+(fJ+oman|pV| z{9j(ppk@)hU)eGCXk?Q?qp6EwEqH|f&aVeIcZ>-u;-w1PW85>56^G9;)1H{6XwTU$ z@A9TdfgJZ}o!3`phO#5RP_u-`Tw#(`h$UcJ{2|^NVyi-Y_kNg6#0{TmwVj#bj7;qC z&mE)TuZ$TVXYFolno|r9SD_8oYRd9|#`)S+Ff%B2wr{PCnm${Xw0=yt+{BpHZ^c@e zEkW*qBhEyQ)VG98@qOaS^) z(W~Iw8{r0y`k5mY-)&dv^N?alzu3C_eCk=#eBu#{(fa&t$7pUO|Jk!rQh#-u59@Nv z->x{BEUr5_^of-Do1aRas*fsR*w`#du2r@x%67+OeUP$6fBO8?WdG(`z1LeiY*lC+ z13xjxt$Q=Ndlu;eMbsAvo(f(jv$1r3NEW6|kC`bAlPcEHf@T``+H)P-ueuf$p$nh? z>@xc0dM_~&YMnWWD_|17nKi-F@4xo;>+|N|i;YsVRM!c)XZ^$^G_5r2rk7&kX;;c-foyGdV(_-{t#0?{K97u-u?|5>Ypt3d(`8^vHKZ(v@Owtd!4rwoP zJRYInEHzBsJj~~%Kgj205YU-PQl*AOw7*4*9pG>iM&WdLfWjLc|jZ`=8!j9X^+AHH*34>epWqzHnmS-s= zjHd-JZhRP%;-iBho@!m3|H`K|7Fsh}IBLo48F80a`Gfm-zlvCPO5aXzRfe&zSng-* zX|JDi0~#qiGuHw|56hMETsge%e2`P(P(1_PxLD7W9Krpugpv+joHWOMXU33jGRPzQ zO9g`8o(B;=6q;@te#X51LBPeM+XJ31-uf6+;-rHU2P?7!3N4cPkl6uhzbD}-WZ|GC zmEnW$jxJqzaxe9I=h6g1Bamqp-galxy+%6R;>aAl2AtKkGcoDZWUbtd_u-yN=gOVP z)af33PiPN&{8i^oE6t0l;i+z#yd#ZZhd5|+&j^7 z5$v(qZy=G?ORGlCJ*u`8Sh(^1r&Opzg-JMx>?Q}T6t&vN`#ghO-YO^NNMkd{x%-Mn z!bd(guSxd%mnvdz70KAy4W7OEwY~;A%fA>m)w$f%c=z&hxmg0c=nY6|C~#1s^;NVQ z*5$i7qfztTFbab2H_qSKiwiip$nsmWw{oHGjy9n8V)M1k7=S9D?w|heEE!< zn8lbyp2i7>OLwNLofu^e_N$6|*&WvlZ#`t2pHGzwVVwl0S=LEuG~8O_YO^|Ra*W3| zE8n?U?Mbc4rl1c?xKj{hM zXz(-$dt47HF!SMs7Q+z83Mi*owk>gIk7%Ntzu&t7m;CZYWH2kvVR}K_?mCi4Ud^7!Y-w6y{6fh z&x2b(z1%ugcTwi%q*q&wcr*XMX z_Qbog0VC~fgKsn+WtnJp$x>YaJ@4Qxo)fXBzU>i8L<(jmYr!z6k{Bze*6pJAU+UT< zpIo!xVZ8)t7Lsc2IRvAI{4S`i!(My8-J}u(7>mi01uxZ0u{yyh4450d7Dw}wb%3ks zjPV?I*ktu@vpV-pmg*P6MHa_P%}x#?nPe9k&W{4&S0NC|H9RN4K;AH1g9S`hMjv2M zy`U+;=AYRd@PG~ajiT{0cK9w#UzzkePS9=GoB*mu|J|^&mMWW0Q4D_c*YB`?RvwI{ zT8tGLrTc>J0H+zxm9>bur%#_QPFfdpR4A#Xoi<-2?P88MWQ-8q3EDta4^A;J>w5a& z$DTg3qz;XVdml_#w%b*MSM206`n0H%&V}cz%wn@;^bA=wW7_PjYqfFv<1rOn+Od1d zJzhQ|TXBOV+l&KZ`8|HO3I$yXu_RXd#ME=EVK_^2+!OpJ8&>y?+m2{H3 z$@U*(UGpKTSoh2mHncDlEfL`1RMoi>%vN-bj)C&IJz`&5BVt@Tae84GDQccuBJ1bj61iBcf-AT?C9zlap|8Sx7=#2s0u6cx`K1i8ob{r$JG+Z;HUd~AQ>JH0b{J%ZRa;%O@Phuy(5O4@r@~|sm6=c zqNB0PM<0o{iraRcuVy(`b>?J_X%~o~%*cx}eM@`t7BX<#*t6ZiEHG@jY{cCkypSPN zOvPn<_g{pot_-B?Q>1-|nef|AX4>0N)ule8Db#tL!cVGa!76C_N3ceLZ+@GT2p{jy4N_bnrrSbTkvb4|IQa z8Lp8-HA%fbaBy@%D)O*R4jd5i#e`0P2gGpXnNak~s+PLu^df^*2&*}@0k zUA(%-te9X+F%~cs;@#LTWOV)uQT3{OEj3?H9;*lxs^=tG;TIJ7fEhizJb#iu z``9^zxpe;xd62McCFp6aTPk z0#gnj=+`vj@2QUQ2D<_RnfD_)1BMj68Y5$}{WEP#b9o2(4F8ipbTQiOe81R#jodm< zyO32jf}Z%aZ(MVDH!QOP8-g;9Y@cDe0@`bTgX;1B_y-vNyN^8F%MQCMq&^_BC3=Ss z7UQWi0zgeAPb>c+HPoln(6OCzxXePayT6|pqZ4se^3_5ysS)A6Wn8;IY7oYCd-;0^CoAu( zB^Tj+Ic-&kONzcX?3=)h&E5fMq@}wCoQM-#r#N+Saq*)#wWJTg`yf5M$v!M}M$pW?$ z9SC^U;~5lX0>d=O>eIb7CZ!~PVvNRpS*2M@cZp;ah9uxSC7X7ahKg90ef}SQ`^T;SFj70j`uXQJ9XArn!!+fUkXXQW(f~$EkD~fbVRlU5q_$WPPHdpfJp*SKP3pkd6g|;qkkJ5>OQpneW+GCBUZ d_in(idata, idata + n); thrust::device_vector d_out(n); + timer().startGpuTimer(); thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin()); - thrust::copy(d_out.begin(), d_out.end(), odata); - - timer().endGpuTimer(); + thrust::copy(d_out.begin(), d_out.end(), odata); + } } } From 3e5984b609d150344ffd7f9140eb9e49c7556953 Mon Sep 17 00:00:00 2001 From: jyguan18 Date: Tue, 16 Sep 2025 03:00:26 -0400 Subject: [PATCH 6/9] Update README.md --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed5f3979..274384c0 100644 --- a/README.md +++ b/README.md @@ -41,10 +41,85 @@ This section presents the results of my performance analysis, comparing the diff ### Performance Plots CPU Scan vs. GPU Scan: +![](img/general.png) -This graph compares the run times of the serial CPU scan against the Naive, Work-Efficient, and Thrust GPU scan implementations. +This graph compares the run times of the serial CPU scan against the Naive, Work-Efficient, and Thrust GPU scan implementations across a range of array sizes. + +In this graph, we can see that all of the algorithms have different performance characteristics as the input size grows. +* Thrust is the fastest implementation, especially as the array size increases. Since it is a NVIDIA library, we can see how much faster and the high level of optimization of it. +* The Work-Efficient scan is the second fastest GPU algorithm, but it is still slower than Thrust. Even so, there's a pretty clear performance advantage at arrays larger than 2^22. Since I did not implement shared memory, every intermediate calculation must pass through the slower global memory, which makes the algorithm more memory-bandwidth bound. +* The CPU and Naive GPU scans have really similar results. In fact, for array sizes lower than 2^22, it even sometimes looks faster than thrust or Work-Efficient scan. ### Performance Bottlenecks +* Naive GPU Scan: For the Naive GPU scan, I think the primary bottleneck is using log(n) separate kernel launches. +* Work-Efficient GPU Scan: While it is faster, this algorithm still has limitations based on memory bandwidth. The up-sweep and down-sweep phases require multiple reads and writes from global memory. +* CPU Scan: This can only process one element at a time, making performance directly proportional to the array size. It can't take advantage of data parallelism so in large datasets, its performance quickly falls behind. Extra Credit ============ +### Work-Efficient Scan Optimization +A common issue when implementing a work-efficient scan is that it can perform worse than a serial CPU implementation. This happens if the GPU is not used efficiently and there are a lot of threads that are not being used. + +My impelmentation did not suffer from this issue because the number of thread blocks launched for each step was dynamically scaled according to the amount of work required at that level. +```dim3 blocksPerGrid((nPadded / 1 <<< (d + 1)) + blockSize - 1) / blockSize); ``` +As d increases, the number of blocks launched decreases, which keeps the active threads busy and avoids launching a large number of "lazy" threads. + +Application Output +============ + +```**************** +** SCAN TESTS ** +**************** + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 16777214 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 12.2615ms (std::chrono Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -41943037 -25165823 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 12.1849ms (std::chrono Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -92274673 -75497462 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 12.7741ms (CUDA Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -41943037 -25165823 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 11.8661ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 5.25357ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 4.76275ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.71027ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.31456ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 16777214 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 11.6285ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 11.2948ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777211 16777212 ] + passed +==== cpu compact with scan ==== + elapsed time: 58.9926ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 12.5078ms (CUDA Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 10.498ms (CUDA Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777211 16777212 ] + passed +``` From 6d9af5b7d5e17f5858e1f2e57783db969a9daef8 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Tue, 16 Sep 2025 21:40:18 -0400 Subject: [PATCH 7/9] fix issue with array sizes > 2^24 & cleanup code a little bit --- src/main.cpp | 6 +- stream_compaction/efficient.cu | 171 ++++++++++++++++++--------------- stream_compaction/efficient.h | 2 +- stream_compaction/thrust.cu | 5 +- 4 files changed, 100 insertions(+), 84 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 933b4126..2bbf86d9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int ARRAY_SIZE = 1 << 24; // feel free to change the size of array +const int ARRAY_SIZE = 1 << 25; // feel free to change the size of array const int NPOT = ARRAY_SIZE - 3; // Non-Power-Of-Two int *a = new int[ARRAY_SIZE]; int *b = new int[ARRAY_SIZE]; @@ -69,14 +69,14 @@ int main(int argc, char* argv[]) { zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(ARRAY_SIZE, c, a, false); + StreamCompaction::Efficient::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(ARRAY_SIZE, b, c); zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, false); + StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4e4029e9..749bf826 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,138 +15,151 @@ namespace StreamCompaction { return timer; } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - - __global__ void kernelUpSweep(int n, int* odata, int d) { + // sweep them up + __global__ void kernelUpSweep(int n, int* data, int step) { int idx = threadIdx.x + (blockIdx.x * blockDim.x); - int k = (1 << (d + 1)) * idx; - if (k >= n) return; - odata[k + (1 << (d + 1)) - 1] += odata[k + (1 << d) - 1]; + if (idx >= n / step) return; + + idx *= step; + data[idx + step - 1] += data[idx + (step >> 1) - 1]; } - __global__ void kernelDownSweep(int n, int* odata, int d) { + // sweep them down + __global__ void kernelDownSweep(int n, int* data, int step) { int idx = threadIdx.x + (blockIdx.x * blockDim.x); - int k = (1 << (d + 1)) * idx; - if (k >= n) return; - int t = odata[k + (1 << d) - 1]; - odata[k + (1 << d) - 1] = odata[k + (1 << (d + 1)) - 1]; - odata[k + (1 << (d + 1)) - 1] += t; + if (idx >= n / step) return; + idx *= step; + int temp = data[idx + (step >> 1) - 1]; + data[idx + (step >> 1) - 1] = data[idx + step - 1]; + data[idx + step - 1] += temp; } - void scan(int n, int *odata, const int *idata, bool isCompact) { - + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int* odata, const int* idata, bool fromCompact) { + if (n <= 0) { + return; + } int logn = ilog2ceil(n); int nPadded = 1 << logn; - int *dev_data; + int* dev_data; cudaMalloc((void**)&dev_data, nPadded * sizeof(int)); + checkCUDAError("scan: cudaMalloc for dev_data failed"); - cudaMemset(dev_data, 0, nPadded * sizeof(int)); + // copy direction is based on where it's coming from + cudaMemcpyKind kind = fromCompact ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; + cudaMemcpy(dev_data, idata, n * sizeof(int), kind); + checkCUDAError("scan: Initial cudaMemcpy failed"); - if (isCompact) { - cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyDeviceToDevice); - } - else { - cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + // make the padded region 0 (if it exists) + if (nPadded > n) { + cudaMemset(dev_data + n, 0, (nPadded - n) * sizeof(int)); + checkCUDAError("scan: cudaMemset for padding failed"); } - if (!isCompact) { + if (!fromCompact) { timer().startGpuTimer(); } - - for (int d = 0; d < logn; ++d) { - // n / (2 ^ (d + 1)) - dim3 blocksPerGrid((nPadded / (1 << (d + 1)) + blockSize - 1) / blockSize); - kernelUpSweep << < blocksPerGrid, blockSize >> > (nPadded, dev_data, d); + // Up up sweep + for (int d = 0; d < logn; ++d) { + int step = 1 << (d + 1); + int numThreads = nPadded / step; + dim3 blocksPerGrid((numThreads + blockSize - 1) / blockSize); + kernelUpSweep << > > (nPadded, dev_data, step); } - cudaMemset(dev_data + (nPadded - 1), 0, sizeof(int)); + // exclusive scan, last element is 0 + cudaMemset(dev_data + nPadded - 1, 0, sizeof(int)); + // Down down sweet for (int d = logn - 1; d >= 0; --d) { - // n / (2 ^ (d + 1)) - dim3 blocksPerGrid((nPadded / (1 << (d + 1)) + blockSize - 1) / blockSize); - - kernelDownSweep << < blocksPerGrid, blockSize >> > (nPadded, dev_data, d); - + int step = 1 << (d + 1); + int numThreads = nPadded / step; + dim3 blocksPerGrid((numThreads + blockSize - 1) / blockSize); + kernelDownSweep << > > (nPadded, dev_data, step); } - if (!isCompact) { + if (!fromCompact) { timer().endGpuTimer(); } - if (isCompact) { - cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToDevice); - } - else { - cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); - } + // Copy result + cudaMemcpyKind finalKind = fromCompact ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost; + cudaMemcpy(odata, dev_data, n * sizeof(int), finalKind); + checkCUDAError("scan: Final cudaMemcpy failed"); cudaFree(dev_data); } /** * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - - const size_t bytes = n * sizeof(int); - cudaError_t cpyRes; + int compact(int n, int* odata, const int* idata) { + if (n <= 0) { + return 0; + } - // mark em - int* dev_Bools; - cudaMalloc((void**)&dev_Bools, bytes); + int logn = ilog2ceil(n); + int nPadded = 1 << logn; + const size_t padded_bytes = nPadded * sizeof(int); + // buffers setup int* dev_idata; - cudaMalloc((void**)&dev_idata, bytes); - cpyRes = cudaMemcpy(dev_idata, idata, bytes, cudaMemcpyHostToDevice); - if (cpyRes != CUDA_SUCCESS) { - std::cout << "Copy idata failed." << std::endl; - return -1; - } - + int* dev_Bools; int* dev_odata; - cudaMalloc((void**)&dev_odata, bytes); + int* scanData; + cudaMalloc((void**)&dev_idata, padded_bytes); + cudaMalloc((void**)&dev_Bools, padded_bytes); + cudaMalloc((void**)&dev_odata, padded_bytes); + cudaMalloc((void**)&scanData, padded_bytes); + checkCUDAError("compact: cudaMalloc failed"); + + // Copy host data to device and pad with zeros + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + if (nPadded > n) { + cudaMemset(dev_idata + n, 0, (nPadded - n) * sizeof(int)); + } timer().startGpuTimer(); - int gridSize = (n + blockSize - 1) / blockSize; - Common::kernMapToBoolean << < gridSize, blockSize >> > (n, dev_Bools, dev_idata); + int gridSize = (nPadded + blockSize - 1) / blockSize; - // scan em - int* scanData; - cudaMalloc((void**)&scanData, bytes); + // Step 1: mark en + Common::kernMapToBoolean << > > (nPadded, dev_Bools, dev_idata); + + // Step 2: scan em + scan(nPadded, scanData, dev_Bools, true); - scan(n, scanData, dev_Bools, true); - - // scatter em - Common::kernScatter << < gridSize, blockSize >> > (n, dev_odata, dev_idata, dev_Bools, scanData); + // Step 3: scatter em + Common::kernScatter << > > (nPadded, dev_odata, dev_idata, dev_Bools, scanData); timer().endGpuTimer(); - - cudaMemcpy(odata, dev_odata, bytes, cudaMemcpyDeviceToHost); - int lastBool, lastScan; + // Get the final count of non-zero elements from the last valid element + int lastBool = 0, lastScan = 0; + if (n > 0) { + cudaMemcpy(&lastBool, dev_Bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastScan, scanData + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + } + int count = lastBool + lastScan; - cudaMemcpy(&lastBool, dev_Bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(&lastScan, scanData + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + // Copy the final compacted array back to the host + cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost); + // LET THEM BE FREEEE cudaFree(dev_Bools); cudaFree(scanData); + cudaFree(dev_idata); + cudaFree(dev_odata); - return lastBool + lastScan; + return count; } } -} +} \ No newline at end of file diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index e6525cd0..4677a6a5 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata, bool isCompact); + void scan(int n, int *odata, const int *idata, bool fromCompact=false); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 82d78507..a1d8ab19 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -21,7 +21,10 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - thrust::device_vector d_in(idata, idata + n); + thrust::host_vector h_in(idata, idata + n); + + + thrust::device_vector d_in = h_in; thrust::device_vector d_out(n); timer().startGpuTimer(); From 48d0e88a5c440c13f9c124d1435af9cc9972fd0e Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Tue, 16 Sep 2025 22:32:26 -0400 Subject: [PATCH 8/9] update readme and add graphs --- README.md | 128 +++++++++++++++++++++++---------- img/block.png | Bin 0 -> 13398 bytes img/compact.png | Bin 0 -> 17734 bytes stream_compaction/efficient.cu | 2 +- 4 files changed, 93 insertions(+), 37 deletions(-) create mode 100644 img/block.png create mode 100644 img/compact.png diff --git a/README.md b/README.md index 274384c0..1b17f9d4 100644 --- a/README.md +++ b/README.md @@ -1,71 +1,127 @@ -CUDA Stream Compaction -====================== +# CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* Jacqueline Guan - * [LinkedIn](https://www.linkedin.com/in/jackie-guan/) - * [Personal website](https://jyguan18.github.io/) -* Tested on my personal laptop: - * Windows 11 Pro 26100.4946 - * Processor AMD Ryzen 9 7945HX with Radeon Graphics - * 32 GB RAM - * Nvidia GeForce RTX 4080 - -Introduction -============= -In this project, I implemented a GPU stream compaction in CUDA, focusing on the core algorithms of parallel prefix sum (scan) and stream compaction. The primary goal was to remove zeros from an array of integers, which is a fundamental step for more complex GPU algorithms like accelerating a path tracer by removing terminated paths. The purpose of this project was to help think about data parallelism and the unique challenges and opportunities of programming for GPUs - -Implementation Details -============= +- Jacqueline Guan + - [LinkedIn](https://www.linkedin.com/in/jackie-guan/) + - [Personal website](https://jyguan18.github.io/) +- Tested on my personal laptop: + - Windows 11 Pro 26100.4946 + - Processor AMD Ryzen 9 7945HX with Radeon Graphics + - 32 GB RAM + - Nvidia GeForce RTX 4080 + +# Introduction + +In this project, I implemented a GPU stream compaction in CUDA, focusing on the core algorithms of parallel prefix sum (scan) and stream compaction. The primary goal was to remove zeros from an array of integers, which is a fundamental step for more complex GPU algorithms like accelerating a path tracer by removing terminated paths. The purpose of this project was to help think about data parallelism and the unique challenges and opportunities of programming for GPUs. This was done by implementing serial, naive parallel, work-efficient parallel, and Thrust-based livrary versions of the scan algorithm. + +# Implementation Details + ### Part 1: CPU Scan and Stream Compaction + I first implemented a CPU version of the scan and stream compaction algorithms in stream_compaction/cpu.cu. The CPU scan computes an exclusive prefix sum using a simple for loop. The stream compaction methods include: -* compactWithoutScan: A direct implementation that iterates through the array and copies non-zero elements -* compactWithScan: An implementation that first maps the input to an array of 0s and 1s, then uses a scan to determine the output positions, and finally uses a scatter operation to produce the compacted array. + +- compactWithoutScan: A direct implementation that iterates through the array and copies non-zero elements +- compactWithScan: An implementation that first maps the input to an array of 0s and 1s, then uses a scan to determine the output positions, and finally uses a scatter operation to produce the compacted array. ### Part 2: Naive GPU Scan Algorithm + I implemented the naive GPU scan in stream_compaction/naive.cu. I used global memory and multiple kernel invocations to perform the scan. A challenge was managing race conditions using two separate device arrays and swapping them at each iteration to ensure correct reads and writes. The implementation also required careful handling of ilog2ceil(n) separate kernel launches. ### Part 3: Work Efficient GPU Scan and Stream Compaction + I implemented the work-efficient algorithm in stream_compaction/efficient.cu. -* Scan: The scan algorithm operates on a binary tree structure and is implemented in-place, which avoids the race conditions of the naive approach. I ensured the implementation correctly handles non-power-of-two sized arrays by padding. -* Stream Compaction: The stream compaction uses the scan function and a scatter algorithm. I implemented the helper kernels kernMapToBoolean and kernScatter in stream_compaction/common.cu. + +- Scan: The scan algorithm operates on a binary tree structure and is implemented in-place, which avoids the race conditions of the naive approach. I ensured the implementation correctly handles non-power-of-two sized arrays by padding. +- Stream Compaction: The stream compaction uses the scan function and a scatter algorithm. I implemented the helper kernels kernMapToBoolean and kernScatter in stream_compaction/common.cu. ### Part 4: Using Thrust's Implementation + For this part, I used the Thurst library to implement both scan and stream compaction in stream_compaction/thrust.cu. The scan implementation is a simple wrapper around thrust::exclusive_scan, which demonstrates how to leverage highly optimized, existing GPU libraries. +# Performance Analysis -Performance Analysis -============= This section presents the results of my performance analysis, comparing the different scan implementations. All measurements were taken with Release mode builds and without debugging. -### Performance Plots +### Block Size Optimization + +Block size determines how many threads are grouped together and can significantly impact performance by affecting hardware occupancy. A block size that is too small may not launch enough threads to hide memory latency effectively. A block size that is too large may limit the number of blocks running on a single Streaming Multiprocessor (SM) due to resource constraints (like registers), which also hurts performannce. + +Here, I tested the Naive and Work-Efficient scan implementations on a large array (2^24) using block sizes of 32, 64, 128, 256, and 512. The execution time for each was measured using an event timer, which excluded memory allocation and host-to-device transfers. + +![](img/block.png) + +| Block Size | Naive Scan Time (ms) | Work-Efficient Scan Time (ms) | +| :--------- | :------------------: | :---------------------------: | +| 32 | 12.63 | 5.47 | +| 64 | 12.81 | 5.50 | +| 128 | **12.46** | 5.15 | +| 256 | 13.25 | **4.53** | +| 512 | 13.00 | 5.00 | + +Based on these results, the optimal parameters are: + +- Naive Scan: A block size of 128 was fastest at 12.46ms +- Work-Efficient Scan: A block size of 256 was fastest at 4.53ms + +### Scan Performance + CPU Scan vs. GPU Scan: ![](img/general.png) This graph compares the run times of the serial CPU scan against the Naive, Work-Efficient, and Thrust GPU scan implementations across a range of array sizes. In this graph, we can see that all of the algorithms have different performance characteristics as the input size grows. -* Thrust is the fastest implementation, especially as the array size increases. Since it is a NVIDIA library, we can see how much faster and the high level of optimization of it. -* The Work-Efficient scan is the second fastest GPU algorithm, but it is still slower than Thrust. Even so, there's a pretty clear performance advantage at arrays larger than 2^22. Since I did not implement shared memory, every intermediate calculation must pass through the slower global memory, which makes the algorithm more memory-bandwidth bound. -* The CPU and Naive GPU scans have really similar results. In fact, for array sizes lower than 2^22, it even sometimes looks faster than thrust or Work-Efficient scan. + +- Thrust is the fastest implementation, especially as the array size increases. Since it is a NVIDIA library, we can see how much faster and the high level of optimization of it. +- The Work-Efficient scan is the second fastest GPU algorithm, but it is still slower than Thrust. Even so, there's a pretty clear performance advantage at arrays larger than 2^22. Since I did not implement shared memory, every intermediate calculation must pass through the slower global memory, which makes the algorithm more memory-bandwidth bound. +- The CPU and Naive GPU scans have really similar results. In fact, for array sizes lower than 2^22, it even sometimes looks faster than thrust or Work-Efficient scan. ### Performance Bottlenecks -* Naive GPU Scan: For the Naive GPU scan, I think the primary bottleneck is using log(n) separate kernel launches. -* Work-Efficient GPU Scan: While it is faster, this algorithm still has limitations based on memory bandwidth. The up-sweep and down-sweep phases require multiple reads and writes from global memory. -* CPU Scan: This can only process one element at a time, making performance directly proportional to the array size. It can't take advantage of data parallelism so in large datasets, its performance quickly falls behind. -Extra Credit -============ +- Naive GPU Scan: For the Naive GPU scan, I think the primary bottleneck is how each of the log n steps reads and writes the entire array from global memory and it does more operations than necessary. +- Work-Efficient GPU Scan: The up-sweep and down-sweep phases require multiple passes over the data. Since my implementation does not use on-chip shared memory, each of these passes must read from and write to the much slower global VRAM. This makes the algorithm memory-bound, as the GPU's powerful arithmetic units likely spend a significant amount of time waiting for data to arrive. +- CPU Scan: This can only perform one addition (or operation) at a time, making performance directly proportional to the array size. It can't take advantage of data parallelism so in large datasets, its performance quickly falls behind. + +### Stream Compaction Peformance + +To evaluate the effectiveness of the final GPU-accelerated stream compaction, I compared its performance with two CPU-based methods: compactWithScan and compactWithoutScan + +![](img/compact.png) + +| Array Size | CPU Without Scan (ms) | CPU With Scan (ms) | GPU Work-Efficient (ms) | +| :---------- | :-------------------: | :----------------: | :---------------------: | +| 2^18 (262K) | **0.17** | 1.11 | 1.68 | +| 2^19 (524K) | **0.36** | 2.67 | 1.86 | +| 2^20 (1M) | **0.70** | 4.04 | 2.68 | +| 2^21 (2M) | **1.72** | 8.39 | 2.90 | +| 2^22 (4M) | **2.79** | 21.44 | 3.22 | +| 2^23 (8M) | 5.64 | 37.03 | **4.55** | +| 2^24 (16M) | 11.41 | 72.01 | **10.70** | +| 2^25 (33M) | 22.33 | 163.57 | **20.20** | + +In this table and graph, there are a few things we can see. + +- In smaller data sets, CPU does really well. For array sizes up to 4 million elements (2^22), it is clearly much faster. Its low overhead makes it ideal for moderately sized tasks +- The performance crossover point happens around 8 million elements (2^23). At this size, the parallelism of the Work-Efficient algorithm can start to be seen here, making it faster than the CPU method. +- At the largest tested size of 33 million elements, the GPU compaction is about 10% faster (20.2ms vs. 22.33ms) than the single-loop CPU version. +- The results confirm that GPU is memory-bound. The significant initial cost and slower-than-expected scaling are due to the multiple passes over global memory. It is when the dataset is so large that the CPU's own memory access becomes a bottleneck that GPU's raw computational throughout gives it the edge. + +### Summary + +This project successfully demonstrates the implementation and performance characteristics of several parallel scan and stream compaction algorithms. The analysis confirmed that while serial CPU code is effective for smaller datasets, well-designed GPU algorithms like the work-efficient scan offer a significant performance increase for larger, data-parallel workflows. In addition, the exceptional performance of the Thrust library highlights the power of using highly-tuned, hardware-aware libraries for common parallel primitives. + +# Extra Credit + ### Work-Efficient Scan Optimization + A common issue when implementing a work-efficient scan is that it can perform worse than a serial CPU implementation. This happens if the GPU is not used efficiently and there are a lot of threads that are not being used. -My impelmentation did not suffer from this issue because the number of thread blocks launched for each step was dynamically scaled according to the amount of work required at that level. -```dim3 blocksPerGrid((nPadded / 1 <<< (d + 1)) + blockSize - 1) / blockSize); ``` +My impelmentation did not suffer from this issue because the number of thread blocks launched for each step was dynamically scaled according to the amount of work required at that level. +`dim3 blocksPerGrid((nPadded / 1 <<< (d + 1)) + blockSize - 1) / blockSize); ` As d increases, the number of blocks launched decreases, which keeps the active threads busy and avoids launching a large number of "lazy" threads. -Application Output -============ +# Application Output ```**************** ** SCAN TESTS ** diff --git a/img/block.png b/img/block.png new file mode 100644 index 0000000000000000000000000000000000000000..2ca0fd914fbfefd512b8dd583f6dc3a1e3e468a7 GIT binary patch literal 13398 zcmd6OXIN8Fw`MGWihvbF1Z;p(6$F%yh)P$)P(xID?;s_BBA}x5j`Uta0D%A@AfU9+ zTciX;q=XV`2oM6Z@%!$5?!C|4A2W0B%>2spoO80zT6?d(_IlrUoexhmA2FS{cme`} zFsVF#pbdc>)`LI}xg4VhSAI`>ltUnQI8`1f>by2yA{>uDH(0&z#N`tBp!&5-Adcrx z_0Fk`YGaeP&F{~MC%(_{uj0MUe}+zc2G#uFxb@KoPsVi54I3#cKR9(t*XUSl&fBz% zJpa3gn@)<|YC8E9>o?x-VqKu6OL1k%4Nf2~&w6%=5poL(3k%|Q$oF!s4UPjNYOSZC z;O7P9R0;Tzx%T1^1oHafx5E&~$|rw12;>+0Aw~$~o7iDi2*lsws33T#`xx*6l5+aL z_77i+juzRVuoCHGuKgNot=Ga@Em9FPU0fYe8g(g|Sy>OvI}>?4#+-tU{AlF0Zoj=} zBrL_)bEyKKRa7KsZw&7qK1JHYi@siY!7k@hC9}6RwYG$ws9{;n$;)e%39g%w9Dkju zo+QzkOvhDA8TS@-`u+XaUW!k>S=92<8djz-C+Fd~-#!8@WrYDH~MxPA!DIby)eVd6qF?C;Lbf0 zNKagG(MXjeB-3GtNe*=p9QIXLNaABr%V^s*EUYfhg5ZXSMc_1`%eWkkq>eCIgcih7 zs_bG((fqviO=02K*%a@tfrUeNJCfx&d}LbswdCz$ohA^JHeEP^N7{oF%_;5PI-Bap zS7q&Mmi-Y7B$xz<eE2!iZ+D72hiXR}tByck zk=nFs{={xxJ>s?2qr}pNhx>6#Y7MwpLVDo7O2N!A!(T00G(uh-fmU(cI#0Q`L^J)- z2j%AMcN?`sC)nW;Y1FIMqft(A>^W3uF0rhs4`Gb4T@oE!wHT3I{b>@$V;hEHCf4P& z#d4?Q)DKv6eX%l@W~y+gT@6YoUZ%54=ofy1kgwh7Q#+1>m%a+LQXcelm3F_2M1tt@o4pWBv2X2oAnzwEkwis|Co z202B>v&^%0@8|2)A8Z;SD6av@gm8=+`e}wLsW1-OHHQg6gEO0Cp8ncH-6<4B4eqSYEF($=Q0?| zo*HZ^t}j-Sn&KRrb;b7zsZwH?4cv0AJU4nD_1m9!PL;N$?U2nAec25YWE`rD_vfo> z*o?r+u-%7XF)s^? z+TQ5b>fF5;Y2>aCm7*HW_V(m4c-XyU?mv6p)fhc$D39}Md?%;Lo!aIexIlK;s?caA z6mPH3^2vk=E!VDV&PDVt)=FFt3JI!oe68s*I%Z{{J!nz3xZuiVqf-5 zoo}zrsXk8$+n+Cd(`{%O${}B}AB1?>6S^vF@boV)%LMU7%~s%soD{aVw-vF{vVu|g zJA-RW<-M7YlJJ_qQ=7y6EP@~pL@m)5<(8nx1p=5mJrNCiwMIT{y8VN@>0e|fhrj|NI6IKaEMxX;ff(4!oostU!#gXMh5mi9bV!%(m3}8Ua5agU zI@b2F*x}{Nm$&?QyKUmq8M1GQ>^h$3Bk+qjDMC)dGT(AMF<#qJ`sgU{4Kl zUn-cD!Q44BjYmz#mijIuDgDQ@o-VG9BPfI_FM<|nN5XW}sBNC3Y*B~WDJNV&y`V6K zv;J1Lfk*A_;6>bN&x;za`{dJm_>Q)x;h&x8Uu7^r)yoJ`3%}MmJ>9%#dlnPRBEQ>{ zEOc!rWIakt?l$t=Cf@gT0?A$fV$PzXD$9#WCvJD|$n)qJ9(Ib8LAl{0490cr12sFP zZGA)vt=3l7-z4g(@l0q!vwy{uNW~2Wqm)^TJ*?ECxSo(J`k|=2ZQJC^Nd1c?1ke6| zY^2_y>&bZROrkShAPQk0kYvZ{IiH`~fnX}y(QTo-$Zn6KVPK)5VacF; zG^%KB!}Woz2Y-SL3aKVxeq+8`?7&U=VUR0672sL(93@ePjT!S9^Z)d!6z7N;eqKga zwMez0qzw^YizgQ&A5@YxTIQ_;3d;zk4)rwmm7nUBxS?tkoS)=Qw0MXSck}n&@^n>? zAC=>|Xy4kG?WGqY9k}H_IJykl4*#mpdGe_{>#OF0PPk6%sbx<>2RuslQdB zrnb-&cfSWdX<&i*8Clw$9`MfXR30|9D@`w^Nc>xw#Z2~yu$_2cK+KR8;~Od+YH@P_ ze^Hy8QoRt>wmyVj%p)e$<~$8UM{1~{_df8Y+@grfSnBmS-SSF*C=DO@_A1UWmE?UZ z3i0I=)A^@D>?ALo+!8?8x+how-xhG`j!ool6bZSx30=?-IhzA@3`FS@P3F0H@iW>CdGnmiFo zb!-X!`KD;H^f+b0uRfoVWoPinn<3@T4P~w+*l~QolPb(j_R)$9Zkw@vt2d-%Wm+Y6 zzevIAtH^b79%b+h_fa+x<3-K(_ZTd>?T_bzq5Hswx^aJAi(y+%wxL7I^1E(Q|EjWk zUEZ^0m_4Fqxj}u?Np$ll9aEsN2Uc`=wmnG8`*3TS8BQIGH44C<3r^W)jcs$UUWgBM z<;r}iGTR%j2a*qE0zVfoh{Si-Oq+~G4pQ~p<7w&Pnyp8{GO?DFZyU6zJbTrOuPo^1r6U*#Wxi0?6Xvi%U1DB**hRaJ;!%qImKl` zW}aV#mRS*pM#Bpy72)!n^_l5EyW-~ixZ3_TK;=)CPd85&>+({Kh_CM>@sz5)3S@;u z?a$wG9=xN|#U@f?En}Xu$uo8I%Ct)h>sE2uLxC+pJSK6uQ>QCz;M^#_8$Y}ADytmbuKQF zv;Um1*JUP;Lsi}Bix6Yl;ZN459f@1#D4vbVei6Jjhn!|V^Q)qThATo`>$9m^iA^~5 zN4(vJ6J?n>7Hf$QWye(|ga|t;5Mo}&q3gpYQG>Zut`1XfzRZQ(b(>a?>V=p#IbE*_ z<8?JOU&r39K9DS2LH;C3Qo$MLo7M^*W5{rSCn;YVj926y;#pdB!KNiFF-L-m0 z?Fr^s`G*NvJ!RS8GNHp#zhpRfLUcu*6f9(SXV-g?H{5CB^IRp3-9%DY-Yep3VY}n~ z(Axxe**T(1&HhE6^BxJr$CQIeEh8S3b6lu{y=ZLcRDS+xQyACxJN|Qup5wKi=5n&% zc)e{4c3j5;_z7S?*0s3qS08$;jMS~6j`exEvD*v0$3kNb9kMa7rwe+ic`ZIG4Gc3b zxv2C|?}5>$c>0i6ak&dcs04S8g`N4zM@w<8XpMl+b;6sN;ySMwQd#Ea8?{s=Lz5^b zc}P!H*_jfEX_&Et2214#_V^9#;^#WP#j1M{Z>~d(mgePw_feTSc(EhVm0rYO?gl9T zoB)5iJ4&9Oo;fyZ`&ZGc#*jqb!>m2(L?UhSwn?kT4wvJ;2AjWZ*nc@B^OwylQ3xX)^22(P*IQe>G3AOOo|=f0ElDcK7%DU%oYcghx_vozj_(gli0! z9Q{9CxP6u6E``&wNZGmMx4UY8-e2(Qi$kBYYVZGf8ypoZVCc$~EXnEhOO`zFY^pI3 z>CkT7j?4PhD0cRK%)vdh+z|%mXaFT-tNcJ&ietC<@^a5W3el6M*g$j5#)vE=E&J`w z`l*)`6c>kL2zv8|@(|zCCH@cBIvPK|yA@M3S4->(n0ZUbsQ%4y0SR&}d}o7r4Tf=# zN!{O_6?K~*c()yTZ{~tN#vNIW^b3>S)N}D3t+aD)+SC2E)&%(5J1Q0&CG^^)9atL8V`x zcOSMZDkxC3Bpyd=6KMj)RgLdT9O|}t^S<)P!1>!|m?nnEJ!GfASru|()k659C*&W? zuPd(~dnyY=p(H0s@mb3&A)~5W#dYv{UuQhbqm9F%_GL|MB&TZyoeeO{=|~fKvDWs> zcs&^vaOSN}S;l&f$y`r5?~4?VaZe|*-&Pa%pLPM%Cl2ZCmns;B82X-6+M-3n9goxz zY80INz1o5JFg!&>BV^x>$B}F zmNS=VQ>^0EgV-VFcG6b#gqZ{jp-537!x2JGz%=59x97wA;i^#Aan}RrZca<}oJpwi z;`GQNOEGu$EqQ^6?9}7o3^>oMNs_CeRG6t?j=T9iaN?d-t(<5=i(i)C{At2_9Ru@& z7T~|bf>#iBoTIg#E|&cMwtsU?&BH$heDe-J`aHoqsu*;#83>M8dPYvaEFtcTypfz^ zlJ;XaU^T9{)0_H=D-Io@Zw>R?HMs|($&5Td*icT;h6x#>xZ+RcjAg8Ybkd5$iCwws z^DwQCYJlno=e^AS?s{iJ%*byu%()nhRujdoD6#Ura_@GA>T53qnk;yq$!R1bM=12^ zdX|@lHWI_!v)>!$doJ9Dc24b{V9PZhj%tguP! zIs)lD4Ys}M#nc&OK{Q9IK-4;ypt!Pw4b1*QVR+)jo2it7bF_5_?Uk0goH;vWzZIm% z=)aie`*%d5v+#@a=gvKz?MfzCx8ZP6P^7_+^MGdzYRx}a>P5CD4j~OqUnkyk$=O&M z#UlY}+N$@e&$2)bEs5eOuU$F0vY zJ>&3@A_J#s4ZXNgXR17$p83+;za29+sIZAyT79J4>KfLI9NE}--{XK)Ex*po1k7J# z{cq=3|J~#R7T4c=aug{pYPriy6t8)}g>cE0X1CNnxWSuQUi)xpMi&<;QfS&09J971 zecqX>HtkIKBTDqvJtz#W-;i|>iU1MB)DGtwz2b)`=!kB3)^55{+%ha`uY0-Q%qpQQ z?Y^=8Q}*Z|aGkbb8gM%UDt9}G3^)*h(zYOo(6?uIHKS6m~4a;5ti zwQ|ReuhO`OulV^%vFdSoTjX}^rD?Mw^CQhLv9x_Iktng{n=!dFx~*NR9~k)xC3F}X zlnt{R)6Li3>iz7^AcRcb%0ik*+@N{(g;t#!T(A7@W?Pg&y;>`*D`$*O!OL@S&+*ni zXPZm$7Ln@_q814EAonEXdYz24@xn`Iwa8_64za zW?Z40`Z~s^x#yt1FgBxcL*QDIgvv%}YWz^Y2In;io~O z8CnmT{e6Uxg*JIZ5($f;3*+ALsm9=(F9J&5Ii5-DqH`H*ii9L>W#HV>PkpwTa-Z@# z;U1w~8Mz+Wd-cLSvb$bjbo_VvY#i@*hMI{_sy;#-2EG^@{qk3Xz)p?tA7Stmbf=&s zJUZrNQ6NKIW4{$<26IFC^@!$c>(j2EjPCXb!FAc@_Q?7^Y2LpHdhwX8Ud{&UgO;px z=0l8+1NL^9{Z=h|kA%?HR3u1BA+jaa)1mrq^|v0Wi^NR?)zH%~?Zj+<6L2%(H%FW@ z97-zW518#cl3U!8^NQKN`UbzuFe#TlFt_a;pQ$EBBUiZK)*G8q=5c|SM~g013fGSq z>6&YemdW0(ol6k6f1OY-b-T8<>dx(|xnb$a87D*Avz}E(D~>)c)v?BCKCX9(YD5X$8?aC2q<^%S-a0m`^aqXJwya95-g75#h{B9Q(W)8<4bco zRpLBw4Yhm+{%vG`@0-B1=r;u9Rrcf37A>)EE|%~GtEC#Qdhg5F z06QhzFr~Rtw&;N8|3RcQ-mC>G36rt9vYkVtb4YRx0_$I;g4 zAX|9SDF{ZTbEc|2UqL%B^FnhG|qv+^9W9x07$y*W$@#q)12~VpzH9Jzz4*adtj8lsQ5e13Oe6*x;*O6juq$UM_@Nc1nz)B;WFZb$~A@ubVZv> zqiV{^fz?PMy^)LD2j>;;P>#fVB56Iy$;lLy~Zv4dxp0?}`QhXCjJWnD%k+WxK8$ z0}HQ!T6*Wft3F`mH~WLk|G%M~##Z=0heCO$TEgN0gRBoKipxktbgX8wRGU?9GVej@ zoGk58!vY4kJPn8iHI*xwt)dZPzq7(8*+9dcg~i9Aq)zN=!+!m zMjk(Z9>YFYY;k%%AuuK;=0Gm)$uJXtVHJK+%>2_AP>(y0IkhCz++7T(-?X<7VHGEfHCjQ#haW` zuDA9jasgM+2aqkhr2Ur!5Vt*N%!CeN?hpTeRyhJOgCA(gKv##>xDu~|xRP)K%tE=( z?rKZd7w{gTN6;hCdS9C9+MhOU7>sk*`^#;U=2j8kRYnU<;AHHz!b621E*i+iw%_`w zcB7R{-5_*&G5}F&#W?iwyg3kwA+dx0;vOU;b_v@H7EFgB{$C=&1(W8J2e_eeOQ0Ey z?jI{ysFI7IZUTNr$0hwK@W}|+RqUY!J0dMn*^PGHhMdmq3$4+|@x4a0@+U#?DE6CJMp#hJh3l`MT6($f*{ z>a6o4SS@>{n`X8EXHc1DOT|P{^5&6&+GIQ9%;7MppaE-xZ6Y2cqLJA^sLIh4QgWl+Ur8 z{i)~k3!M2F3K-W+P`I6n4}`F<@RJ55mGX{{1g{e$?i}+B4A{HyF2A#$W_JJQnzp+T zUhdIk+WOns1B#=UT(+Pm;a#Ke0Q18LJzuE*=oZ5281V|ysSAw^mrj!{xFZfoRt;ZH zgjo6qKa=QnYgM&?%Sa*Zi_YbN?Uhg~Tue}Uq!SB0(m)vx=Va8>H9qq$e^;)BFGxts z43{8l((N*t;3o7|2^E6u=9I*{g~-WQ7aMNScuMVi2U7>!Ld>^(QRB5w+l`a$_6{rX zmPKAoG`XAbuMjpoKXLwYk4~{ZXSq7vk)2$5j-wfN9)vRsSFzjNijMqMbBlUt%hzAh zW;2A)_L5G$zxH+ULQ>yE+vK<7f9~dPgs;IA3;7)a25cv0>~BM^x2m484n1T8 z*ceftEl2c4g|nx==-0eNz|eEj4&U#O;>6}_l(}mJj;R>#Lq~txd)1RV__w+qSP(SM+sqM%vPkoj9owr|*Nd&_h zk7Z%%5;_&ILP|r(DT4T|4t~@_&;33ay<6tWDCxp!Nlm0<&vpeYyn`3(TWgjiH*p_lL%Ou-hEP@XFVv zJ}dJAYrPw@o#k51ayL>5P9Z*}HsdII_)R$6>y2&IFZJ!s#UMAAnPj&z+q?TG_5Fz3 z=AJe|ED_!I+%Hosru)NiAK7veXW^BWPaCD4-5|!AVM5(@oWM* z;!92aIIsVZq!G$%6NXBO-KyIhE&*a<)p~ZgEM$s#t;v#d6mG(YXXpfV;QLQ+TRipU z=-kWF+o8HlpFTUtPzK&no+`D+!rlnf<56#^UUqT=S6(RhK#Q0;rK$!Y;Rtc@(jzk# zsjXdRA#a7%yjd-Jf7L&poMvloj^?padHlR`m!xyE_76)#wpe9YwV`V2C2kJs$`x3? zUbT$7Rcx{}L0l=;Qa6cMJXAbSVZ$3cn|Un_V_3c?AA8kuZiXUzWwN5Pe6Eed+~;>{ z*XQIprzW^boVMp&vEE}8R<9IGO2)qXr~@59dheOLyqWqPA(=?bQ~Flm)}zRfP^`$L zyFS|;_vQ!#s4Eqso;@ogoD`KnvMtxK8!_5Zv1pZ5A<6FsEjy_^hFvl@Cn1J#kQ;SN zhe`!RA~HBoADCwRY>O7-C4LyyH5$=R6^d(P&Aws`?kP9r9sdDGoMu#E9RHAfo4Whw zr_EDu4JmKWv)(zjTfZBbgtlqE&%7Id1eV_Rtzf$&%)b9*(ZI#U{JXH>T7!1J-d>%) z7L#(T++nB?lWs7ldwjW7|3feGJP#mvLABhoNEFBN<9*DTt9Xy9BMek|LqE{@U(rp# zLBBKam8$8OYIoE-U3Ao3RFI!fZBRDD8K>Nw-uF`Dhkh^Kg@4T)fFWgt=d7u&&wX@9 z`_qdhWTl^DMv6BHgYE`)lFQjMi@Gkni!k)}u23Y6<;3QzS}(;{Us=#j)!{jz0Fk`U zQfzgFEuDCHG5pSrVODmgCpqHgx@p!u;y8D@yWX6lR8+#9Ijdh^g@i92of2L%(6iYsw8m=VoxpE1Mx`2zp&ntwISPr zbzy!QM+5Cifrb=UTE^7)j2mJW^?( zE8QO)4j+<0&mE_Cg4s#3d|0+lqP@y1w9_vtv|H!(V1pb2)j?YCu#?)znQS5H9BES9 zs(BBVxlR(P9=F?YvfZQU3E-leYewh?-6{y|(uJaP)`eJ-9!%G;6l3kr28vLHsNA@O z8%gUKwj_3r3&uVBw)s21#cD=+Ob^)2&LAtYL7gB;`;fx32jBkhcRv1~A-4aPX3YP3 zlL{)Fr&aH5{R+r-TNrZQnheJM{t-gZ2PIYC{ci<>^?rWUOO>Npd3lfNS-1p2^JDQp zfY&j-igGePD8UaoG!7haF{UW=jZRRvuo&MMVdB;atb`UTGP0 z7!MT=r3Z8Ib4b|ULgP_z5&7*lt$$$9gH-h6yl(6J)7P>8vJ)O|4RHaAsN$I(jzlpJKwYb5y(wTg1LqpE#_k)8rQvl?;Ylo2m z%u{;hJdk40cL14w4eg&3}?%(|*yk6J=G%^KewCzDJlF6pV< z$Y)+lGPFSE{V|%H>)aF22b{8^j4Brs=iF-QVhvt2ms2}Q%!lAt4CzI zZt4CH5#LGZHO9Jq+A^*+;+k1KZBvZ83uMwyNySLrkbfa*uur%f2 zO1q@R`u)9`R|R6`9gch)J9ODF?p8Tk@Nd6;>V!r}hP%~VT)defN(TWq`iUSf0k=vm zd#_p%YAQUx_@=Om7+*@{sH>u4pXF+b0$7t@R1_V^$flYqhj?q1FQ@fW7A#>55iDWW z_%+Lj9QA~&^1G8P1Z#cJqH5&QDQ+)b_5hOrNSKZ#z!)ULgJbQm+GK>&OG@@g%hS}5 zYhf~dE5>#JCv4aM`v^hq7*EAaHl0JOK*ck6fbMBJ2aGdW@92 zBh$d`QE!;w8|19Gaj+`;@Ku7xgn0d8`Sxz~GtJ{U#Qbs~f}EgV%5XZ#CG+!(ekNRI zq+p9Wu|J;t!a=z+35vTJ#&s5u`h5aQdWVb3pB5$TiN5^ zWKQ_3f0hnuScpJ8JqY4PV+Na4#O7^3m!%!K;b`mn1;xFRCgvyr`s0sy9CRxR5xUQc z###T^xW0p?L1_B!0-vs+7;!!(j#QL4H3|g=1%{wpt6(NCl z5&OH{`_Z6~Y6sgdfJhWPqJaCJ%dPPB9J5GvcDAVXz>Vz*!2h%onIgKT!ldSJR0FA5 zJr9UhQM=pSez7=VOh+=40&~GW$5BY)Wl6|+R#sJv`_fzLB$1Aet`F#LVpy$IIdkKM zuibJ{`_%gVC{Sg#7U_R|yR$MSUJC<}+JIQVOwjC&LY{kiR$H}|Es8cG?F>M+7I_9Y zG7<#6Z#jR3I}}_V>?^|+JxC|V0VR8mS>dAX!!Ry8)qxG5bKfUnmgUX{k?N@S{e7lZ z-+G~W2aHdc??z_YxSVk=jpR`um6(|LA~8FCZVe|A_u(vGJK!khH8qdDvHt%4w5aXR zTQ1as!s5vD0?^0$`U&bOGWx~XO#=&iNayq4x!jP%L+Iy^ak7-LvyucUAj=0FbOG#4 zviMz4x0$HJ+K$)Wt8yStErV|n?rroaY50eLX=i1>V;SzcFw|MDYQ;89U9RU=_c$^H z2sOUkWcNfJUEL(=K~Kxy!^;2V^%d3-<1IF2bvC`JLZ&i zK|z5jcmZ<`ZM2@2lXu%XIomvhf-h@orXAA6oH=MMp?8~f>mk)og~=NON-#hQ zhH_LGXxM=m&>xK4X2;q0hP1!8yw5epqJsv5wxJ=Ki`QCSSS`d>sv^MA-*87Q&Kq=bw1Q6= z%nV9`PO`re9(^%vHn&w!Pg`D6E0kQSBC;<)PZ(9yqj{pBrRejxOk!aaF VIe9MH6_hp*6=lr_#rI#l{U6o@-y;A3 literal 0 HcmV?d00001 diff --git a/img/compact.png b/img/compact.png new file mode 100644 index 0000000000000000000000000000000000000000..dae2a69ff690dd1db8db8c22bc08379365dacab0 GIT binary patch literal 17734 zcmch1px&WL7D<0eM=GPD7_=Sg`#w%_mY6yf{64cU3wLeDkXr@dkIBC zQ+j|9YA7LO-nf5f=G=4c+~>|SXXc;ne)uNucdhlV@>wlm+L}s~|Uj%Lu-I6A*Kcxj;{;LSTc?JAyfBW1m5|WUv%YZHt zW;RPw5)!|UAs4_09{%5Z5PPdaRZY$2WFIAH-F?k}Eh}_sM2RJV&fM>bG=5*VsIahb z{L$^Jy_Gh7jv9T|8xvpRMsk$;;q9k|UHyuovRt6n=TseQv2cgJ;-1yau+Y+*| zc+rUG)vr})y2WS0&ZZPuD#ACLn3dIJZeG5+`o5Os{i9hPHpP#BmD9HJs9Z6+aN&ZW zWoyXronijh7b#;G78bN|cA3yeT&`n!x)2Co+B3}dBBEm3YeyIl; z(-n@sJ$X=P5r;EuXrT0BQ6|9wdRnNz9d1 zTLUog!~Q$JJLoLpBkqW*>gy-OCnfc1$>47U#071Czm$dT3CPK3K#w4nH>c~$gU*gu zV>2@aq%*aAHh*3>v5S$u%n+^zWvZARO~Uf8OWU0u?fB^THqVVJR8Ck_A1kJAap27Y zw(3_IY|4e3lHx=1SmaL>y9WkpN~~80)AP5!b}>_EOy6-$gKEfp#Iz7#dz22-waz{@ zJA)FKPY0hvvlw#pf`~7eHnf1+)JAOizAQQlP~+?|2_9LwJoCzH&z1g%2jZ}o{huD%N^42( zyirem#J8|GW!wMBYNaHKKEmf2^_@rR(#K12CWx9k`6RU@t;|N|=jkF&20s@c?bk~X z$~U}!yt$H}&b*#3>F@FLo7hc^xc&*jq#z3S>os+6NqIB3X`Kt@_;mRz<#Mt*`N=RW zKm2$lbwy;XB{;Zhvmwpfp?`WH*){jjBGq`VUc&i#3zx6oKk|4X{~>SMQ{K?U3zZv~5WE z{-|0FGf#|nv4j}fr*0Z`Dd^d=7>5^=4(>k>H>x!6MO>*Sd&VB&-zbJ)gA>xkyH1by z+E!NM1)7iUjg?7nAw4ZM z*$oPNJ582-5n)o+(>a$2bAl}=7(NxS#Xrh=!MdMX1*5H=a=jhc5Si88(^I5h=Y`VI zEpT1pGp+q}yndGve=#4?a(1$zZCaqD(Fi@5PL!5GGYvZ+O@FOOuIOq>*#sDI^Y9Fa z+NS5v5AT;Mr5AB~&0 zZ2<~Y(dCh871u33o?fFl>54x&vazwrO0NuDU6E`a$S}|J&MvjLNWxx1N7v*&N znKo4lspgsfT_&zbM4|Kk0hzEokm9JT_etk?8B~gWsj0&IlB@3%rNG6@AV+uWC(f~osSUq5=-laB+>;V$9L##6jcJtWzgILAP=u-V3}9i4xS)>}l&8gbPV58(SjhZjVvtOGbmF_W zGLR)Mbt4av_7w^wkhaR~FfIQ=z_n?bm%$EZ0Xw$&f@;XvTs8f7><dtmqzJ1diUf?UQ6$Dfy_MkkHLPzs9BIAbAJ)E$eNiHy+J()%4Lh zvN$%pYcA@*n6uU%;S88k2sGak4=p2;lFtw2n%zUVtu`wfK_|lbkY1^KMk4_%wCP%s zL&nt9>%vTI1qVpArTx+MRtKi=!Zf+ofGO(#!PUQ90v3xy8VhXwKQ2-8y)S`9npTbgffb?ep;K^|#4@_4T*xLL8w8s&xOb@TiBawI?S zp{wXBg_7i5+V>rMA$hMa9ds0t*&YNu5TB6sMGhv5p1dkP*dkf++CkEJd_1sqYgoV? zY}gH+3{#6UrLCEHPs8^-2G1jy(xvH`oIaBdCAZ+(?jv8^7nh=eE5_`ld54t37DsnD z0^SC4zFD(6fB5P8s+#1z?C7@5%b6D~B?WmZEz6=t|-~O#SgH{Rl3N7y6TDUes<8)VCGo34AtP82-q|oygjpZ`tg!^vlw;OmYM?UMim_2AYCrT(lx+5uIzn7iN=cg2lYR zR@*=ruW2=2wi*@IKb_1R?&(iW*()o@&SA-gSIlWaR=r&s9bA;YdPd&34;c=cqs9N7o+XJ(TT`~~J@$^zB)n9n)?T(~!581^re z)k7#R`c-8nQ4$W(qfB36ePffa`KtoE3TXV9*qzel5x477@sC|%))NIRT;>+VEud5Z z5~Cu8u{Gu|8#YgF`*gcM-U~CBS%3~wt-M-wG3w zfexO`O|%W21bI)feM$05;zPn%kzO}3O>AGtQl;Nv*E<<5Q@67@kW!#HGt#MbsHgeF znUeB16sg~y^B$>sdvLJuF_nzr?~u3Gqze(W;3@j;Fh#(+>oTeLh`9jMj3(x0lWgnL z%PeE4F!U``1A|1dpuFH$_I~NDPUm^wY(GiQ`03(}&^B2}$dV^SmJAi@II`{wt~YzxzO#8`d(Pu~cL|#M0CC=*^^`hKS~QUccuZA8wB3 zsnLm@9W5?{q%7`>Y28FI0qq)mhns+FA@DFs_{57iO^g*sv5)R2@|pIQK^mVP7HDO< zAO}7@WE0ldAnYc!e45_!PxGDg&zg=|nbqDpZgfUgtCYd9CSus1FP3xTGzB*btExn^&OU*9PUK6L7WFaoF;KTkM2Bqnzzb&JWtlFsWi&MhJRimgvyqe6cb z6=XR26as8kY1MYIe&GYp68>4$^K9mP0dKjqhGH_3$$1ojXQhyCzW5! zHB%Z-dWZSqC&x9(8ZVZ}vTAQ&=X56b-OzrK>-H{;RAw>uYz8BfV)`3wke zkvl{)sr&oaGy2ZBXW@L{Y@_>DZSQ*ahGbk_HZ*l)z6BrszO*+v<=}|F9r_r^pPE;7kHK;`5R!_t7cg%bJY=890B zp1kgSr+)ed9bMmm=={qJoQCKuI|m0x4N?JVj(w56!wSSfca>M8uzhhT_^e7yo+dnS zHCyWTiL*t^vSP7sxG)qonrFMZGSj?XT+)x`DR0tAjpfX;xn|NgC<05B z=D(7>g8lsD_0p{Gyhz;v2BgmeXwN~hu4>&8pTdCYGwkML{5}E@8!;^fYlmrKCE0q0 zhLS*@G9M&Fn8m>Cq>Wl|TaDUqSjEc9O2Zjj8uOzn7By<6dQ9MVDsArsKkEgjcE0r5 za;DGg|8}F5N&ZLMzdgQRpnG#BLqjyKnFH_H$l8b*=QC_w%iD{kPM9>N_RUph)FTG! za|UcLE)_PYN}5#L-<9y)DizUaaabQjJe#Caq>lM%hg^fEGxy{rnREO5`>&8`5V-My zpj?sCHBLGjX)hs;6RXE;Tx#<4%V}s3U#S~0+WROkW47-W3bTLgb$0Uj5&qYN-2D@@ z3AI#v051*(ogytRqlr=vija?6pPu}~1bb!KMu2O5V9`DdGY8kWJPoYVK1Fv1{XA@9 z0{Z6~Mpn0AK}@_XMd{G_GPyj~L-}PwsF7`F+iD z1!T>WDOiM-j0z~DIB9t9Z%hvD-(E{Lb(u8tjF@rnVyfwMl%F1%8tQ>s_^u6pV!&S; zlQX08mB3#M6Vd7l7&+<1;g(5w$d91<%XV) zx{D$Ck*BlOv%kK-w&_dc%MX>K&if|wabLZ(ZXRZOi{H#>qvADHKAxCo+`yCo`GP zjko=bK~5d!py{HgE*ovP&(wP^$ExwiAv5d+rnY7O{=cXVui9%7?ld`EbP<1{_wy z8qyWuI@PzRN`x_4pWtRyyi`h z%fvxlSl-&KRU8CwoAt3G9dmPjE$QRjG$?i|s{sU%UU21k6&2c`Zp#K~@CqxTA|YYD z4qh_T+uOS|EJsZ_JIP#}md5BA2nwbiqbi&o`j<#ZqThd{GBT}w8H3Q2;PWC>v{BKv z%)7c68W{B64P~S;WFjGvy(cJcNy+$FJ6|i4xgoff^wJxbORt|y1rzX##bhKT%JQ8T zOA~kud!-I%be&8~tI-3f$&Q3J?y3+q;84{nEs5;*a z%>(s8_i<1eXTP_2&H%7t(}KsA=rgJ%^2XQ>r1ZSH#B4vvFy>^@q5BVe6vSdWj^Y2_ z#r~IKx&Mjs5Z_TTbU{rm*JF@3&;xl7^a?_tWk*x5Vxhk+^1P1veDqEJ`D1TgP?5F8 zzGzSV5^vF5*rk96UT~z)7{3bbNIj=XRKDs%P{Xell8Vqh<{9>72oM_OXw*;uWpC z4qJsek+Ij=6y>iP;ZF|Yx|jp|GhW(b^Z|aXhdUtdi=lGkN`Xi|-zCRvIRDTeiHkz& z({oC`&8<(D@J(rnIX6L!qF+pEV%Z`eKOGd`;GJsn_bl8ZS{5OLB2E8amPNE2O5L52 z_!PO<>85I*D_YA2AOGfJmy;(qQeuHt_os;ToiBPzL8lr@#q`ws5~G{!n$KUi;{hI) z@nRt%8M9y&Q1ak!Y&~DvD^|6+{_#e}Q$hLMkmXyK&o6DaQB%*lj^>6P1xb6F23Axn z@h~v)wt%|pA1i#|9zWoV75?qG*Ann%JWpq2$hW**^R7c_Sx6q-NQv3sdhM)-Gm{r| zAcX4bUQXTPOmll$2pr0o*0Nod#*;ZN9v^a*7l3C_b9AMmbo`bg6WqMN zIVGew`Jb(}jBuWWB#itPoyt~%vwCasC+VKDLHmEP%uuE&wtPbjK=eH4=~~cTVJSAP zve$met z-;9NhpkilN)OX4WJ!LM!Lb{%c3Ith_=i-rgM7-Bk58mLhY!)BJ|YXUL?O#iT)6=qou||o`F5F9J})b{Jh7nSC@L1 zdg8t$0+(%ZWocPevw#8^V46^0nW7ZCh$VhmAUI}cu}jU^IGHhEEz5Qo2bxYA-QW#H zS%vfX`(`(#y^q4uX6bYYhmeJ`dH8oy2p4 z!^L++z~O+=6cMMyeayVY?(Xg;H~37FL*b}f-d(k}Td|x)q)wojx{_juJ>uyE`di_P z`YkKPTNrxj;IF%}JVh!6DslH$3Ox4L(SdQ;PfVWu53$z9|3JVQGyP@&<177k7Kp$| z+38it67X9n_$cX?a&zv*()rUB(b%}S7h0OdHBOW70tQ}!zIMX**O%0+tg`k|2&D!u z)E^9oD&2hT>qtduJ~?8RP1-}3@l-NbW@`p$=v0kU()q*Do6n=f*B{V7d@kP}k|(97 z`D?n`A+EkSNYa~xB$o&NzR+A_(uHANfwS)Rv>`$M48`&w}GSsE1S%n-#;q z0nStvTZ)!j0HKYQ@51X9J2zo7)X;(@1_W06dpfP+ze32K*3~FE6Yz&o0H20<={
C$36+%DGFxy+gneZpRqm^cX>P$?L6m2N|W&v z_+2D7!_oUIhHnq&T@0-Rh{!JbZt*Rcv8R0nzFr6Zq7f1Se-9kF?8VE8394J;XboUk zk2}HliRDCAviPRIx`8jAZp3Jsk~T5~8taf5fClo6?>uz0+Cg`Zt#|#S&|owR?C85A z7)4p{1Ktq?#Ci7B*s9zB#KlGioGkcCfo3|fB9|n~dQH5dkznYs#Tq+C&7!yi1aI;26+5IP+fPA=?5W?Eq^$r^ z^GBH?xypa{6`55R!3ShPWCds@V8N=9oa_ESu3|J>A-rP8@Z)!IZ4Yl&<&QHmYzBQj zPriqC%>9&mKmEQ&O83#u;=o%)Aa(rtT@Q`|#9N>(gJ<2~4M0361FkSOF)=ZsyNV70 zabNu#chDL?cmsD4=?19q#c$0v_5o1Le)}qN9N-bE1dBlcXA0VmX}aee4A~Sw6q#Bq zyZsud)9K$G9kGG>rnD8k1)3ElF+Y?+CYoEga5oBsA~)aiS%+}8CnzF>kR>lZS#jFa zeJtiY6ubT<6_EQa_dM_Fwa-waXt_54yYU%4#nm?-lc)oWz))&f2bi&UIArZo{ z@$B+MJ%_zYd&xz#6oZvv%kmBOwUWxzyAkXhSEk62Y{0)WNVRD7b~f>eC1n_TJ{<8` zRi2V)<1kYArs}JrB7Xh>3CZ&k6Jd*8mrcg?8t#Bi(xSO!8&SCw)qh@m&G?{}Z$2vT*?Vo4bspiwFm&#VnJPZbYxAyTz?_R-p(!aZ^Km+LN;a=^Johg}~ z+&6)EHQb=4XMOc|P$Rt{5s-zj6A|ZWq~+RX>)dR6OF+WxReGW=hjanq>+Fsqh$e8w zpx&@L@k3Pp?(2bwKRB-ZJekfzXzY?X?3qTneyUKe^_nLQFW*vqeD9Fm^uTa8@o6D-i$$-r zpOS}onuX`o+gdvn)N~uWk+DOaJlh``KLKD{#k`IuxS2d~J}@t!%cqf^;m?4Yd+{Lh zzCzR-ZcqPbBZNa=jc#gygb2>OuNW98qfEo5kS@0 z)shO|W+QG}^+$BqFD#4~o-+W^V-u9SYCgOeFV-Ig*%uMK<^B^$pHBv#BM~(J>%mPH zjVY!=6L2ADDkBa7>W4CcWbDPyE|eDO7N~YbGNF4EC(9rj+TQY;vyuK+^^$yk}b~ts0C4ErHN=5L5r!!HCV) zR#xgzfv~F0M8evpXQ+1k=8(N$ao&l)(XGc=N8F?EJWnq=Q5G&-9OAt{qSRMvT%{gu z^35&FYi;Hi#xt9+Q_ z6MStfsOVyB77+Yml*}9>!-gg%oj5U8bu^re0&MbP@F1=yJmzdB+5#_ zUnPKw?7M(3A1)mv$hy?dUfnj!AvgN}mqbQq6xY3s|A7l21l`1%PljO{m%&Xq01=!c zv91M~Oneiz$+}J~2dHWR1<17Inf#nYFT{QTCaD}8a1$FJP(tdE*y=3Nl%N8h=2==5OR55T#>+nPkQ z>vD0^6J#{9y%g4%_-s<@w}0!;Q6{O48(RDv2#F`=Pr0R^%T;?7qoP56F=6IobvO@l zt9o(?9b#1`I=1*$kV`;unt4e!Cv&1KmcNlcIu@j8A!FArzXv&Nepm6YNM@pvuQ;_6 zb0sI%JfdI0eGFa<;K~#>AZMv3ItdnX{Xc^0pZeSEEfb1pn?Ie3eQi}blyvjfdX$JL z-|+%B7@?g`(AizkT~SJ@-#j-;PozQkL|24p{R^v?ROTYm+s^54$skpBe!+BhdaiaF z-pwJ(G=u0o#$(^=h;MTAv>d#vkWE1BV06YX(p&_$G(FcTdxAvtLn7rTC1tljBCmrT zbmRJlyXC2WVDhSXc1p(&Ba^p@uLG`ieCvFtAGa_SQE*RB@4#Y#lUDgH=c_W9w>rL+ z*Kp*u*VdQE_U?o06wMU_{V4*KFtI=&t3EDKtJcxar313B9(cC*Ppg)uEkl)_KDk_v z?{+l2JT(L+dt!DUVnV!TF$^si%;)nGWS)yxd)XXXbVCOJNiRrT_gwuHwuS5(xN+ft z?;kvhP6A%VH=X^xblC8SR~vOu_~GO*X19noH(jZ0^%qQ0mNM{OUE|nbGA^y5yr0IO#5nbU3b5Wh#;;s$uKbEow#p1n7Q#6m=+ub4@EvY2R-I?A zmGI*eBE70i$8qzC@z;8j`~29RXjFnn-+Pbxr6QgB-M9Vp)XG4TH?DQ}B=!ods`fYy zrT)ro`EAF7MaaD}DG$APaP4k1RqfKnrEn1#{jqU^q+r?8GFZJj|7|0NuVymUX1&FN zIdtJbw*!;Uq%(<;_3(86Tt>ZN9W{ln$*#-#y^bO2y9u5$#6(=`4r^XKhDuT9C3SbG z8gzG|lpBRNsIO+B5mT=TY|vw7ILJ{y@hdJh+HP9n#TOji$`lw6*Q&EitpzF+ZGB=2 zV~E|aybMZ552l<4uTtoWB|$99p6NeYrz|o#O*pTnwmC(@CD*0Ocl6a{E*%pGqvc-i zoa}9T8lM<`@m9}lTVl@|byrVz{cHA(k~_IU-{;$9Fop}6c*2jqwAVf@R)RUciGUmC z(aiLbaPM{f;c#ZS=t2~morPD!u}VrT-9?p-QU@IXj{!3rddCW|nd{N?efZs8PJNHb z#+#!%Q`_c6KCgXxWGqZrT|Y8$P}oYbrpmNDRc$#!Y_;x(Z(m!E7l84M&zrHo5+TI4 zCK|GTIt){`NyOh6y#%}N1jDPRO|rA1yQzzK;*iF{V4 zH*|Dh4%)ZAlq7zofbJeGsiJli>&=d=(X8XBBEH>ny4CLCXXV|ih^=_as%f>JrWb8( zaV2^JkIjn>&F;Tq!Nj#|mst3vjv6a5_@jpVh1ZDny8OkNq zC_t#alM9Lh{R&bCAJ%2Li>Th0&fkpIYT4@>p3?Uq;BI;x@6`n6ojtUU*!TCYY?9oi zPZm19HMPA{uOq(N()L5QuwP2Pm@b`roQH7t)d8OUn`hejaKrcKd(ok;#A@QTx50QV zorf%{|LQ6+6{@mNi72Tle2R?_Ych8TuuN=vKl$To#KmY*Ek%GG=%AUR(b7^s$u z_T?S?%Is3|4&1uqzhCok_rz4IJ8fsOH$O@`nD*o)-IVvfuH`QNe?D%dTvSr zJw<}+@7xRt!oIec`((;N4b{GD2sc9T=x*(XD<)D8%Y5DbtZ};C<(AK#A{NKNO^cgme3gMuT$GV7$x??y< zP*)9wvtRH%6qMn45z3Wr;C9HT1MVPW0krLyKBl5-*zg& zg%{|PPhkSa(#MQF__e}{?Hk-5qmZ27J_N((InTyR)XU?r(hdUgO!?{&=KW2+^2!R0%dO?ZU_Ns%u)Ta5bLb8XN{4T?1oiQL_)DO)qW@95cq$6ONXC_io{!@R{3FEE}~cP>_e`nU4{ zMfV4d@fCHrdw7uGOU(p&YuMo$IqG4lvJ1?=X6AU8bN6TiRI9FQ_2DpcFP@Hx#tkNY z5lT^6F8COw>I($#AU8CO5IP{b{A}2v%9g%!+mHBL_yK8rSn>yYmJh*gTG*E%0X;mF zy$ujj*jQ;lp6mBDep`(XC(mE^p0ns>x=DGg=;f6no2AS&PfbzstM3pG9@| zKNRi#5$3#9P*l(^9dCDYKFBFZZ)6g@nhm^~OO8wrq_{hTZ%8vrII*hCp3o#Izv~Kr zQfXR8PN5sXSNtOslL%l(t|RK6=NE2P^Y4}eRt=cC3)W{y$K=|6ft==TqPJdAmt>MH zlhK^9a!d4r`qaO&^1 z|FqlvyHs5Yi*V#4P3}^&poXP+4|sgjQM_J|U|T=Sd--6$$0e-xREPoN<)kYXxTOJq zk;T+Ay!qRieZ_h&ShG4mtWvwof8|pBVxHFW*U7nrBqget!+CsNkT5pD^P9`w{$lQa zTw9x*PW%FH;das4iExqm+>>)M`|+*sx;QGJx#vA)is_BaJMyF$D9+sq`2>JNzF|+D zM}7T=`imSyX{ef=&cxY%{&tZAfBgleYRuqpc-@kjM`Cj5+0w_sD*In-I+E5Q@zCwE zqh@^P=^zV3-QSg;Uv^V~3UarK_XVf6XhCf@ImMGm71 z%R?tm-=EYT+2>)Pu*7f$w{6-{AT*%gEAjwE;*Qqm4*EsqS#*w4tUjIXRN4I;$Kg75 z3gtVwN%z0u(H=t0J;#tHwM&@jJDH|;ec_UQjAem~U%G4-)~2nuz76i8mxKm> zi_dS80fD!BeAXM10oswK>q-B*##hyH(ejFI1k&@{mQswP<_Fn@%SN&c#dFCV=wx zt}{9JbM^hJVxiO3YepQDa)Yw2lyU)yY0NOhZtG8SM56aL&fbsg}&#$RsiDP;T9pUKv_POU2-rj#a#%(H^ zs%=GwcekKq+|rRgCZko{)RA0rr!|<#+a3u{wbU!wq>nzMw0U!XR39S`NuZa2@KoRIoc_2lhYLJZDPv}c(Rw$=3UaD@~hwyN5yhp3Wfcxp!!ueNDX`? z1pXnUM(O`88UH2HBEb7?jqAWFYq@Ibcb1JWYXzAaDz*IGg7e2Wi=T-sP~Ey29BI7kawOA(fj#fn`H6eX)llNluXAco!R3Eiu5&VTK?kcU!NoFS z%6R-N1)O4lm~!{lPtX_4SZAuBZJ)uBLT_NUUf&Z!bT*UTm5uU=NnFa)Chz%D`SZkL_fXaGH*(px=5yQH4|kqeu`t}|ZGLcEmGb6* z(c##3*T3?#ZEv};NW|vgV@qEzJ(Km$e%UU%G-bDK(`9#Y#-;N4oZRl>kF)%jMl%+* z0viI;iPW)8nRAJMAGoUZbk+sWg)R-2(uXhc2L^;~K0P7@t^~RGA4X4}*C;bJM+MW@ zBJwtIItRxOy|K@41W&1w)0thXTH;&3CuZRrWmG$exm8n&D_v1-SMSh2%J1sSw#%*W zx<0$R?pXiAXlb{*(P?Stp%YsoUBT&t@`1|&Lz)87zdcKyzHUm-p0pj^(F~|3wPPH6 z2O$-Dq)HJoCI_ewA)~rHOLg&eCE*O$=%}l=~*a zc(`R^bNW;1hxF$S-MQ(#tg-^gG)sj~{b7QG6-%s7{qHGPC#bz5chc2f5j%lr3j$v* zC8dxKj`XHu#;(4J)H~5CrRQZTTH`wQA#v|*?`R(W<=XCH48WMJ* zk^T4Co!5C%CL>qDGT#}P_|<+u9Ny{qoSlg=ziWwh|JTG$KN?Jh#`+`p`yRR%8 zeqZNVsf~)))Qj=FY@;F35u3@$CQV$20PMO!(ue274{d!CbjA>Xf_7U|BOwg1uaB*6z>-D%kTw-K80d)|4Ii4N!YxpxcFeNyW2ptIWt@tw7CnFk}9S!zd^ znR>)&#D1svYMQ5Y=STXLRhaEy8vTx`RpB)BNkVAG>`PYcKHP9r$pQrysrCmy|lM;t{^zc>*XMc+B_( z5uTraV#HVn_-)?ljJU%IL>k>E&?1=^Trr*4UCHx5hDbJ zJ@RHehorDeh4W_MIV zK`hvOQ!B;Bn9~;VPE^X%lt97t#INmrDS=1UpAx5ywKP?{`NH{h0t*gR^AH=-qm}JC zhB#SPR?w~cCM9N*JCxYthNML1C{geIzFn^D%6piV?M453dAzdmi#gl`hMS&8#_4Ki z_c4Egik}(zSKEZ;mtnmD*D8>{=i%ehKs#0Uk2?|OU0c=*&HN^T&@a;*;|ivU_6rW> znciT)D3|!2GXtOz@nc~4FXeks*!%tbKa=iYH84*M_cX9_KbEqnzt2Wy>)WFB?oyAA zwl*tq7aZ&eBmaXDv+q<#?WZU!?C^%0mSVrI>paUI5;X{v`YLzmExzRewy`~WIFf#9 z=%|+3fc+pWYP%uI*O>7ua)bU|fW3CxIM|gPFn(XeeNd;)_SsHrY;{^W*JseX!YjJ! zEW!&NueY#VIg~D87%7D{LWCD2aSc1MC_+rWlc0Xd|sKn%9_zerSp$w_K++23O z@}Wzzi=E~qvL=nEyL}qO0T4)na6u-l;VQjg`FwghiFL)$d!K0Jjt~)D~nVZ;R{m3BFtL_Ts z0!uAg=4Q@N1({3jv|6RfS$zVqop2K=1F$HN&)N5T-x6ivOPp05*w4*YnyA5>i+D`Muf-AK5l@%1; ze!TtgDVXkgo<9%H+g%73NSOxyB$jBUB0^px+FsMnbv@lNY&Kv$!^gW_?hV5h*tBqU znfG5!uk=_}-TN6}zXvvNs!Yg~eSZg%k|@&>j=E%)h@1Cy7J+Dy`jhn%Id2A!{cT7! z&P6&?;Kqkryhf~a7C~R@zx+mnId^YS5f4w#Ji~JHd{OnlJmFFhgL(B_FY%)JoaUS1 z=g*OP5)NAk@FP6T)3iXu896q~DElWBRzu=Ee_Dl(i=RJaA?&c&HjLuFTmCgR*Ia&A zB`{%MdzJ;zqWxgBbx(${Cli~V&PO<~AlOh5I*7k@K%6x`MQH_-Yj|Gbl{y9cKdUO5 znws`V(0@1E@Mi(TnOWi_$;4s}9~5+vJix-qM2ZjirG!4<_4=I~>3W)j!DLdZoDXqb zJal8`Ky*H+>U6_`5DTU>8_pK(#JBpSE~<6up}z5k2AWF+>_?Y3pKyCw5JzJ2C$Gw` zfYz_F+N4D={3DY@?L;;4S0^Y%XDnT&U-dSZll3R@LmWG;XT1;_HR}Z$srN52`+XMM zydB2O90Ulm0e75#X#yQ0N)Ll)cm(;9PpyZsJ zUR=yg+-MrG5S5ZF48?ghFcog%#9JXlx5wskS2F_344`1hroIIXg-!;ast1-=C#elF zo`O*-E+%oW=+e?sZMcvClj*YbjWu -#define blockSize 128 +#define blockSize 256 namespace StreamCompaction { namespace Efficient { From d7267d57afbd8eaa43fa70a82a336b37645aac82 Mon Sep 17 00:00:00 2001 From: Jacqueline Guan Date: Tue, 16 Sep 2025 23:14:03 -0400 Subject: [PATCH 9/9] revert CMakeLists.txt file LOL --- CMakeLists.txt | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28355738..31a8cc3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,32 +7,45 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -# Enable C++17 for host and device code -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G") + +# Enable C++11 for host code +set(CMAKE_CXX_STANDARD 11) +if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") + SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + # Set the possible values of build type for cmake-gui + SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() if(UNIX) include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") -endif() +endif(UNIX) add_subdirectory(stream_compaction) +if(CMAKE_VERSION VERSION_LESS "3.23.0") + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES OFF) +elseif(CMAKE_VERSION VERSION_LESS "3.24.0") + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) +else() + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) +endif() + include_directories(.) set(headers "src/testing_helpers.hpp" -) + ) set(sources "src/main.cpp" -) + ) list(SORT headers) list(SORT sources) @@ -40,13 +53,14 @@ list(SORT sources) source_group(Headers FILES ${headers}) source_group(Sources FILES ${sources}) -find_package(CUDAToolkit REQUIRED) - add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers}) target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction) - -target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} -) - -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) +if(CMAKE_VERSION VERSION_LESS "3.23.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES OFF) +elseif(CMAKE_VERSION VERSION_LESS "3.24.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major) +else() + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native) +endif() +target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$,$>:-G>") +set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) \ No newline at end of file