diff --git a/CMakeLists.txt b/CMakeLists.txt index 610c27d4..31a8cc3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,10 +7,14 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G") + # Enable C++11 for host code -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 11) +if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -25,6 +29,14 @@ endif(UNIX) add_subdirectory(stream_compaction) +if(CMAKE_VERSION VERSION_LESS "3.23.0") + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES OFF) +elseif(CMAKE_VERSION VERSION_LESS "3.24.0") + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) +else() + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) +endif() + include_directories(.) set(headers @@ -43,4 +55,12 @@ source_group(Sources FILES ${sources}) add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers}) target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction) -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) +if(CMAKE_VERSION VERSION_LESS "3.23.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES OFF) +elseif(CMAKE_VERSION VERSION_LESS "3.24.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major) +else() + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native) +endif() +target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$,$>:-G>") +set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb1..1b17f9d4 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,181 @@ -CUDA Stream Compaction -====================== +# CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Jacqueline Guan + - [LinkedIn](https://www.linkedin.com/in/jackie-guan/) + - [Personal website](https://jyguan18.github.io/) +- Tested on my personal laptop: + - Windows 11 Pro 26100.4946 + - Processor AMD Ryzen 9 7945HX with Radeon Graphics + - 32 GB RAM + - Nvidia GeForce RTX 4080 -### (TODO: Your README) +# Introduction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +In this project, I implemented a GPU stream compaction in CUDA, focusing on the core algorithms of parallel prefix sum (scan) and stream compaction. The primary goal was to remove zeros from an array of integers, which is a fundamental step for more complex GPU algorithms like accelerating a path tracer by removing terminated paths. The purpose of this project was to help think about data parallelism and the unique challenges and opportunities of programming for GPUs. This was done by implementing serial, naive parallel, work-efficient parallel, and Thrust-based livrary versions of the scan algorithm. +# Implementation Details + +### Part 1: CPU Scan and Stream Compaction + +I first implemented a CPU version of the scan and stream compaction algorithms in stream_compaction/cpu.cu. The CPU scan computes an exclusive prefix sum using a simple for loop. The stream compaction methods include: + +- compactWithoutScan: A direct implementation that iterates through the array and copies non-zero elements +- compactWithScan: An implementation that first maps the input to an array of 0s and 1s, then uses a scan to determine the output positions, and finally uses a scatter operation to produce the compacted array. + +### Part 2: Naive GPU Scan Algorithm + +I implemented the naive GPU scan in stream_compaction/naive.cu. I used global memory and multiple kernel invocations to perform the scan. A challenge was managing race conditions using two separate device arrays and swapping them at each iteration to ensure correct reads and writes. The implementation also required careful handling of ilog2ceil(n) separate kernel launches. + +### Part 3: Work Efficient GPU Scan and Stream Compaction + +I implemented the work-efficient algorithm in stream_compaction/efficient.cu. + +- Scan: The scan algorithm operates on a binary tree structure and is implemented in-place, which avoids the race conditions of the naive approach. I ensured the implementation correctly handles non-power-of-two sized arrays by padding. +- Stream Compaction: The stream compaction uses the scan function and a scatter algorithm. I implemented the helper kernels kernMapToBoolean and kernScatter in stream_compaction/common.cu. + +### Part 4: Using Thrust's Implementation + +For this part, I used the Thurst library to implement both scan and stream compaction in stream_compaction/thrust.cu. The scan implementation is a simple wrapper around thrust::exclusive_scan, which demonstrates how to leverage highly optimized, existing GPU libraries. + +# Performance Analysis + +This section presents the results of my performance analysis, comparing the different scan implementations. All measurements were taken with Release mode builds and without debugging. + +### Block Size Optimization + +Block size determines how many threads are grouped together and can significantly impact performance by affecting hardware occupancy. A block size that is too small may not launch enough threads to hide memory latency effectively. A block size that is too large may limit the number of blocks running on a single Streaming Multiprocessor (SM) due to resource constraints (like registers), which also hurts performannce. + +Here, I tested the Naive and Work-Efficient scan implementations on a large array (2^24) using block sizes of 32, 64, 128, 256, and 512. The execution time for each was measured using an event timer, which excluded memory allocation and host-to-device transfers. + +![](img/block.png) + +| Block Size | Naive Scan Time (ms) | Work-Efficient Scan Time (ms) | +| :--------- | :------------------: | :---------------------------: | +| 32 | 12.63 | 5.47 | +| 64 | 12.81 | 5.50 | +| 128 | **12.46** | 5.15 | +| 256 | 13.25 | **4.53** | +| 512 | 13.00 | 5.00 | + +Based on these results, the optimal parameters are: + +- Naive Scan: A block size of 128 was fastest at 12.46ms +- Work-Efficient Scan: A block size of 256 was fastest at 4.53ms + +### Scan Performance + +CPU Scan vs. GPU Scan: +![](img/general.png) + +This graph compares the run times of the serial CPU scan against the Naive, Work-Efficient, and Thrust GPU scan implementations across a range of array sizes. + +In this graph, we can see that all of the algorithms have different performance characteristics as the input size grows. + +- Thrust is the fastest implementation, especially as the array size increases. Since it is a NVIDIA library, we can see how much faster and the high level of optimization of it. +- The Work-Efficient scan is the second fastest GPU algorithm, but it is still slower than Thrust. Even so, there's a pretty clear performance advantage at arrays larger than 2^22. Since I did not implement shared memory, every intermediate calculation must pass through the slower global memory, which makes the algorithm more memory-bandwidth bound. +- The CPU and Naive GPU scans have really similar results. In fact, for array sizes lower than 2^22, it even sometimes looks faster than thrust or Work-Efficient scan. + +### Performance Bottlenecks + +- Naive GPU Scan: For the Naive GPU scan, I think the primary bottleneck is how each of the log n steps reads and writes the entire array from global memory and it does more operations than necessary. +- Work-Efficient GPU Scan: The up-sweep and down-sweep phases require multiple passes over the data. Since my implementation does not use on-chip shared memory, each of these passes must read from and write to the much slower global VRAM. This makes the algorithm memory-bound, as the GPU's powerful arithmetic units likely spend a significant amount of time waiting for data to arrive. +- CPU Scan: This can only perform one addition (or operation) at a time, making performance directly proportional to the array size. It can't take advantage of data parallelism so in large datasets, its performance quickly falls behind. + +### Stream Compaction Peformance + +To evaluate the effectiveness of the final GPU-accelerated stream compaction, I compared its performance with two CPU-based methods: compactWithScan and compactWithoutScan + +![](img/compact.png) + +| Array Size | CPU Without Scan (ms) | CPU With Scan (ms) | GPU Work-Efficient (ms) | +| :---------- | :-------------------: | :----------------: | :---------------------: | +| 2^18 (262K) | **0.17** | 1.11 | 1.68 | +| 2^19 (524K) | **0.36** | 2.67 | 1.86 | +| 2^20 (1M) | **0.70** | 4.04 | 2.68 | +| 2^21 (2M) | **1.72** | 8.39 | 2.90 | +| 2^22 (4M) | **2.79** | 21.44 | 3.22 | +| 2^23 (8M) | 5.64 | 37.03 | **4.55** | +| 2^24 (16M) | 11.41 | 72.01 | **10.70** | +| 2^25 (33M) | 22.33 | 163.57 | **20.20** | + +In this table and graph, there are a few things we can see. + +- In smaller data sets, CPU does really well. For array sizes up to 4 million elements (2^22), it is clearly much faster. Its low overhead makes it ideal for moderately sized tasks +- The performance crossover point happens around 8 million elements (2^23). At this size, the parallelism of the Work-Efficient algorithm can start to be seen here, making it faster than the CPU method. +- At the largest tested size of 33 million elements, the GPU compaction is about 10% faster (20.2ms vs. 22.33ms) than the single-loop CPU version. +- The results confirm that GPU is memory-bound. The significant initial cost and slower-than-expected scaling are due to the multiple passes over global memory. It is when the dataset is so large that the CPU's own memory access becomes a bottleneck that GPU's raw computational throughout gives it the edge. + +### Summary + +This project successfully demonstrates the implementation and performance characteristics of several parallel scan and stream compaction algorithms. The analysis confirmed that while serial CPU code is effective for smaller datasets, well-designed GPU algorithms like the work-efficient scan offer a significant performance increase for larger, data-parallel workflows. In addition, the exceptional performance of the Thrust library highlights the power of using highly-tuned, hardware-aware libraries for common parallel primitives. + +# Extra Credit + +### Work-Efficient Scan Optimization + +A common issue when implementing a work-efficient scan is that it can perform worse than a serial CPU implementation. This happens if the GPU is not used efficiently and there are a lot of threads that are not being used. + +My impelmentation did not suffer from this issue because the number of thread blocks launched for each step was dynamically scaled according to the amount of work required at that level. +`dim3 blocksPerGrid((nPadded / 1 <<< (d + 1)) + blockSize - 1) / blockSize); ` +As d increases, the number of blocks launched decreases, which keeps the active threads busy and avoids launching a large number of "lazy" threads. + +# Application Output + +```**************** +** SCAN TESTS ** +**************** + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 16777214 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 12.2615ms (std::chrono Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -41943037 -25165823 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 12.1849ms (std::chrono Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -92274673 -75497462 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 12.7741ms (CUDA Measured) + [ 0 0 1 3 6 10 15 21 28 36 45 55 66 ... -41943037 -25165823 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 11.8661ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 5.25357ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 4.76275ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.71027ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.31456ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 16777214 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 11.6285ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 11.2948ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777211 16777212 ] + passed +==== cpu compact with scan ==== + elapsed time: 58.9926ms (std::chrono Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 12.5078ms (CUDA Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777213 16777214 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 10.498ms (CUDA Measured) + [ 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 16777211 16777212 ] + passed +``` diff --git a/img/block.png b/img/block.png new file mode 100644 index 00000000..2ca0fd91 Binary files /dev/null and b/img/block.png differ diff --git a/img/compact.png b/img/compact.png new file mode 100644 index 00000000..dae2a69f Binary files /dev/null and b/img/compact.png differ diff --git a/img/general.png b/img/general.png new file mode 100644 index 00000000..912db014 Binary files /dev/null and b/img/general.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..2bbf86d9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,11 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +const int ARRAY_SIZE = 1 << 25; // feel free to change the size of array +const int NPOT = ARRAY_SIZE - 3; // Non-Power-Of-Two +int *a = new int[ARRAY_SIZE]; +int *b = new int[ARRAY_SIZE]; +int *c = new int[ARRAY_SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -27,32 +27,32 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(ARRAY_SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[ARRAY_SIZE - 1] = 0; + printArray(ARRAY_SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); + zeroArray(ARRAY_SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + StreamCompaction::CPU::scan(ARRAY_SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(ARRAY_SIZE, b, true); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); + StreamCompaction::Naive::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printArray(ARRAY_SIZE, c, true); + printCmpResult(ARRAY_SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); @@ -60,35 +60,35 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printCmpResult(ARRAY_SIZE, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); + StreamCompaction::Thrust::scan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printCmpResult(ARRAY_SIZE, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); @@ -102,23 +102,23 @@ int main(int argc, char* argv[]) { // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(ARRAY_SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[ARRAY_SIZE - 1] = 0; + printArray(ARRAY_SIZE, a, true); int count, expectedCount, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); + zeroArray(ARRAY_SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + count = StreamCompaction::CPU::compactWithoutScan(ARRAY_SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); @@ -126,25 +126,25 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + count = StreamCompaction::CPU::compactWithScan(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + count = StreamCompaction::Efficient::compact(ARRAY_SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(ARRAY_SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94aa..d66efb0d 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -53,7 +53,8 @@ void genArray(int n, int *a, int maxval) { srand(time(nullptr)); for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; + // a[i] = rand() % maxval; + a[i] = i; } } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..9e178f03 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n) return; + + bools[idx] = (idata[idx] != 0) ? 1 : 0; } /** @@ -32,7 +36,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) return; + + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..bd0779e3 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -66,6 +67,29 @@ namespace StreamCompaction { time_start_cpu = std::chrono::high_resolution_clock::now(); } + void pauseGpuTimer() + { + if (gpu_timer_started) + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + float elapsed = 0.f; + + cudaEventElapsedTime(&elapsed, event_start, event_end); + prev_elapsed_time_gpu_milliseconds += elapsed; + } + + } + + void continueGpuTimer() + { + if (gpu_timer_started) + { + cudaEventRecord(event_start); + } + + } + void endCpuTimer() { time_end_cpu = std::chrono::high_resolution_clock::now(); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..cdc6da5a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,11 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +34,16 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int oIdx = 0; + + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[oIdx] = idata[i]; + oIdx++; + } + } timer().endCpuTimer(); - return -1; + return oIdx; } /** @@ -42,9 +53,40 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* tempData = new int[n]; + int* scanData = new int[n]; + + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + tempData[i] = 1; + } + else { + tempData[i] = 0; + } + } + + scanData[0] = 0; + for (int i = 1; i < n; ++i) { + scanData[i] = tempData[i - 1] + scanData[i - 1]; + } + + for (int i = 0; i < n; ++i) { + if (tempData[i] != 0) { + odata[scanData[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + int scanResult = tempData[n - 1] + scanData[n - 1]; + + delete[] tempData; + delete[] scanData; + + return scanResult; } + + } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..32b0a22e 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include + +#define blockSize 256 namespace StreamCompaction { namespace Efficient { @@ -12,29 +15,151 @@ namespace StreamCompaction { return timer; } + // sweep them up + __global__ void kernelUpSweep(int n, int* data, int step) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n / step) return; + + idx *= step; + data[idx + step - 1] += data[idx + (step >> 1) - 1]; + } + + // sweep them down + __global__ void kernelDownSweep(int n, int* data, int step) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n / step) return; + + idx *= step; + int temp = data[idx + (step >> 1) - 1]; + data[idx + (step >> 1) - 1] = data[idx + step - 1]; + data[idx + step - 1] += temp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int* odata, const int* idata, bool fromCompact) { + if (n <= 0) { + return; + } + + int logn = ilog2ceil(n); + int nPadded = 1 << logn; + + int* dev_data; + cudaMalloc((void**)&dev_data, nPadded * sizeof(int)); + checkCUDAError("scan: cudaMalloc for dev_data failed"); + + // copy direction is based on where it's coming from + cudaMemcpyKind kind = fromCompact ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; + cudaMemcpy(dev_data, idata, n * sizeof(int), kind); + checkCUDAError("scan: Initial cudaMemcpy failed"); + + // make the padded region 0 (if it exists) + if (nPadded > n) { + cudaMemset(dev_data + n, 0, (nPadded - n) * sizeof(int)); + checkCUDAError("scan: cudaMemset for padding failed"); + } + + if (!fromCompact) { + timer().startGpuTimer(); + } + + // Up up sweep + for (int d = 0; d < logn; ++d) { + int step = 1 << (d + 1); + int numThreads = nPadded / step; + dim3 blocksPerGrid((numThreads + blockSize - 1) / blockSize); + kernelUpSweep << > > (nPadded, dev_data, step); + } + + // exclusive scan, last element is 0 + cudaMemset(dev_data + nPadded - 1, 0, sizeof(int)); + + // Down down sweet + for (int d = logn - 1; d >= 0; --d) { + int step = 1 << (d + 1); + int numThreads = nPadded / step; + dim3 blocksPerGrid((numThreads + blockSize - 1) / blockSize); + kernelDownSweep << > > (nPadded, dev_data, step); + } + + if (!fromCompact) { + timer().endGpuTimer(); + } + + // Copy result + cudaMemcpyKind finalKind = fromCompact ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost; + cudaMemcpy(odata, dev_data, n * sizeof(int), finalKind); + checkCUDAError("scan: Final cudaMemcpy failed"); + + cudaFree(dev_data); } /** * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + if (n <= 0) { + return 0; + } + + int logn = ilog2ceil(n); + int nPadded = 1 << logn; + const size_t padded_bytes = nPadded * sizeof(int); + + // buffers setup + int* dev_idata; + int* dev_Bools; + int* dev_odata; + int* scanData; + cudaMalloc((void**)&dev_idata, padded_bytes); + cudaMalloc((void**)&dev_Bools, padded_bytes); + cudaMalloc((void**)&dev_odata, padded_bytes); + cudaMalloc((void**)&scanData, padded_bytes); + checkCUDAError("compact: cudaMalloc failed"); + + // Copy host data to device and pad with zeros + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + if (nPadded > n) { + cudaMemset(dev_idata + n, 0, (nPadded - n) * sizeof(int)); + } + timer().startGpuTimer(); - // TODO + + int gridSize = (nPadded + blockSize - 1) / blockSize; + + // Step 1: mark en + Common::kernMapToBoolean << > > (nPadded, dev_Bools, dev_idata); + + // Step 2: scan em + scan(nPadded, scanData, dev_Bools, true); + + // Step 3: scatter em + Common::kernScatter << > > (nPadded, dev_odata, dev_idata, dev_Bools, scanData); + timer().endGpuTimer(); - return -1; + + // Get the final count of non-zero elements from the last valid element + int lastBool = 0, lastScan = 0; + if (n > 0) { + cudaMemcpy(&lastBool, dev_Bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastScan, scanData + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + } + int count = lastBool + lastScan; + + // Copy the final compacted array back to the host + cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost); + + // LET THEM BE FREEEE + cudaFree(dev_Bools); + cudaFree(scanData); + cudaFree(dev_idata); + cudaFree(dev_odata); + + return count; } } -} +} \ No newline at end of file diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..4677a6a5 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool fromCompact=false); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..d7f772d8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,60 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void naiveScan(int n, int *odata, const int *idata, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) return; + if (idx >= (1 << (d - 1))) { + odata[idx] = idata[idx - (1 << (d - 1))] + idata[idx]; + } + else { + odata[idx] = idata[idx]; + } + + } + + __global__ void exclusiveScan(int n, int* odata, const int* idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + + if (idx >= n) return; + + if (idx > 0) { + odata[idx] = idata[idx - 1]; + } + else { + odata[idx] = 0; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* tempIn; + int* tempOut; + cudaMalloc((void**)&tempIn, n * sizeof(int)); + cudaMalloc((void**)&tempOut, n * sizeof(int)); + + cudaMemcpy(tempIn, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + int blockSize = 128; + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + for (int d = 1; d <= ilog2ceil(n); ++d) { + naiveScan << < blocksPerGrid, blockSize >> > (n, tempOut, tempIn, d); + std::swap(tempOut, tempIn); + } + + exclusiveScan << < blocksPerGrid, blockSize >> > (n, tempOut, tempIn); + timer().endGpuTimer(); + + cudaMemcpy(odata, tempOut, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(tempIn); + cudaFree(tempOut); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..a1d8ab19 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,10 +1,10 @@ #include #include +#include "common.h" +#include "thrust.h" #include #include #include -#include "common.h" -#include "thrust.h" namespace StreamCompaction { namespace Thrust { @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::host_vector h_in(idata, idata + n); + + + thrust::device_vector d_in = h_in; + thrust::device_vector d_out(n); + timer().startGpuTimer(); + + thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin()); + timer().endGpuTimer(); + thrust::copy(d_out.begin(), d_out.end(), odata); + } } }