diff --git a/README.md b/README.md index 0e38ddb..f97b136 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,249 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xiaoyue Ma + * [LinkedIn](https://www.linkedin.com/in/xiaoyue-ma-6b268b193/) +* Tested on: Windows 10, i7-12700H @ 2.30 GHz 16GB, GTX3060 8GB -### (TODO: Your README) +# Table of Contents +[Overview](#overview) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +[Features](#features) +[Implementation](#implementation) + +[Result](#result) + +[Performance Analysis](#performanceAnalysis) + +[Extra Credit](#extraCredit) + + + +# Overview + +This project is mainly about Parallel Scan (prefix sum) and Stream Compaction algorithms implemented with CUDA. Scan is about computing the prefix sum of an array, and stream compaction is about deleting all elements in an array that meet a certain condition. These algorithms seem to be inherently sequential at first glance, but with a GPU we can convert them into very efficient parallel algorithms. + +One application of Parallel Scan is the Summed Area Table, which is a very important algorithm in real-time rendering, especially for pre-computation. Another one is Radix Sort, which is a sorting algorithm that can run in parallel. Stream Compaction is very important in ray tracing, where it can help delete unnecessary rays. + +The data is computed in Release mode. 
+ + +# Features +* CPU Version Scan +* CPU Version Compaction with/without Scan +* GPU Naive Version Scan +* GPU Work-efficient Version Scan +* GPU Work-efficient Version Compaction with Scan +* Thrust Version Scan/Compaction +* GPU Version Scan Optimization (Extra Credit) +* Radix Sort (Extra Credit) + + + +# Implementation +**TBD** + + + +# Result + +* Input array Size 2^22 +* For Scan and compaction, max element value is 50 +* For Sort, max element value is 7 +``` + +**************** +** SCAN TESTS ** +**************** + [ 7 5 8 15 24 5 45 33 9 19 33 37 40 ... 18 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 7.7637ms (std::chrono Measured) + [ 0 7 12 20 35 59 64 109 142 151 170 203 240 ... 102732720 102732738 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 7.9926ms (std::chrono Measured) + [ 0 7 12 20 35 59 64 109 142 151 170 203 240 ... 102732598 102732638 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 2.83472ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 2.75386ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 2.58877ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 2.49437ms (CUDA Measured) + passed +==== work-efficient scanOptimized, power-of-two ==== + elapsed time: 1.47747ms (CUDA Measured) + passed +==== work-efficient scanOptimized, non-power-of-two ==== + elapsed time: 1.34262ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.615424ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.500736ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 0 1 2 3 1 1 1 3 1 3 0 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 10.3321ms (std::chrono Measured) + [ 1 3 1 2 3 1 1 1 3 1 3 2 2 ... 
1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 10.7476ms (std::chrono Measured) + [ 1 3 1 2 3 1 1 1 3 1 3 2 2 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 22.0692ms (std::chrono Measured) + [ 1 3 1 2 3 1 1 1 3 1 3 2 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 3.81203ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 4.11203ms (CUDA Measured) + passed +==== work-efficient compactOptimized, power-of-two ==== + elapsed time: 2.52003ms (CUDA Measured) + passed +==== work-efficient compactOptimized, non-power-of-two ==== + elapsed time: 2.6936ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** + [ 4 3 2 6 4 3 1 5 2 5 6 6 2 ... 6 0 ] +==== cpu std::sort, power-of-two ==== + elapsed time: 46.5172ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 6 6 ] +==== cpu std::sort, non-power-of-two ==== + elapsed time: 45.6927ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 6 6 ] +==== radix sort, power-of-two ==== + elapsed time: 15.2273ms (CUDA Measured) + passed +==== radix sort, non-power-of-two ==== + elapsed time: 14.2143ms (CUDA Measured) + passed +``` + + + +# Performance Analysis + +As seen in this graph, **blockSize = 128** has the general best performance: + +![BlockSize Different](result/blocksize.png) + + +![Scan Power of two](result/scanPot.png) + +Thrust's implementation consistently delivers the top performance. Its sort function is not just a simple quicksort or mergesort. Behind the scenes, Thrust might choose the best sorting strategy based on the input data and the specific GPU hardware. Like remove function in Thrust container, it re-partition the element order instead of deleting the element directly. So I guess Thrust's implementation has great Optimized Algorithms, Memory Management and Parallel Patterns. 
+ +![Scan Not Power Of Two](result/scanNPot.png) + +In the Scan operation, when handling smaller arrays (below 2^12), the CPU consistently delivers better performance than the GPU, irrespective of the GPU's scanning method, be it naive or work-efficient. Yet, as array sizes surpass 2^16, the GPU, with its naive scan, takes the lead, capitalizing on the strengths of parallel computation for handling large datasets. In such cases, following the GPU's naive scan in performance ranking is its work-efficient scan, with the CPU trailing noticeably with extremely large datasets. Interestingly, performance doesn't noticeably differ whether the array sizes are power-of-two or not. This uniformity is attributed to the GPU's approach of padding non-power-of-two arrays to their closest power-of-two counterpart, and the CPU maintaining its efficiency even with slight variations in array sizes. + + +![Compaction Power of two](result/compactPot.png) + +![Compaction Not Power Of Two](result/compactNPot.png) + +With small datasets, the CPU is faster than the GPU. But as data size increases, the GPU's efficiency, especially using the Work Efficient Scan, surpasses the CPU's. This is accentuated in tasks like compaction where parallelism shines. Conversely, the CPU's Compact With Scan lags because it tackles more tasks without benefiting from parallel processing. + +![Sort Power Of Two](result/sort.png) + +The CPU employs std::sort with an O(N log(N)) complexity. Yet, for inputs beyond 2^17, the GPU's Radix sort outperforms it (still slower than Thrust). + + + +# Extra Credit +### Why is My GPU Approach So Slow? + +* Global Memory Access: A primary factor causing our GPU to lag behind the CPU in speed is its reliance on global memory for computations. Accessing data from global memory can significantly hamper performance, even if the GPU's parallel processing capability offers advantages with large arrays. 
To enhance the performance, we should transition to using shared memory rather than global memory and employ an algorithm designed to minimize bank conflicts. + +* More Threads Than Needed: The original launches more threads than necessary, especially in the deeper levels of the sweep. Many of these threads do not perform any useful work but still contribute to launch overhead. + +* Overhead of Modulo Operation: The use of the modulo operation (index % (2 * path) == 0) in the original methods is computationally more expensive than a simple boundary check. + + +``` +__global__ void kernUpSweepOptimized(int n, int d, int* odata) { + int path = 1 << d; + int index = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * path; + + if (index + 2 * path - 1 < n) { + odata[index + 2 * path - 1] += odata[index + path - 1]; + } + } + +__global__ void kernDownSweepOptimized(int n, int d, int* odata) { + //same optimization as above +} + +void scanUpDownSweepOptimized(int n, int* odata) { + for (int d = 0; d <= ilog2ceil(n) - 1; ++d) { + int threadsNeeded = n / (2 << d); + dim3 BlockDim = (threadsNeeded + BlockSize - 1) / BlockSize; + + kernUpSweepOptimized << < BlockDim, BlockSize >> > (n, d, odata); + } + ... + + //Same code + + ... +} + ``` + +**For my optimization:** + +* Thread Index Calculation: + + * Original (kernUpSweep & kernDownSweep): The thread index is calculated as a combination of the block's index and the thread's index within that block: int index = threadIdx.x + (blockIdx.x * blockDim.x); + * Optimized (kernUpSweepOptimized & kernDownSweepOptimized): The thread index is calculated differently: int index = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * path;. Each thread in the optimized version is responsible for twice as many elements as in the original, which means fewer threads are needed to handle the same amount of data. + +* Index Conditions for Operations: + + * Original: Operations are conditioned on index % (2 * path) == 0. 
This is a more general case that applies to each thread. + * Optimized: Operations are conditioned on boundary checks, i.e., index + 2 * path - 1 < n. Given the adjusted index calculation, the optimized version has a more straightforward condition that checks if operations will stay within the array bounds. + +* Thread and Block Launch Calculations: + + * Original (scanUpDownSweep): The block dimensions are calculated based on the BlockSize, without considering the depth d of the sweep. + * Optimized (scanUpDownSweepOptimized): The block dimensions are calculated based on both the BlockSize and the depth d. Specifically, int threadsNeeded = n / (2 << d); determines the actual number of threads required for each depth, potentially reducing the number of threads launched, especially in the deeper levels of the sweep where fewer updates are needed. + +In general, even though I didn't use shared memory, the optimized version still runs in much less time than the original GPU version. + +### Radix Sort +I've crafted a radix sort algorithm in the efficient.cu file and benchmarked it against the std::sort function on the CPU. The results show that my implementation begins to outpace the CPU-based sort when the input exceeds 2^16 elements. + +This is the result for ArraySize = 2^22: +``` +***************************** +** RADIX SORT TESTS ** +***************************** + [ 4 3 2 6 4 3 1 5 2 5 6 6 2 ... 6 0 ] +==== cpu std::sort, power-of-two ==== + elapsed time: 46.5172ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 6 6 ] +==== cpu std::sort, non-power-of-two ==== + elapsed time: 45.6927ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 6 6 ] +==== radix sort, power-of-two ==== + elapsed time: 15.2273ms (CUDA Measured) + passed +==== radix sort, non-power-of-two ==== + elapsed time: 14.2143ms (CUDA Measured) + passed +``` +The max element value is 7, because Radix Sort is only suitable for sorting small numbers. 
If I set the max element value to be 50, Radix Sort will be slow. diff --git a/result/blocksize.png b/result/blocksize.png new file mode 100644 index 0000000..905a8e6 Binary files /dev/null and b/result/blocksize.png differ diff --git a/result/compactNPot.png b/result/compactNPot.png new file mode 100644 index 0000000..05799ac Binary files /dev/null and b/result/compactNPot.png differ diff --git a/result/compactPot.png b/result/compactPot.png new file mode 100644 index 0000000..c0aff6c Binary files /dev/null and b/result/compactPot.png differ diff --git a/result/scanNPot.png b/result/scanNPot.png new file mode 100644 index 0000000..2e2ebc4 Binary files /dev/null and b/result/scanNPot.png differ diff --git a/result/scanPot.png b/result/scanPot.png new file mode 100644 index 0000000..8bb628f Binary files /dev/null and b/result/scanPot.png differ diff --git a/result/sort.png b/result/sort.png new file mode 100644 index 0000000..46489c6 Binary files /dev/null and b/result/sort.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..ef87942 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,12 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +int* d = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -81,6 +82,20 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scanOptimized, power-of-two"); + StreamCompaction::Efficient::scanOptimized(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scanOptimized, 
non-power-of-two"); + StreamCompaction::Efficient::scanOptimized(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -102,7 +117,7 @@ int main(int argc, char* argv[]) { // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -147,8 +162,55 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient compactOptimized, power-of-two"); + count = StreamCompaction::Efficient::compactOptimized(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compactOptimized, non-power-of-two"); + count = StreamCompaction::Efficient::compactOptimized(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 7); + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu std::sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu std::sort, 
non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, c, true); + + zeroArray(SIZE, d); + printDesc("radix sort, power-of-two"); + StreamCompaction::Efficient::radixSort(SIZE, d, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(SIZE, b, d); + + zeroArray(SIZE, d); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::Efficient::radixSort(NPOT, d, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(NPOT,c, d); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; delete[] c; + delete[] d; } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..3280af4 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) return; + bools[index] = idata[index] ? 
1 : 0; } /** @@ -33,6 +36,12 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (bools[index]) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..bce32b8 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +36,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i]) + { + odata[count] = idata[i]; + ++count; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +58,35 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* scan_arr = new int[n]; + odata[0] = idata[0] ? 1 : 0; + scan_arr[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = idata[i] ? 
1 : 0; + scan_arr[i] = odata[i - 1] + scan_arr[i - 1]; + } + + int count = 0; + for (int i = 0; i < n; ++i) + { + if (odata[i]) + { + odata[scan_arr[i]] = idata[i]; + ++count; + } + } + + timer().endCpuTimer(); + return count; + } + + void sort(int n, int* odata, const int* idata) + { + memcpy(odata, idata, n * sizeof(int)); + timer().startCpuTimer(); + std::sort(odata, odata + n); timer().endCpuTimer(); - return -1; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..222b77a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..149d32f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,7 @@ #include "common.h" #include "efficient.h" +#define BlockSize 256 namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +13,133 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + int path = 1 << d; + if (index % (2 * path) == 0) + { + odata[index + 2 * path - 1] += odata[index + path - 1]; + } + } + + __global__ void kernDownSweep(int n, int d, int* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + int path = 1 << d; + if (index % (2 * path) == 0) + { + int temp = odata[index + path - 1]; + odata[index + path - 1] = odata[index + 2 * path - 1]; + odata[index + 2 * path - 1] += temp; + } + } + + __global__ void kernUpSweepOptimized(int n, int d, int* odata) { + int path = 1 << d; + int index = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * path; + + 
if (index + 2 * path - 1 < n) { + odata[index + 2 * path - 1] += odata[index + path - 1]; + } + } + + __global__ void kernDownSweepOptimized(int n, int d, int* odata) { + int path = 1 << d; + int index = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * path; + + if (index + 2 * path - 1 < n) { + int temp = odata[index + path - 1]; + odata[index + path - 1] = odata[index + 2 * path - 1]; + odata[index + 2 * path - 1] += temp; + } + } + + void scanUpDownSweep(int n, int* odata) + { + dim3 BlockDim((BlockSize + n - 1) / BlockSize); + for (int d = 0; d <= ilog2ceil(n) - 1; ++d) { + kernUpSweep <<< BlockDim, BlockSize >>> (n, d, odata); + } + cudaDeviceSynchronize(); + + cudaMemsetAsync(odata + n - 1, 0, sizeof(int)); + + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { + kernDownSweep <<< BlockDim, BlockSize >>> (n, d, odata); + } + cudaDeviceSynchronize(); + } + + void scanUpDownSweepOptimized(int n, int* odata) { + for (int d = 0; d <= ilog2ceil(n) - 1; ++d) { + int threadsNeeded = n / (2 << d); + dim3 BlockDim = (threadsNeeded + BlockSize - 1) / BlockSize; + + kernUpSweepOptimized << < BlockDim, BlockSize >> > (n, d, odata); + } + cudaDeviceSynchronize(); + + cudaMemsetAsync(odata + n - 1, 0, sizeof(int)); + + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { + int threadsNeeded = n / (2 << d); + dim3 BlockDim = (threadsNeeded + BlockSize - 1) / BlockSize; + + kernDownSweepOptimized <<< BlockDim, BlockSize >>> (n, d, odata); + } + cudaDeviceSynchronize(); + } + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* dev_data; + int arrSize = 1 << ilog2ceil(n); + cudaMalloc((void**)&dev_data, arrSize * sizeof(int)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + scanUpDownSweep(arrSize, dev_data); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + void scanOptimized(int n, int* odata, const int* idata) { + int* dev_data; + int arrSize = 1 << ilog2ceil(n); + cudaMalloc((void**)&dev_data, arrSize * sizeof(int)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + scanUpDownSweepOptimized(arrSize, dev_data); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + void scanForRadixSort(int n, int* odata, const int* idata) { + int* dev_data; + int arrSize = 1 << ilog2ceil(n); + cudaMalloc((void**)&dev_data, arrSize * sizeof(int)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + scanUpDownSweep(arrSize, dev_data); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -31,10 +152,191 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + // TODO + int arrSize = 1 << ilog2ceil(n); + int* dev_bools; + int* dev_indices; + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, arrSize * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + // 1. 
Map the input data to a boolean array + dim3 fullBlocksPerGrid = (n + BlockSize - 1) / BlockSize; + + StreamCompaction::Common::kernMapToBoolean <<>> (n, dev_bools, dev_idata); + cudaDeviceSynchronize(); + + // 2. Compute the exclusive prefix sum on the boolean array + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + cudaMemset(dev_indices + n, 0, (arrSize - n) * sizeof(int)); + scanUpDownSweep(arrSize, dev_indices); + + // The total number of elements after compaction is stored in the last slot of indices + the last slot of bools + int totalSize; + int lastBool, lastIndex; + cudaMemcpy(&lastBool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + totalSize = lastBool + lastIndex; + + cudaMalloc((void**)&dev_odata, totalSize * sizeof(int)); + // 3. Scatter the data to the output array using the index array + StreamCompaction::Common::kernScatter <<>> (n, dev_odata, dev_idata, dev_bools, dev_indices); + cudaDeviceSynchronize(); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, totalSize * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_idata); + cudaFree(dev_odata); + + return totalSize; + } + + int compactOptimized(int n, int* odata, const int* idata) { // TODO + int arrSize = 1 << ilog2ceil(n); + int* dev_bools; + int* dev_indices; + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, arrSize * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + // 1. Map the input data to a boolean array + dim3 fullBlocksPerGrid = (n + BlockSize - 1) / BlockSize; + + StreamCompaction::Common::kernMapToBoolean << > > (n, dev_bools, dev_idata); + cudaDeviceSynchronize(); + + // 2. 
Compute the exclusive prefix sum on the boolean array + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + cudaMemset(dev_indices + n, 0, (arrSize - n) * sizeof(int)); + scanUpDownSweepOptimized(arrSize, dev_indices); + + // The total number of elements after compaction is stored in the last slot of indices + the last slot of bools + int totalSize; + int lastBool, lastIndex; + cudaMemcpy(&lastBool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + totalSize = lastBool + lastIndex; + + cudaMalloc((void**)&dev_odata, totalSize * sizeof(int)); + // 3. Scatter the data to the output array using the index array + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + cudaDeviceSynchronize(); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, totalSize * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_idata); + cudaFree(dev_odata); + + return totalSize; + } + + __global__ void kernComputeBEArr(int n, int pos, const int* idata, int* bArr, int* eArr) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + int bitVal = (idata[index] >> pos) & 1; + bArr[index] = bitVal; + eArr[index] = !bitVal; + } + + __global__ void kernComputeTArr(int n, int totalFalse, const int* fArr, int* tArr) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + tArr[index] = index - fArr[index] + totalFalse; } + + __global__ void kernComputeDArr(int n, const int* bArr, const int* tArr, const int* fArr, int* dArr) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + dArr[index] = bArr[index] ? 
tArr[index] : fArr[index]; + } + + __global__ void kernScatterOutput(int n, const int* dArr, const int* idata, int* odata) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + odata[dArr[index]] = idata[index]; + } + + void radixSort(int n, int* odata, const int* idata) + { + int* dev_idata; + int* dev_odata; + + int* dev_bArr; + int* dev_eArr; + int* dev_fAr; + int* dev_tArr; + int* dev_dArr; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_bArr, n * sizeof(int)); + cudaMalloc((void**)&dev_eArr, n * sizeof(int)); + cudaMalloc((void**)&dev_fAr, n * sizeof(int)); + cudaMalloc((void**)&dev_tArr, n * sizeof(int)); + cudaMalloc((void**)&dev_dArr, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockDim = (n + BlockSize - 1) / BlockSize; + + int totalFalse = 0; + int maxVal = *(std::max_element(idata, idata + n)); + + timer().startGpuTimer(); + for (int pos = 0; pos < ilog2ceil(maxVal); pos++) { + kernComputeBEArr <<>> (n, pos, dev_idata, dev_bArr, dev_eArr); + //cudaDeviceSynchronize(); + + scanForRadixSort(n, dev_fAr, dev_eArr); + + //Compute TotalFalse + int e, f; + cudaMemcpy(&e, dev_eArr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&f, dev_fAr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + totalFalse = e + f; + + kernComputeTArr <<< blockDim, BlockSize >>> (n, totalFalse, dev_fAr, dev_tArr); + //cudaDeviceSynchronize(); + + kernComputeDArr <<< blockDim, BlockSize >>> (n, dev_bArr, dev_tArr, dev_fAr, dev_dArr); + //cudaDeviceSynchronize(); + + kernScatterOutput <<>> (n, dev_dArr, dev_idata, dev_odata); + //cudaDeviceSynchronize(); + std::swap(dev_odata, dev_idata); + } + timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bArr); + cudaFree(dev_eArr); + cudaFree(dev_fAr); + 
cudaFree(dev_tArr); + cudaFree(dev_dArr); + } + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..eea7038 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,7 +7,10 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void scanOptimized(int n, int* odata, const int* idata); int compact(int n, int *odata, const int *idata); + int compactOptimized(int n, int* odata, const int* idata); + void radixSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..996c6a1 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,7 @@ #include "common.h" #include "naive.h" +#define BlockSize 256 namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -13,13 +14,52 @@ namespace StreamCompaction { } // TODO: __global__ - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ - void scan(int n, int *odata, const int *idata) { + __global__ void KernNaiveScan(int n, int d, int* odata, int* idata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) return; + int path = 1 << (d - 1); + if (index >= path) + { + odata[index] = idata[index - path] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + + void scan(int n, int* odata, const int* idata) { + dim3 BlockDim((n + BlockSize - 1) / BlockSize); + + int* dev_odata; + int* dev_odata2; + cudaMalloc((void**)&dev_odata2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata1 failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata2 failed!"); + + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata2, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + int depth = ilog2ceil(n); + timer().startGpuTimer(); - // TODO + + for (int d = 1; d <= depth; ++d) { + KernNaiveScan <<>> (n, d, dev_odata, dev_odata2); + cudaDeviceSynchronize(); + int* temp = dev_odata2; + dev_odata2 = dev_odata; + dev_odata = temp; + } + timer().endGpuTimer(); + cudaMemcpy(odata + 1, dev_odata2, (n - 1) *sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + cudaFree(dev_odata); + cudaFree(dev_odata2); } + } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..25835e6 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + + timer().startGpuTimer(); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }