diff --git a/README.md b/README.md index 0e38ddb..834919a 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,164 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Mengxuan Huang + * [LinkedIn](https://www.linkedin.com/in/mengxuan-huang-52881624a/) +* Tested on: Windows 11, i9-13980HX @ 2.22GHz 64.0 GB, RTX4090-Laptop 16384MB -### (TODO: Your README) +# Features +- CPU Scan +- CPU Compact +- GPU Scan Naive +- GPU Scan Efficient (Binary Balanced Tree) +- GPU Compact (use GPU effcient Scan) +- GPU Radix Sort (use GPU effcient Scan) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# GPU Radix Sort +Expect for GPU scan and GPU compact, I also implemented GPU sort using radix sort method. The algorithm iterates from the lowest bit to highest bit and sort numbers based on the the binary value of the bit in current iteration. +![](./img/radix.jpg) +I also implement two optimizations: +- Check whether to skip this iteration +- Check whether all numbers are sorted and we can stop sorting + +![](./img/sort_time.png) +It is noticed that to keep the scale of y-axis reasonable, I did not include $2^{24}$ data of CPU method in the graph. + +The line graph shows that CPU sort is faster than any GPU methods when the number is small. But GPU methods perform much better when number keep increasing. Besides, the performance of Radix sort with optimization is better than the radix sort without optimization. + +# Analysis (Unoptimized) + +| Block Size | +|-----------| +| 32 | + +![](./img/runtime.png) + +It is noticed that to keep the scale of y-axis reasonable, I did not include $2^{24}$ data, which are over 10ms, of CPU method and GPU native in the graph. + +According to the test result, CPU scan methods (looping) perform better than any GPU scan methods when the number of element is small. When the number of element increase, GPU methods require less time to finish scanning. + +For the number of block, I only luanch neceeesary threads in both the naive scan method and the efficient scan method. +- In the naive scan method, only $n - 2^d$ threads needed for each iteration. +- In the up-sweep in the efficient scan method, $n / 2^{d+1}$ threads needed. +- In the down-sweep in the efficient scan method, $2^d$ threads needed. + +## Block Size +Test on $2^{20}$ elements. Run 1000 times and compute the average run time. + +![](./img/blocksize.png) +According to the line graph, the efficient method get best performance at $block size = 1024$, and the naive method get best performance at $block size = 1024$ as well. + +Although I run 1000 times and compute the average run time, I can still get different best block size when I run again. I guess this might because data distribution will also affect the perfermance as well. + +Finally, I guess that the perfermance are best when $block size \geq 32$. + +## Compare +| Block Size | +|-----------| +| 128 | +![](./img/runtime1.png) + +It is noticed that to keep the scale of y-axis reasonable, I did not include $2^{24}$ data, which are over 10ms, of CPU method and GPU native in the graph. + +According to the graph, the CPU method performs better for small number of data, while GPU method run much faster for large amount of data. When $n = 2^20$, my efficient scan run almost as fast as thrust library. But thrust library is much faster when the amount data keep increasing. + +### What thrust::exclusive_scan Do +![](./img/thrust_scan.png) + +The above imamge shown thrust::exclusive_scan firstly luanch *DeviceScanInitKernel* and then the *DeviceScanKernel*, which means it finishes the scan in just two step, which is much more efficient than my "Efficient" scan! +- In the *DeviceScanInitKernel*, it must finish GPU allocation and data copy from Host(CPU) to Device(GPU). +- In the *DeviceScanKernel*, it finish the exclusive scan. + +## Bottlenecks +![](./img/nsight_1.png) +The bottlenecks comes from the memory throughtput as shown in the image. This is the same in both naive method and the efficient method. + +## Output of a test +``` +SM: 76 +Warp Size: 32 +Max Threads/Block Size: 1024 +Max Block/SM: 24 +Max Wave: 1824 + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 27 0 ] + +**************** +** SCAN TESTS ** +**************** +==== thrust scan, power-of-two ==== + elapsed time: 1.07789ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868819 410868846 ] +==== thrust scan, non-power-of-two ==== + elapsed time: 1.05686ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868712 410868730 ] + passed +==== cpu scan, power-of-two ==== + elapsed time: 21.9401ms (std::chrono Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868819 410868846 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 24.0604ms (std::chrono Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868712 410868730 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 13.0793ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868819 410868846 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 12.9254ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 2.50429ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868819 410868846 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 2.21098ms (CUDA Measured) + [ 0 29 46 95 131 134 148 158 174 217 264 292 313 ... 410868712 410868730 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +==== cpu compact without scan, power-of-two ==== + elapsed time: 8.4759ms (std::chrono Measured) + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 43 27 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 7.2072ms (std::chrono Measured) + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 18 46 ] + passed +==== cpu compact with scan ==== + elapsed time: 42.8932ms (std::chrono Measured) + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 2 18 ] + expected 16441514 elements, got 16441511 + FAIL COUNT +==== work-efficient compact, power-of-two ==== + elapsed time: 3.55958ms (CUDA Measured) + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 43 27 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 4.56838ms (CUDA Measured) + [ 29 17 49 36 3 14 10 16 43 47 28 21 8 ... 18 46 ] + passed + +***************************** +** SORT TESTS ** +***************************** +==== CPU sort, power-of-two ==== + elapsed time: 255.996ms (std::chrono Measured) +==== CPU sort, non-power-of-two ==== + elapsed time: 241.058ms (std::chrono Measured) +==== Thrust sort, power-of-two ==== + elapsed time: 5.74099ms (CUDA Measured) +==== Radix sort, power-of-two ==== + elapsed time: 21.6387ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] + passed +==== Thrust sort, non-power-of-two ==== + elapsed time: 4.88704ms (CUDA Measured) +==== Radix sort, non-power-of-two ==== + elapsed time: 31.1871ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] + passed +``` \ No newline at end of file diff --git a/img/blocksize.png b/img/blocksize.png new file mode 100644 index 0000000..469c67d Binary files /dev/null and b/img/blocksize.png differ diff --git a/img/nsight_1.png b/img/nsight_1.png new file mode 100644 index 0000000..b3a50df Binary files /dev/null and b/img/nsight_1.png differ diff --git a/img/radix.jpg b/img/radix.jpg new file mode 100644 index 0000000..992c3c5 Binary files /dev/null and b/img/radix.jpg differ diff --git a/img/runtime.png b/img/runtime.png new file mode 100644 index 0000000..1e04f44 Binary files /dev/null and b/img/runtime.png differ diff --git a/img/runtime1.png b/img/runtime1.png new file mode 100644 index 0000000..6d3acbb Binary files /dev/null and b/img/runtime1.png differ diff --git a/img/sort_time.png b/img/sort_time.png new file mode 100644 index 0000000..e057dbb Binary files /dev/null and b/img/sort_time.png differ diff --git a/img/thrust_scan.png b/img/thrust_scan.png new file mode 100644 index 0000000..dc93e5f Binary files /dev/null and b/img/thrust_scan.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..ba7a278 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,147 +7,219 @@ */ #include +#include #include #include #include +#include + #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two + int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; -int main(int argc, char* argv[]) { - // Scan tests +#define CPU_TEST 1 - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); +#define SCAN 1 +#define COMPACT 1 +#define SORT 1 +int main(int argc, char* argv[]) +{ + StreamCompaction::Common::GetGPUInfo(true); + int count = 1; + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit + for (int k = 1024; k <= 1024; k <<= 1) + { + BlockSize = k; + for (int i = 0; i < count; ++i) + { + // Scan tests +#if SCAN +#if DEBUG + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); +#endif + // initialize b using thrust functions + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + +#if CPU_TEST + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); +#endif + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + onesArray(SIZE, c); + printDesc("1s array for finding bugs"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); */ + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); +#endif + +#if COMPACT +#if DEBUG + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); +#endif + // Compaction tests + + //genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + //a[SIZE - 1] = 0; + //printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using Thrust + +#if CPU_TEST + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); +#endif + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); +#endif + //system("pause"); // stop Win32 console from closing on exit +#if SORT +#if DEBUG + printf("\n"); + printf("*****************************\n"); + printf("** SORT TESTS **\n"); + printf("*****************************\n"); +#endif +#if CPU_TEST + //CPU sort + zeroArray(SIZE, c); + printDesc("CPU sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + + zeroArray(SIZE, c); + printDesc("CPU sort, non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); +#endif + // initialize b using Thrust + // thrust sort + zeroArray(SIZE, b); + printDesc("Thrust sort, power-of-two"); + StreamCompaction::Thrust::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + + // Radix sort + zeroArray(SIZE, c); + printDesc("Radix sort, power-of-two"); + StreamCompaction::Radix::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, b); + printDesc("Thrust sort, non-power-of-two"); + StreamCompaction::Thrust::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + + zeroArray(SIZE, c); + printDesc("Radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); +#endif + } + } + delete[] a; delete[] b; delete[] c; diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..b2c825b 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -6,35 +6,46 @@ #include #include +#define DEBUG 1 +#define PRINT_ARRAY 1 + template int cmpArrays(int n, T *a, T *b) { +#if DEBUG for (int i = 0; i < n; i++) { if (a[i] != b[i]) { printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); return 1; } } +#endif return 0; } void printDesc(const char *desc) { +#if DEBUG printf("==== %s ====\n", desc); +#endif } template void printCmpResult(int n, T *a, T *b) { +#if DEBUG printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); +#endif } template void printCmpLenResult(int n, int expN, T *a, T *b) { +#if DEBUG if (n != expN) { printf(" expected %d elements, got %d\n", expN, n); } printf(" %s \n", (n == -1 || n != expN) ? "FAIL COUNT" : cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); +#endif } void zeroArray(int n, int *a) { @@ -58,6 +69,7 @@ void genArray(int n, int *a, int maxval) { } void printArray(int n, int *a, bool abridged = false) { +#if DEBUG && PRINT_ARRAY printf(" [ "); for (int i = 0; i < n; i++) { if (abridged && i + 2 == 15 && n > 16) { @@ -67,10 +79,13 @@ void printArray(int n, int *a, bool abridged = false) { printf("%3d ", a[i]); } printf("]\n"); +#endif } template void printElapsedTime(T time, std::string note = "") { +#if DEBUG std::cout << " elapsed time: " << time << "ms " << note << std::endl; +#endif } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..eb71175 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..bb773f8 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,6 +14,12 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } +int BlockSize = 128; +int SM_Count = 0; +int Warp_Size = 0; +int MaxThreadPerBlock = 1024; +int MaxBlocksPerSM = 0; +int MaxWave = 0; namespace StreamCompaction { namespace Common { @@ -23,7 +29,10 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) return; + + bools[index] = (idata[index] != 0 ? 1 : 0); } /** @@ -31,9 +40,34 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *indices) { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) return; + + if (idata[index] != 0) + { + odata[indices[index]] = idata[index]; + } } + void GetGPUInfo(bool print) + { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + SM_Count = prop.multiProcessorCount; + Warp_Size = prop.warpSize; + MaxThreadPerBlock = prop.maxThreadsPerBlock; + MaxBlocksPerSM = prop.maxBlocksPerMultiProcessor; + MaxWave = MaxBlocksPerSM * SM_Count; + + if (print) + { + printf("SM: %d\n", SM_Count); + printf("Warp Size: %d\n", Warp_Size); + printf("Max Threads/Block Size: %d\n", MaxThreadPerBlock); + printf("Max Block/SM: %d\n", MaxBlocksPerSM); + printf("Max Wave: %d\n", MaxWave); + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..65f03f1 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,15 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define DYNAMIC_BLOCK_SIZE 0 + +extern int BlockSize; +extern int SM_Count; +extern int Warp_Size; +extern int MaxThreadPerBlock; +extern int MaxBlocksPerSM; +extern int MaxWave; + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -30,12 +39,28 @@ inline int ilog2ceil(int x) { return x == 1 ? 0 : ilog2(x - 1) + 1; } +inline int lowbit(int x) { return x & -x; } +inline int pow2floor(int x) +{ + while (x != lowbit(x)) x += lowbit(x); + return x >> 1; +} + +inline int GetBlockSize(int k) +{ +#if DYNAMIC_BLOCK_SIZE + return std::max(std::min(MaxThreadPerBlock, pow2floor(k / SM_Count)), Warp_Size); +#else + return BlockSize; +#endif +} + namespace StreamCompaction { namespace Common { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); + const int *idata, const int *indices); /** * This class is used for timing the performance @@ -128,5 +153,7 @@ namespace StreamCompaction { float prev_elapsed_time_cpu_milliseconds = 0.f; float prev_elapsed_time_gpu_milliseconds = 0.f; }; + + void GetGPUInfo(bool print = true); } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..08b0c58 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,7 @@ #include +#include #include "cpu.h" +#include #include "common.h" @@ -18,8 +20,13 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + if (n <= 0) return; timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -29,10 +36,17 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { + int index = 0; timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[index++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return index; } /** @@ -41,10 +55,44 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + std::vector label(n); + std::vector pre_sum_exclusive(n); + + timer().startCpuTimer(); + // label each item with 0/1 + for (int i = 0; i < n; ++i) + { + label[i] = (idata[i] != 0 ? 1 : 0); + } + // exclusive scan + pre_sum_exclusive[0] = 0; + for (int i = 1; i < n; ++i) + { + pre_sum_exclusive[i] = pre_sum_exclusive[i - 1] + label[i - 1]; + } + // scatter + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[pre_sum_exclusive[i]] = idata[i]; + } + } + + timer().endCpuTimer(); + return pre_sum_exclusive[n - 1]; + } + /** + * CPU sort std::sort + * + * @returns the number of elements remaining after compaction. + */ + void sort(int n, int* odata, const int* idata) + { + std::memcpy(odata, idata, n * sizeof(int)); timer().startCpuTimer(); - // TODO + std::sort(odata, odata + n); timer().endCpuTimer(); - return -1; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..222b77a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..23e606b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,80 @@ namespace StreamCompaction { return timer; } + __global__ void kernalEfficientScan_UpSweep(int k, int step, int* data) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + + if (index >= k) return; + data[(index + 1) * step - 1] = data[(index + 1) * step - 1] + data[(index + 1) * step - 1 - (step >> 1)]; + } + __global__ void kernalEfficientScan_DownSweep(int k, int step, int* data) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + + if (index >= k) return; + int temp = data[(index + 1) * step - 1]; + data[(index + 1) * step - 1] = data[(index + 1) * step - 1] + data[(index + 1) * step - 1 - (step >> 1)]; + data[(index + 1) * step - 1 - (step >> 1)] = temp; + } + + void EfficientParallelScan(const int& pot_length, int* dev_data) + { + // Up-Sweep + int k = pot_length >> 1; + int step = 2; + while (k > 1) + { + const int block = GetBlockSize(k); + kernalEfficientScan_UpSweep << <(k + block - 1) / block, block >> > (k, step, dev_data); + checkCUDAError("Luanch kernalEfficientScan_UpSweep failed!"); + + k >>= 1; + step <<= 1; + } + //replace last number with 0 + cudaMemset(dev_data + pot_length - 1, 0, sizeof(int)); + + // Down-Sweep + k = 1; + step = pot_length; + while (k < pot_length) + { + const int block = GetBlockSize(k); + kernalEfficientScan_DownSweep << <(k + block - 1) / block, block >> > (k, step, dev_data); + checkCUDAError("Luanch kernalEfficientScan_DownSweep failed!"); + + k <<= 1; + step >>= 1; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int pot_length = pow(2, ilog2ceil(n));// power-of-two length; + + // Malloc necessary space on GPU + int* dev_data; + + cudaMalloc((void**)&dev_data, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + // copy data from CPU to GPU + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata(host) to dev_data(device) failed!"); + timer().startGpuTimer(); - // TODO + EfficientParallelScan(pot_length, dev_data); timer().endGpuTimer(); + + // copy data back to CPU + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_data(device) to odata(host) failed!"); + + // free memory on GPU + cudaFree(dev_data); } /** @@ -31,10 +98,46 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int pot_length = pow(2, ilog2ceil(n));// power-of-two length; + + // Malloc necessary space on GPU + int* dev_data; + int* dev_label; + int* dev_result; + + cudaMalloc((void**)&dev_data, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + cudaMalloc((void**)&dev_label, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_label failed!"); + + cudaMalloc((void**)&dev_result, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_result failed!"); + + // copy data from CPU to GPU + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata(host) to dev_data(device) failed!"); + timer().startGpuTimer(); - // TODO + + Common::kernMapToBoolean << <(pot_length + BlockSize - 1) / BlockSize, BlockSize >> > (pot_length, dev_label, dev_data); + + EfficientParallelScan(pot_length, dev_label); + + Common::kernScatter << <(pot_length + BlockSize - 1) / BlockSize, BlockSize >> > (pot_length, dev_result, dev_data, dev_label); + checkCUDAError("Luanch kernalScatter failed!"); + timer().endGpuTimer(); - return -1; + + int num_remain; + cudaMemcpy(&num_remain, dev_label + pot_length - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_result, num_remain * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + cudaFree(dev_result); + cudaFree(dev_label); + + return num_remain; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5c1ae25 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -9,5 +9,7 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + void EfficientParallelScan(const int& pot_length, int* dev_data); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..9f350cc 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,54 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernalNaiveScan(int n, int start, int* write_arr, const int* read_arr) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) return; + int id = index + start; + write_arr[id] = read_arr[index] + read_arr[id]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // Malloc necessary space on GPU + int* dev_read; + int* dev_write; + + cudaMalloc((void**)&dev_read, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_read failed!"); + cudaMalloc((void**)&dev_write, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_write failed!"); + + // copy data from CPU to GPU + cudaMemcpy(dev_read, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata(host) to dev_read(device) failed!"); + + // naive scan timer().startGpuTimer(); - // TODO + int d = 1; + while (d < n) + { + int k = n - d; + kernalNaiveScan<<<(k + BlockSize - 1) / BlockSize, BlockSize >> >(k, d, dev_write, dev_read); + checkCUDAError("Luanch kernalNaiveScan failed!"); + cudaMemcpy(dev_read + d, dev_write + d, k * sizeof(int), cudaMemcpyDeviceToDevice); + + d <<= 1; + } timer().endGpuTimer(); + + // copy data back to CPU + odata[0] = 0; + cudaMemcpy(odata + 1, dev_read, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_write(device) to odata(host) failed!"); + + // free memory on GPU + cudaFree(dev_read); + cudaFree(dev_write); } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..a242ee8 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include +#include "common.h" +#include "radix.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernalCheckStop(int n, const int* idata, int* stop) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n - 1) return; + + if (idata[index] > idata[index + 1]) (*stop) = 1; + } + + __global__ void kernalRadixMapToBoolean(int n, int k, int* label, const int* idata, int* skip) { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) return; + + int num = idata[index]; + int result = 1 - ((num & (1 << k)) >> k); + if (k == 0 || result != ((num & (1 << (k - 1))) != 0 ? 0 : 1)) + { + *skip = 1; + } + label[index] = result; + } + __global__ void kernalRadixScattering(int n, int k, int start, int* odata, const int* idata, const int* label) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) return; + + bool result = ((idata[index] & (1 << k)) != 0 ? 1 : 0); + if (result) + { + odata[start + index - label[index]] = idata[index]; + } + else + { + odata[label[index]] = idata[index]; + } + } + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void sort(int n, int *odata, const int *idata) + { + int pot_length = pow(2, ilog2ceil(n));// power-of-two length; + + int* dev_read; + int* dev_write; + int* dev_label; + int* dev_number; + + cudaMalloc((void**)&dev_read, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_read failed!"); + cudaMalloc((void**)&dev_write, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_write failed!"); + cudaMalloc((void**)&dev_label, pot_length * sizeof(int)); + checkCUDAError("cudaMalloc dev_label failed!"); + cudaMalloc((void**)&dev_number, sizeof(int)); + checkCUDAError("cudaMalloc dev_number failed!"); + + cudaMemset(dev_read, (1 << 8) - 1, pot_length * sizeof(int)); + cudaMemcpy(dev_read, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata(host) to dev_read(device) failed!"); + timer().startGpuTimer(); + for (int i = 0; i < 32; ++i) + { + // check whether to stop + cudaMemset(dev_number, 0, sizeof(int)); + kernalCheckStop << < (pot_length + BlockSize - 1) / BlockSize, BlockSize >> > (pot_length, dev_read, dev_number); + int stop; + cudaMemcpy(&stop, dev_number, sizeof(int), cudaMemcpyDeviceToHost); + if (stop == 0) break; + + // label and check whether to skip this bit + cudaMemset(dev_number, 0, sizeof(int)); + kernalRadixMapToBoolean << < (pot_length + BlockSize - 1) / BlockSize, BlockSize >> > (pot_length, i, dev_label, dev_read, dev_number); + checkCUDAError("Luanch kernalRadixMapToBoolean failed!"); + + int skip; + cudaMemcpy(&skip, dev_number, sizeof(int), cudaMemcpyDeviceToHost); + if (skip == 0) continue; + + // read the last number of label_1 back + int last_num; + cudaMemcpy(&last_num, dev_label + pot_length - 1, sizeof(int), cudaMemcpyDeviceToHost); + + Efficient::EfficientParallelScan(pot_length, dev_label); + //thrust::device_ptr thrust_dev_label(dev_label); + //thrust::exclusive_scan(thrust_dev_label, thrust_dev_label + pot_length, dev_label); + + int start_index; + cudaMemcpy(&start_index, dev_label + pot_length - 1, sizeof(int), cudaMemcpyDeviceToHost); + start_index += last_num; + + kernalRadixScattering << < (pot_length + BlockSize - 1) / BlockSize, BlockSize >> > (pot_length, i, start_index, dev_write, dev_read, dev_label); + checkCUDAError("Luanch kernalRadixMapToBoolean failed!"); + + std::swap(dev_write, dev_read); + } + timer().endGpuTimer(); + cudaMemcpy(odata, dev_read, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_read); + cudaFree(dev_write); + cudaFree(dev_label); + cudaFree(dev_number); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..aa489d9 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..32bebb9 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include namespace StreamCompaction { namespace Thrust { @@ -18,11 +19,45 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_read; + int* dev_write; + + cudaMalloc((void**)&dev_read, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_read failed!"); + + cudaMalloc((void**)&dev_write, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_write failed!"); + + // copy data from CPU to GPU + cudaMemcpy(dev_read, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata(host) to dev_read(device) failed!"); + + thrust::device_ptr thrust_dev_read(dev_read); + thrust::device_ptr thrust_dev_write(dev_write); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(thrust_dev_read, thrust_dev_read + n, dev_write); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_write, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_write(device) to odata(host) failed!"); + + cudaFree(dev_read); + cudaFree(dev_write); + } + + void sort(int n, int* odata, const int* idata) + { + thrust::device_vector thrust_dev_data(idata, idata + n); + + timer().startGpuTimer(); + thrust::sort(thrust_dev_data.begin(), thrust_dev_data.end()); + timer().endGpuTimer(); + + cudaMemcpy(odata, thrust_dev_data.data().get(), n * sizeof(int), cudaMemcpyDeviceToHost); } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206..38bb58a 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } }