diff --git a/README.md b/README.md index 0e38ddb1..68c297b2 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,130 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Hongyi Ding + * [LinkedIn](https://www.linkedin.com/in/hongyi-ding/), [personal website](https://johnnyding.com/) +* Tested on: Windows 11, i7-12700 @ 2.10GHz 32GB, NVIDIA T1000 4GB (SEAS Virtual Lab) -### (TODO: Your README) +### Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +In this project, we implemented scan & stream compaction algorithms in different ways. We intend to compare the performance of different implementations, including cpu scan & compation, naive gpu scan, and scanning using the thrust library. +### Performance Analysis + +This table shows the time needed for scanning over the number of numbers using different ways. The time is recorded in ms. + +| N | CPU | Naive GPU | Efficient GPU | thrust | +| ---- | ------- | --------- | ------------- | -------- | +| 2^10 | 0.0016 | 1.68691 | 1.38016 | 0.981888 | +| 2^15 | 0.054 | 1.84899 | 1.37459 | 1.06662 | +| 2^20 | 1.7093 | 4.20902 | 2.68237 | 1.34141 | +| 2^25 | 52.4132 | 86.1861 | 27.3095 | 3.1232 | +| 2^26 | 105.806 | 171.778 | 52.3393 | 5.23088 | +| 2^27 | 206.849 | 353.931 | 102.907 | 8.9111 | +| 2^28 | 424.237 | 703.445 | 204.032 | 16.5718 | +| 2^29 | 816.657 | 3192.93 | 407.458 | 78.364 | +| 2^30 | 1662.63 | 14334 | 2523.06 | 404.013 | + +This table shows the time needed for compaction over the number of numbers using different ways. The time is recorded in ms. + +| N | CPU without scan | CPU with scan | Efficient GPU | +| ---- | ---------------- | ------------- | ------------- | +| 2^10 | 0.0025 | 0.0042 | 0.206528 | +| 2^15 | 0.067 | 0.1381 | 0.471424 | +| 2^20 | 2.2095 | 4.8973 | 1.34582 | +| 2^25 | 70.2928 | 156.767 | 33.5974 | +| 2^26 | 143.941 | 323.207 | 65.5928 | +| 2^27 | 282.771 | 652.236 | 132.566 | +| 2^28 | 558.38 | 1376.53 | 281.205 | +| 2^29 | 1003.12 | 2541.53 | 2543.02 | +| 2^30 | 2273.97 | 6898.14 | 11972.5 | + +### Answer to Questions + +* Roughly optimize the block sizes of each of your implementations for minimal + run time on your GPU. + + * This is the table of performance of efficient GPU scan over different block sizes + + | Block Size | 128 | 256 | 512 | 768 | 1024 | + | ----------- | ------- | ------- | ------ | ------- | ------- | + | Performance | 53.6786 | 52.3393 | 52.817 | 52.2205 | 52.7176 | + + The difference is minor for different block sizes. So we just take the common block size of 256. + +* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and + Thrust) to the serial CPU version of Scan. Plot a graph of the comparison + (with array size on the independent axis). + + * ![scan-performance](pics/scan-performance.png) + * ![compaction-performance](pics/compaction-performance.png) + +* Write a brief explanation of the phenomena you see here. + + * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is + it different for each implementation? + * When N is small (`<2^20`), for all 3 GPU implementations, the overhead of initializing memory and transferring data is high. As we can observe in the graph, when `N<2^20`, cpu version has a better performance. So in this situation, the bottleneck is initializing and Host-Device communication. + * When N becomes larger (`2^20` to `2^28`), the gpu implementations are much better. The parallelism is making the the algorithm perform better than the single-thread version on gpu. + * When N grows very large (`>2^28`), due to limited GPU memory (which is only 4GB on the test machine), the program has to use shared memory, then the memory I/O has a much larger latency. So in this case, all 3 gpu implementations begin to perform worse and worse, even worse than the cpu version. + +* Paste the output of the test program into a triple-backtick block in your + README. + + * this is the output when `N=1<<26` + + ``` + **************** + ** SCAN TESTS ** + **************** + [ 0 9 47 18 42 18 43 43 37 33 6 43 41 ... 15 0 ] + ==== cpu scan, power-of-two ==== + elapsed time: 105.806ms (std::chrono Measured) + [ 0 0 9 56 74 116 134 177 220 257 290 296 339 ... 1643626790 1643626805 ] + ==== cpu scan, non-power-of-two ==== + elapsed time: 106.817ms (std::chrono Measured) + [ 0 0 9 56 74 116 134 177 220 257 290 296 339 ... 1643626693 1643626732 ] + passed + ==== naive scan, power-of-two ==== + elapsed time: 171.778ms (CUDA Measured) + passed + ==== naive scan, non-power-of-two ==== + elapsed time: 160.932ms (CUDA Measured) + passed + ==== work-efficient scan, power-of-two ==== + elapsed time: 52.3393ms (CUDA Measured) + passed + ==== work-efficient scan, non-power-of-two ==== + elapsed time: 50.5345ms (CUDA Measured) + passed + ==== thrust scan, power-of-two ==== + elapsed time: 5.23088ms (CUDA Measured) + passed + ==== thrust scan, non-power-of-two ==== + elapsed time: 4.78698ms (CUDA Measured) + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 2 0 0 1 1 1 2 2 0 1 2 1 2 ... 0 0 ] + ==== cpu compact without scan, power-of-two ==== + elapsed time: 143.941ms (std::chrono Measured) + [ 2 1 1 1 2 2 1 2 1 2 1 1 1 ... 3 1 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + elapsed time: 139.827ms (std::chrono Measured) + [ 2 1 1 1 2 2 1 2 1 2 1 1 1 ... 3 1 ] + passed + ==== cpu compact with scan ==== + elapsed time: 323.207ms (std::chrono Measured) + [ 2 1 1 1 2 2 1 2 1 2 1 1 1 ... 3 1 ] + passed + ==== work-efficient compact, power-of-two ==== + elapsed time: 65.5928ms (CUDA Measured) + passed + ==== work-efficient compact, non-power-of-two ==== + elapsed time: 65.4395ms (CUDA Measured) + passed + ``` + + \ No newline at end of file diff --git a/pics/compaction-echarts.js b/pics/compaction-echarts.js new file mode 100644 index 00000000..6b24a56e --- /dev/null +++ b/pics/compaction-echarts.js @@ -0,0 +1,25 @@ +const CPU_without_scan = [0.0025, 0.067, 2.2095, 70.2928, 143.941, 282.771, 558.38, 1003.12, 2273.97]; +const CPU_with_scan = [0.0042, 0.1381, 4.8973, 156.767, 323.207, 652.236, 1376.53, 2541.53, 6898.14]; +const Efficient_GPU = [0.206528, 0.471424, 1.34582, 33.5974, 65.5928, 132.566, 281.205, 2543.02, 11972.5]; + +const N = [10, 15, 20, 25, 26, 27, 28, 29, 30]; +const N_values = N.map(v => v); +const offset= 3; + +function logAndOffset(arr, offset) { + return arr.map(v => Math.log10(v) + offset); +} + +option = { + title: { + text: 'Compaction Performance' + }, + legend: { data: ['CPU_without_scan', 'CPU_with_scan', 'Efficient_GPU'] }, + xAxis: { type: 'value', name: 'log10(N)', min: 10, max: 30 }, + yAxis: { type: 'value', name: 'log10(Time)' }, + series: [ + { name: 'CPU_without_scan', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU_without_scan, offset)[i]]) }, + { name: 'CPU_with_scan', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU_with_scan, offset)[i]]) }, + { name: 'Efficient_GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(Efficient_GPU, offset)[i]]) }, + ] +}; \ No newline at end of file diff --git a/pics/compaction-performance.png b/pics/compaction-performance.png new file mode 100644 index 00000000..2ffbe177 Binary files /dev/null and b/pics/compaction-performance.png differ diff --git a/pics/scan-echarts.js b/pics/scan-echarts.js new file mode 100644 index 00000000..3a3badac --- /dev/null +++ b/pics/scan-echarts.js @@ -0,0 +1,27 @@ +const CPU = [0.0016, 0.054, 1.7093, 52.4132, 105.806, 206.849, 424.237, 816.657, 1662.63]; +const NaiveGPU = [1.68691, 1.84899, 4.20902, 86.1861, 171.778, 353.931, 703.445, 3192.93, 14334]; +const EfficientGPU = [1.38016, 1.37459, 2.68237, 27.3095, 52.3393, 102.907, 204.032, 407.458, 2523.06]; +const Thrust = [0.981888, 1.06662, 1.34141, 3.1232, 5.23088, 8.9111, 16.5718, 78.364, 404.013]; + +const N = [10, 15, 20, 25, 26, 27, 28, 29, 30]; +const N_values = N.map(v => v); +const offset= 3; + +function logAndOffset(arr, offset) { + return arr.map(v => Math.log10(v) + offset); +} + +option = { + title: { + text: 'Scan Performance' + }, + legend: { data: ['CPU', 'Naive GPU', 'Efficient GPU', 'Thrust'] }, + xAxis: { type: 'value', name: 'log10(N)', min: 10, max: 30 }, + yAxis: { type: 'value', name: 'log10(Time)' }, + series: [ + { name: 'CPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU, offset)[i]]) }, + { name: 'Naive GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(NaiveGPU, offset)[i]]) }, + { name: 'Efficient GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(EfficientGPU, offset)[i]]) }, + { name: 'Thrust', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(Thrust, offset)[i]]) }, + ] +}; \ No newline at end of file diff --git a/pics/scan-performance.png b/pics/scan-performance.png new file mode 100644 index 00000000..f4ed8dfb Binary files /dev/null and b/pics/scan-performance.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..1369212d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 25; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..c1a8eab6 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -20,9 +20,11 @@ list(SORT sources) source_group(Headers FILES ${headers}) source_group(Sources FILES ${sources}) +find_package(CCCL REQUIRED) add_library(stream_compaction ${sources} ${headers}) +target_link_libraries(stream_compaction CCCL::Thrust) if(CMAKE_VERSION VERSION_LESS "3.23.0") - set_target_properties(stream_compaction} PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES OFF) elseif(CMAKE_VERSION VERSION_LESS "3.24.0") set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) else() diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..04c7529f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,14 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (idata[index] != 0) { + bools[index] = 1; + } else { + bools[index] = 0; + } + } } /** @@ -32,7 +39,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..7cbfaa96 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int i = 0, j = 0; + while (i < n) { + if (idata[i] != 0) { + odata[j++] = idata[i]; + } + i++; + } timer().endCpuTimer(); - return -1; + return j; } /** @@ -41,10 +50,25 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* b = new int[n], * sum = new int[n]; timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) { + b[i] = (idata[i] != 0) ? 1 : 0; + } + sum[0] = 0; + for (int i = 1; i < n; i++) { + sum[i] = sum[i - 1] + b[i - 1]; + } + int count = (n > 0) ? sum[n - 1] + b[n - 1] : 0; + for (int i = 0; i < n; i++) { + if (b[i] == 1) { + odata[sum[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + delete[] b; + delete[] sum; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..be417acb 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include + +#define NUM_PER_BLOCK 1024 namespace StreamCompaction { namespace Efficient { @@ -12,13 +15,116 @@ namespace StreamCompaction { return timer; } + int nextPow2(int n) { + int ret = 1; + while (ret < n) ret <<= 1; + return ret; + } + + __global__ void scan(int n, int *idata) { + __shared__ int temp[NUM_PER_BLOCK]; + n = NUM_PER_BLOCK; + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) { + temp[index] = idata[index]; + } + + int offset = 1; + for (int d = n>>1; d > 0; d >>= 1) { + if (index < d) { + int ai = offset * (2 * index + 2) - 1; + int bi = offset * (2 * index + 1) - 1; + if (ai >= n || bi >= n) continue; + temp[ai] += temp[bi]; + } + offset <<= 1; + __syncthreads(); + } + + // clear the last element + if (index == 0) { + temp[n - 1] = 0; + } + __syncthreads(); + + offset = n >> 1; + for (int d = 1; d < n; d <<= 1) { + if (index < d) { + int ai = offset * (2 * index + 2) - 1; + int bi = offset * (2 * index + 1) - 1; + if (ai >= n || bi >= n) continue; + int t = temp[bi]; + temp[bi] = temp[ai]; + temp[ai] += t; + } + offset >>= 1; + __syncthreads(); + } + + if (index < n) { + idata[index] = temp[index]; + } + } + + __global__ void upsweep(int n, int* data, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int offset = (n >> 1) / d; + if (index < d) { + int ai = offset * (2 * index + 2) - 1; + int bi = offset * (2 * index + 1) - 1; + data[ai] += data[bi]; + } + } + + __global__ void downsweep(int n, int* data, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int offset = (n >> 1) / d; + if (index < d) { + int ai = offset * (2 * index + 2) - 1; + int bi = offset * (2 * index + 1) - 1; + int t = data[bi]; + data[bi] = data[ai]; + data[ai] += t; + } + } + + void scanOnDevice(int N, int* d_idata) { + int blockSize = 256; + int numBlocks = (N + blockSize - 1) / blockSize; + for (int d = N >> 1; d > 0; d >>= 1) { + upsweep << <(d + blockSize - 1) / blockSize, blockSize >> > (N, d_idata, d); + } + //cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + //for (int i = 0; i < n; i++) { + // std::cout << odata[i] << " "; + //} + //std::cout << std::endl; + cudaMemset(d_idata + N - 1, 0, sizeof(int)); + for (int d = 1; d < N; d <<= 1) { + downsweep << <(d + blockSize - 1) / blockSize, blockSize >> > (N, d_idata, d); + } + //cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + //for (int i = 0; i < n; i++) { + // std::cout << odata[i] << " "; + //} + //std::cout << std::endl; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *d_idata; + int N = nextPow2(n); + cudaMalloc((void**)&d_idata, N * sizeof(int)); + cudaMemset(d_idata, 0, N * sizeof(int)); + cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + scanOnDevice(N, d_idata); timer().endGpuTimer(); + cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_idata); } /** @@ -31,10 +137,32 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int* d_bools; + int* d_indices; + int* d_idata; + int* d_odata; + int N = nextPow2(n); + cudaMalloc((void**)&d_bools, n * sizeof(int)); + cudaMalloc((void**)&d_indices, N * sizeof(int)); + cudaMalloc((void**)&d_idata, n * sizeof(int)); + cudaMalloc((void**)&d_odata, n * sizeof(int)); + cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + int blockSize = 256; + StreamCompaction::Common::kernMapToBoolean << <(n + blockSize - 1) / blockSize, blockSize >> > (n, d_bools, d_idata); + cudaMemcpy(d_indices, d_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + scanOnDevice(N, d_indices); + StreamCompaction::Common::kernScatter << < (n + blockSize - 1) / blockSize, blockSize >> > (n, d_odata, d_idata, d_bools, d_indices); timer().endGpuTimer(); - return -1; + int sum; + cudaMemcpy(&sum, d_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + if (idata[n - 1] != 0) sum++; + cudaMemcpy(odata, d_odata, sum * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_bools); + cudaFree(d_indices); + cudaFree(d_idata); + cudaFree(d_odata); + return sum; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..93ff151f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,39 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void scan(int n, int *odata, const int *idata, int offset) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + if (index >= offset) { + odata[index] = idata[index - offset] + idata[index]; + } else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* d_idata, * d_odata; + cudaMalloc((void**)&d_idata, n * sizeof(int)); + cudaMalloc((void**)&d_odata, n * sizeof(int)); + cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + for (int offset = 1; offset < n; offset *= 2) { + scan << > > (n, d_odata, d_idata, offset); + std::swap(d_odata, d_idata); + } + cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = n - 1; i > 0; i--) { + odata[i] = odata[i - 1]; + } + odata[0] = 0; timer().endGpuTimer(); + cudaFree(d_idata); + cudaFree(d_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..d4fa9641 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector h_idata(idata, idata + n); + thrust::device_vector d_idata = h_idata; + thrust::device_vector d_odata(n); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(d_idata.begin(), d_idata.end(), d_odata.begin()); timer().endGpuTimer(); + thrust::copy(d_odata.begin(), d_odata.end(), odata); } } }