diff --git a/README.md b/README.md index 0e38ddb1..1fcdafec 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,127 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Caroline Fernandes + * [LinkedIn](https://www.linkedin.com/in/caroline-fernandes-0-/), [personal website](https://0cfernandes00.wixsite.com/visualfx) +* Tested on: Windows 11, i9-14900HX @ 2.20GHz, Nvidia GeForce RTX 4070 -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Features & Sections +- [CPU Scan and Stream Compaction](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#cpu) +- [Naive GPU Scan](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#naive) +- [Work Efficient Scan and Compaction](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#work-efficient) +- [Thrust Scan](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#thrust) +- [Optimization Profiling](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#optimization-analysis) +- [Final Output](https://github.com/0cfernandes00/Project2-Stream-Compaction/blob/main/README.md#final-output-for-size--2--8) +![](img/diagram_two.png) + +![](img/diagram_one.png) + +The overarching goals for this project were to +1) Understand and implement Stream Compaction on the GPU +2) Practice parallelizing algorithms + +This implementation of stream compaction is for removing zeroes from an array of ints, but this algorithm will be useful for future uses in removing unhelpful rays from a path tracer. + +In this process, I first implemented these algorithms on the CPU including Scan(Prefix Sum) and built up to a naive, work efficient, and the thrust implementation of stream compaction. +I also optimized my work efficient scan bringing the time on a power-of-2 array from 0.23 ms to 0.08ms for an array size of 2 to the power of 8. + + +### CPU +This section implemented an Exclusive Prefix Sum (Scan), a Stream Compaction without scan, and finally built up to a Stream Compaction with scan. + +Stream Compaction with scan +1) Map the input into a bools array of 0s and 1s +2) Perform a prefix sum of the input array into the bool array +3) The resulting scan (from step 2) will tell which index to scatter the input array to + +The primary performance hit for this implementation was Memory I/O, multiple reads and writes were most impactful as well as multiple passes over the data. +Smaller array sizes made this implementation look fast since there was little computation overhead compared to their parallel counterparts. + +### Naive +Pseudocode from [GPU Gems 3 Chapter 39 (Section 39.2.1)](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda) + +The primary hit to performance for the naive implementation could be the multiple accesses to global memory, additionally this implementation required buffers to swap between in order to prevent race conditions since this algorithm was not intended to be done in-place. + + +### Work Efficient +The Work Efficient Compaction utilized a parallel reduction up-sweep kernel and a down-sweep kernel as part of the sum, provided again from the book. + +I tried a few different methods for optimization. I originally implemented it according to the book and lecture slides keeping the kernels seperate. I also tried combining the two operations into a single kernel to reduce the time cost to the memory copy I currently do before performing the downsweep. The performance change that ended up making a difference was to reduce the number of blocks that are launched based on the number of threads needed for the current iteration of the loop, as well as using the threads needed for launch as a verification check on larger array sizes. + +The NSight Compute report seemed to suggest that the bottleneck was largely in computation usage for this algorithm. The memory overhead and number of writes and reads seemed similar to earlier implementations. The computation and branching considerations for scaling block sizes on kernel launch seemed to have the greatest impact on performance. +![](img/workeff_scan_compute.png) + +### Thrust +This was simply a call to the Thrust library to compare against our other implementations. The compute report suggested that the kernel thrust uses has high register usage and low occupancy requiring more hardware resources for a single thread. +![](img/thrust_compute.png) + +Unlike the other implementations, thrust seems to be using the cudaStreamSynchronize which is slowing the runtime. On average, the other implementations seem to be allocating less memory than the thrust implementation. For thrust the average runtime for the cudaMalloc API call is 34.4 microseconds, and for the other implementations the average is 29.93 microseconds. + + +All implementations except for thrust +Screenshot 2025-09-15 225852 +Thrust only +thrust_events + +### Optimization Analysis +I was able to use Nsight Systems as well as Nsight Compute to get a better look at the kernels and api calls. One thing it called out was that the grid size was too small and unoptimal for performance. This message was not super suprising since I was running smaller arrays. +image + + +### Final Output (for Size = 2 ^ 8) +``` +**************** +** SCAN TESTS ** +**************** + [ 44 31 15 33 34 41 21 45 49 9 38 49 30 ... 19 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 11.6663ms (std::chrono Measured) + [ 0 44 75 90 123 157 198 219 264 313 322 360 409 ... 102732619 102732638 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 17.8068ms (std::chrono Measured) + [ 0 44 75 90 123 157 198 219 264 313 322 360 409 ... 102732514 102732560 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 2.49738ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 2.09478ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.50678ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.06701ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 9.5096ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 7.08781ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 1 3 2 2 3 2 2 3 0 1 3 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 31.6045ms (std::chrono Measured) + [ 2 3 1 3 2 2 3 2 2 3 1 3 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 42.2566ms (std::chrono Measured) + [ 2 3 1 3 2 2 3 2 2 3 1 3 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 82.0855ms (std::chrono Measured) + [ 2 3 1 3 2 2 3 2 2 3 1 3 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.819008ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.769312ms (CUDA Measured) + passed +``` diff --git a/img/diagram_one.png b/img/diagram_one.png new file mode 100644 index 00000000..fb7b11f3 Binary files /dev/null and b/img/diagram_one.png differ diff --git a/img/diagram_two.png b/img/diagram_two.png new file mode 100644 index 00000000..ad799f32 Binary files /dev/null and b/img/diagram_two.png differ diff --git a/img/thrust_compute.png b/img/thrust_compute.png new file mode 100644 index 00000000..ae763e80 Binary files /dev/null and b/img/thrust_compute.png differ diff --git a/img/workeff_scan_compute.png b/img/workeff_scan_compute.png new file mode 100644 index 00000000..5703d637 Binary files /dev/null and b/img/workeff_scan_compute.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..6a022469 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 22;// feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -46,13 +46,14 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); @@ -60,12 +61,14 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ + zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); @@ -80,7 +83,9 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -94,7 +99,9 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - + + + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -147,6 +154,22 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + /* + zeroArray(SIZE, c); + printDesc("cpu radix sort, power-of-two"); + //StreamCompaction::CPU::cpuRadixSort(SIZE, c, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(SIZE, c, true); + printCmpLenResult(SIZE, SIZE, a, c); + + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::Naive::radixSort(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpLenResult(SIZE, SIZE, a, c); + */ + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..4c4da90a 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,17 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (idata[index] != 0) { + bools[index] = 1; + } + else { + bools[index] = 0; + } + } /** @@ -33,6 +44,15 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (bools[index] == 1) { + int idx = indices[index]; + odata[idx] = idata[index]; + } + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..ef38cd89 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,8 @@ #include "cpu.h" #include "common.h" +#include + namespace StreamCompaction { namespace CPU { @@ -12,6 +14,53 @@ namespace StreamCompaction { return timer; } + /* + // counting sort implementation + void count_sort(int arr[], int n, int pos) + { + + // we declare a count array and initialize the array by + // 0 + int count[10] = { 0 }; + + // we count the frequency of each distinct digit at + // given place for every element in the original array + for (int i = 0; i < n; i++) { + count[(arr[i] / pos) % 10]++; + } + + // we perform prefix sum and update the count array + for (int i = 1; i < 10; i++) { + count[i] = count[i] + count[i - 1]; + } + + // we store our answer in the ans array + int ans[n]; + for (int i = n - 1; i >= 0; i--) { + ans[--count[(arr[i] / pos) % 10]] = arr[i]; + } + + // here we copy the contents of ans array to our + // original array + for (int i = 0; i < n; i++) { + arr[i] = ans[i]; + } + } + */ + + //void cpuRadixSort(int n, int* odata, const int* idata) { + /* + void cpuRadixSort(int n, int arr[]) + { + // max_element() is a c++ stl function to find the + // maximum element from an array + int k = *std::max_element(arr, arr + n); + + for (int pos = 1; (k / pos) > 0; pos *= 10) { + //count_sort(arr, n, pos); + } + }*/ + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -20,6 +69,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for(int i = 1; i + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,100 @@ namespace StreamCompaction { return timer; } + + __global__ void kernDownSweepEfficientScan(int n, int d, int* odata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + int stride = 1 << (d + 1); + + int threadsNeeded = n >> (d + 1); + if (k >= threadsNeeded) return; + + + int baseIdx = stride * k; + int leftIdx = baseIdx + (1 << d) - 1; + int rightIdx = baseIdx + stride - 1; + + + if (rightIdx < n) { + int t = odata[leftIdx]; + + odata[leftIdx] = odata[rightIdx]; + odata[rightIdx] += t; + } + + } + + + __global__ void kernUpSweepEfficientScan(int n, int d, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int stride = 1 << (d + 1); + + int threadsNeeded = n >> (d + 1); + if (index >= threadsNeeded) return; + + int baseIdx = stride * index; + int leftIdx = baseIdx + (1 << d) - 1; + int rightIdx = baseIdx + stride - 1; + + if (rightIdx < n) { + odata[rightIdx] += odata[leftIdx]; + } + + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + int* d_data; + + int paddedN = 1 << ilog2ceil(n); + int logceilN = ilog2ceil(paddedN); + + cudaMalloc((void**)&d_data, paddedN * sizeof(int)); + cudaMemcpy(d_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + if (paddedN > n) { + cudaMemset(d_data + n, 0, (paddedN - n) * sizeof(int)); + } + // TODO + int blockSize = 256; + + timer().startGpuTimer(); + + + for (int d = 0; d < logceilN; ++d) { + int threadsNeeded = paddedN >> (d + 1); + if (threadsNeeded == 0) break; + + int numBlocksLevel = (threadsNeeded + blockSize - 1) / blockSize; + + kernUpSweepEfficientScan << > > (paddedN, d, d_data); + + } + + int setNMinusOne_ToZero = 0; + cudaMemcpy(d_data + paddedN - 1, &setNMinusOne_ToZero, sizeof(int), cudaMemcpyHostToDevice); + + + for (int d = logceilN - 1; d > -1; --d) { + int threadsNeeded = paddedN >> (d + 1); + if (threadsNeeded == 0) break; + + int numBlocksLevel = (threadsNeeded + blockSize - 1) / blockSize; + + kernDownSweepEfficientScan << > > (paddedN, d, d_data); + + } + timer().endGpuTimer(); + + cudaMemcpy(odata, d_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + + cudaFree(d_data); } /** @@ -31,10 +120,71 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); - return -1; + + int *count; + int *d_bools; + int *d_indices; + int *d_idata; + int* d_odata; + + + int paddedN = 1 << ilog2ceil(n); + + cudaMalloc((void**)&d_bools, paddedN * sizeof(int)); + cudaMalloc((void**)&d_indices, paddedN * sizeof(int)); + cudaMalloc((void**)&d_idata, paddedN * sizeof(int)); + cudaMalloc((void**)&d_odata, paddedN * sizeof(int)); + + cudaMemcpy(d_idata, idata, paddedN * sizeof(int), cudaMemcpyHostToDevice); + + // map to bools + int blockSize = 64; + int numBlocks = (paddedN + blockSize - 1) / blockSize; + + //timer().startGpuTimer(); + + StreamCompaction::Common::kernMapToBoolean << > > (paddedN, d_bools, d_idata); + cudaDeviceSynchronize(); + + int* tmp_bools = new int[paddedN]; + cudaMemcpy(tmp_bools, d_bools, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + + + int* tmp_indices = new int[paddedN]; + // scan bools into indices + scan(paddedN, tmp_indices, tmp_bools); + cudaDeviceSynchronize(); + + + // copy indices to device for scatter kernel + cudaMemcpy(d_indices, tmp_indices, paddedN * sizeof(int), cudaMemcpyHostToDevice); + + + // scatter + StreamCompaction::Common::kernScatter << > > (paddedN, d_odata, d_idata, d_bools, d_indices); + cudaDeviceSynchronize(); + + + int returnVal = 0; + for (int i = 0; i < n; i++) { + tmp_bools[i] == 1 ? returnVal++ : returnVal += 0; + } + + //timer().endGpuTimer(); + + cudaMemcpy(odata, d_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + + delete[] tmp_bools; + delete[] tmp_indices; + cudaFree(d_bools); + cudaFree(d_indices); + cudaFree(d_idata); + cudaFree(d_odata); + + + + return returnVal; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..8835e652 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#include + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,93 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + + __global__ void kernNaiveScan(int n, int d, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index >= n) return; + int val = 1 << (d-1); + + if (index >= val ) { + odata[index] = idata[index] + idata[index - val]; + } + else { + odata[index] = idata[index]; + } + + } + + __global__ void exclusiveShift(int n, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index >= n || index < 0) return; + + if (index == 0) odata[0] = idata[0]; + + odata[index] = idata[index - 1]; + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO + + int* d_A; + int* d_B; + + cudaMalloc((void**)&d_A, n * sizeof(int)); + cudaMalloc((void**)&d_B, n * sizeof(int)); + cudaMemcpy(d_A, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + + int* tmp_A; + int* tmp_B; + + timer().startGpuTimer(); + for(int d = 1; d <= ilog2ceil(n); d++) { + tmp_A = d % 2 == 1 ? d_A : d_B; + tmp_B = d % 2 == 1 ? d_B : d_A; + kernNaiveScan<<>>(n, d, tmp_B, tmp_A); + cudaDeviceSynchronize(); + + } timer().endGpuTimer(); + + cudaMemcpy(odata, tmp_B, n * sizeof(int), cudaMemcpyDeviceToHost); + + for (int i = n - 1; i > 0; i--) { + odata[i] = odata[i - 1]; + } + odata[0] = 0; + + + + cudaFree(d_A); + cudaFree(d_B); + } + + __global__ void kernRadixSort(int n, int* odata, const int* idata, int bit) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + int lsb = (idata[index] >> bit) & 1; + + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void radixSort(int n, int* odata, const int* idata) { + + // get the least significant bit + //int lsb = idata & 1; + + return; } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb064..11e11bd0 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..c145a73a 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + + thrust::device_vector dv(idata, idata + n); + thrust::exclusive_scan(dv.begin(), dv.end(), dv.begin()); + thrust::copy(dv.begin(), dv.end(), odata); + + timer().endGpuTimer(); + } } }