diff --git a/README.md b/README.md index 0e38ddb1..735e2ed2 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,88 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, +Project 2 - CUDA Stream Compaction** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Pavel Peev + * [LinkedIn](https://www.linkedin.com/in/pavel-peev-5568561b9/), [personal website](www.cartaphil.com) +* Tested on: Windows 11, i7-1270, NVIDIA T1000 -### (TODO: Your README) +### Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Comparison of implementations of exclusive scans (and implementations of compact for the CPU and the work effient scan), comparing a basic CPU implementation, a naive CUDA implementation which adds an element stride spaces to the right over log2(n) kernal calls, the work efficient implmentation using an upsweep and downsweep stage to create a balanced binary tree, and thrust's own implementation of an exclusive scan. + +### Sample Output +Below is an example of the output from running the test with array size 2^28 +``` +**************** +** SCAN TESTS ** +**************** + [ 45 15 18 24 10 15 38 38 47 12 8 39 45 ... 11 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 421.69ms (std::chrono Measured) + [ 0 45 60 78 102 112 127 165 203 250 262 270 309 ... -2015394307 -2015394296 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 419.546ms (std::chrono Measured) + [ 0 45 60 78 102 112 127 165 203 250 262 270 309 ... -2015394410 -2015394369 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 520.167ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 519.373ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 392.432ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 392.508ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 16.1508ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 17.0412ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 2 1 3 0 2 1 0 0 0 3 3 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 570.409ms (std::chrono Measured) + [ 3 2 1 3 2 1 3 3 2 2 2 3 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 576.547ms (std::chrono Measured) + [ 3 2 1 3 2 1 3 3 2 2 2 3 2 ... 1 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 1481.78ms (std::chrono Measured) + [ 3 2 1 3 2 1 3 3 2 2 2 3 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 2605.5ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 2604.63ms (CUDA Measured) + passed +``` +### Performance + +Line Chart + + + +### Performance Analysis +Blocks of size 128 were used, as that seemed to perform the best across all the algorithms. + +The efficient implementation performs marginally better than it's naive and cpu counterparts.The naive implementation performed the worst, which makes sense do to it's nlog2(n) complexity and the high cost for invoking kernal calls, making it slower than a regular CPU implemtation (and potentially the use of the older NVIDIA T1000 GPU). The thrust implementation performs significantly better than the others, showing just how much the exclusive scan algorithm can be optimized. + + +Analyzing the kernal invocations within NSight Compute (shown below), it becomes clear that the efficient implementation can be improved upon. Both the upsweep and downsweep have low memory throughput. Using shared memory, we can load the memory coherently for each of the blocks, which should reduce the amount of memory loads needed with the current algorithm. + +image + +I also use a modulo operator within the kernal, which is a well known expensive operation which could be replaced. Also, it is very likely that the warps diverge, with some warps having only 1 thread doing any work. With some clever indexing, this could be optimized so that all the working threads fall into the same warps, allowing for the warps with non working threads to retire early. diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..ae898a69 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 28; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..7db26fed 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + //Note, exclusive scan + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +37,22 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + //Points to the next spot for inserting into odata + int oDataIndex = 0; + + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[oDataIndex] = idata[i]; + oDataIndex++; + } + } + + timer().endCpuTimer(); - return -1; + return oDataIndex; } /** @@ -43,8 +63,46 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + //Step 1: create bool array + int* boolArray = new int[n] {-1}; + for (int i = 0; i < n; i++) + { + if (idata[i] == 0) + { + boolArray[i] = 0; + } + else + { + boolArray[i] = 1; + } + } + //Step 2: Scan bool array + //Note: not using scan since it's being timed with the same timer. + int* scanArray = new int[n]; + scanArray[0] = 0; + for (int i = 1; i < n; i++) + { + scanArray[i] = scanArray[i - 1] + boolArray[i - 1]; + } + + //Step 3: Scatter + int count = 0; + for (int i = 0; i < n; i++) + { + if (boolArray[i] == 1) + { + count++; + odata[scanArray[i]] = idata[i]; + } + } + + + + delete[] boolArray; + delete[] scanArray; timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..720646fa 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -11,14 +11,142 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + + __global__ void kernUpSweep(int n, int* data, int d) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) + { + //Back when d was 0, 1, 2, 3... + /* + if (index % (int)powf(2, d + 1) == 0) + { + data[index + (int)powf(2, d + 1) - 1] += data[index + (int)powf(2, d) - 1]; + + } + */ + if (index % (d * 2) == 0) + { + data[index + d * 2 - 1] += data[index + d - 1]; + + } + + } + } + __global__ void kernDownSweep(int n, int* data, int d) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) + { + //Back when d was 0, 1, 2, 3... + /* + if (index % (int)powf(2, d + 1) == 0) + { + int t = data[index + (int)powf(2, d) - 1]; + data[index + (int)powf(2, d) - 1] = data[index + (int)powf(2, d + 1) - 1]; + data[index + (int)powf(2, d + 1) - 1] += t; + } + */ + if (index % (d * 2) == 0) + { + int t = data[index + d - 1]; + data[index + d - 1] = data[index + d * 2 - 1]; + data[index + d * 2 - 1] += t; + } + } + + } + + __global__ void kernMapToBoolean(int n, int* oData, const int* iData) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) + { + int val = iData[index]; + if (val != 0) + { + oData[index] = 1; + } + } + } + __global__ void kernScatter(int n, int* oData, const int* iData, const int* boolArray, const int* scannedArray) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) + { + if (boolArray[index]) + { + oData[scannedArray[index]] = iData[index]; + } + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + void scan(int n, int *odata, const int *idata, bool forCompact) { + // TODO - timer().endGpuTimer(); + + int* dev_arr; + + int log2Ceil = ilog2ceil(n); + int arraySize = 1 << log2Ceil; + + cudaMalloc((void**)&dev_arr, sizeof(int) * arraySize); + checkCUDAError("Bad Malloc"); + + //Make sure that the array is filled with 0s to start with + cudaMemset(dev_arr, 0, sizeof(int) * arraySize); + checkCUDAError("Bad memset"); + if (!forCompact) + { + cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + } + else + { + cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyDeviceToDevice); + } + checkCUDAError("Bad copy of initial data"); + + int threadsPerBlock = 128; + dim3 totalBlocks((arraySize + threadsPerBlock - 1) / threadsPerBlock); + if (!forCompact) + { + timer().startGpuTimer(); + } + for (int d = 1; d < arraySize; d *= 2) + { + kernUpSweep << > > (arraySize, dev_arr, d); + //checkCUDAError("up sweep failure"); + } + cudaMemset(dev_arr + (arraySize - 1), 0, sizeof(int)); + //checkCUDAError("Bad zeroing of last element"); + + for (int d = arraySize / 2; d > 0; d /= 2) + { + kernDownSweep << > > (arraySize, dev_arr, d); + //checkCUDAError("down sweep failure"); + } + + cudaDeviceSynchronize(); + if (!forCompact) + { + timer().endGpuTimer(); + cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); + + } + else + { + cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToDevice); + } + checkCUDAError("memcpy output failure"); + + + cudaFree(dev_arr); + cudaDeviceSynchronize(); + } /** @@ -31,10 +159,53 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + + int* dev_in; + int* dev_out; + int* dev_boolArray; + int* dev_scannedArray; + + cudaMalloc((void**)&dev_in, sizeof(int) * n); + cudaMalloc((void**)&dev_boolArray, sizeof(int) * n); + cudaMalloc((void**)&dev_scannedArray, sizeof(int) * n); + cudaMalloc((void**)&dev_out, sizeof(int) * n); + checkCUDAError("Bad Malloc"); + + + cudaMemset(dev_boolArray, 0, sizeof(int) * n); + checkCUDAError("Bad Memset"); + cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Bad Memcpy"); + + int threadsPerBlock = 128; + dim3 totalBlocks((n + threadsPerBlock - 1) / threadsPerBlock); + timer().startGpuTimer(); - // TODO + kernMapToBoolean << > > (n, dev_boolArray, dev_in); + + + scan(n, dev_scannedArray, dev_boolArray, true); + + + + int count; + int lastElement; + cudaMemcpy(&count, dev_scannedArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastElement, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + count += lastElement; + + kernScatter << > > (n, dev_out, dev_in, dev_boolArray, dev_scannedArray); timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_out, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); + cudaFree(dev_boolArray); + cudaFree(dev_scannedArray); + + + return count; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..8b77b78e 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool forCompact = false); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..ec41974d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,82 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + + + // TODO: __global__ + __global__ void kernNaiveScan(int n, int* odata, const int* idata, int stride) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) + { + + if (index >= (1 << stride)) + { + odata[index] = idata[index] + idata[index - (1 << stride)]; + } + else + { + odata[index] = idata[index]; + } + } + } + + __global__ void kernInclusiveToExclusive(int n, int* odata, const int* idata) + { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index == 0) + { + odata[0] = 0; + } + else if (index < n) + { + odata[index] = idata[index - 1]; + } + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + //Starting Input + int *dev_arrA; + //Starting Output + int *dev_arrB; + + cudaMalloc((void**)&dev_arrA, sizeof(int) * n); + cudaMalloc((void**)&dev_arrB, sizeof(int) * n); + + cudaMemcpy(dev_arrA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + + int threadsPerBlock = 128; + dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock); + + int log2Ceil = ilog2ceil(n); + + timer().startGpuTimer(); + for (int i = 0; i < log2Ceil; i++) + { + kernNaiveScan << > > (n, dev_arrB, dev_arrA, i); + std::swap(dev_arrA, dev_arrB); + } + + kernInclusiveToExclusive <<>> (n, dev_arrB, dev_arrA); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_arrB, sizeof(int) * n, cudaMemcpyDeviceToHost); + + + + cudaFree(dev_arrA); + cudaFree(dev_arrB); + + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..63eb32ae 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + //thrust::device_vector dev_in(idata, n); + thrust::device_vector dev_in(idata, idata + n); + thrust::device_vector dev_out(n); + timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dev_out.begin(), dev_out.end(), odata); } } }