diff --git a/README.md b/README.md index 0e38ddb..b519861 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,37 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Han Wang -### (TODO: Your README) +Tested on: Windows 11, 11th Gen Intel(R) Core(TM) i9-11900H @ 2.50GHz 22GB, GTX 3070 Laptop GPU -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Analysis +**Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.** +**(You shouldn't compare unoptimized implementations to each other!) +Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).** + +![Unlock FPS](img/graph1.png) + +Based on my observation in my text, there are three phenomenons: +1. The block size seems to not influence the output that much. +2. The naive gpu approach is slower than the efficient approach. +3. Though I didn't plot out the output of the CPU scan, the CPU operation seems to be actually faster than the GPU operation. + +The first phenomenon seems not really reasonable. I expect that with more block size, we can improve the run time speed in the parallel operation. But I think I need to keep the truth I just post the real output. The second phenomenon seems reasonable. The book explained the high-speed algorithm. The third phenomenon might be because the GPU approach might spend more time accessing the shared memory compared to the CPU's fast access. For the current data size, the benefit of the parallel operation cannot cover the loss of that. + + +**Don't mix up CpuTimer and GpuTimer. +To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. +Write a brief explanation of the phenomena you see here.** + +The observation is that the thrust implementation is slower than the GPU and CPU approach. Based on my knowledge, I think it is more possible that the thrust implementation might spend large amounts of time on memory I/O operation. I trust that the algorithm of the thrust might be fast and reliable. + + + +**Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? +Paste the output of the test program into a triple-backtick block in your README.** + +Because our time check excludes the init and end memory operation, based on my observation and my hypothesis, I think that the memory I/O computation might be there but didn't actually influence me. At least I didn't observe the performance bottlenecks. + +![Unlock FPS](img/output.png) diff --git a/img/graph1.png b/img/graph1.png new file mode 100644 index 0000000..04c758b Binary files /dev/null and b/img/graph1.png differ diff --git a/img/output.png b/img/output.png new file mode 100644 index 0000000..d60c87b Binary files /dev/null and b/img/output.png differ diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..410061c 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,20 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + if (idata[idx] > 0) { + bools[idx] = 1; + } + else{ + bools[idx] = 0; + } + + + + } /** @@ -33,6 +47,16 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + if (bools[idx] > 0) { + odata[indices[idx]] = idata[idx]; + } + + + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..13cce18 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,9 +18,17 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // TODO - timer().endCpuTimer(); + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + + } + + + + } /** @@ -31,8 +39,17 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int output = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[output] = idata[i]; + output += 1; + } + } + + timer().endCpuTimer(); - return -1; + return output; } /** @@ -41,10 +58,43 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + + // TODO + + int* checked = new int[n]; + int* preCheck = new int[n]; + int counter = 0; + + timer().startCpuTimer(); + + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + preCheck[i] = 1; + counter += 1; + } + else { + preCheck[i] = 0; + } + } + + scan(n, checked, preCheck); + + for (int i = 0; i < n; i++) { + if (preCheck[i]==1) { + odata[checked[i]] = idata[i]; + } + + } timer().endCpuTimer(); - return -1; + + delete[] checked; + delete[] preCheck; + + + return counter; + + } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..a306d2a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,7 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -11,14 +12,96 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + __global__ void upSweep(int n, int base, int* idata) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + int k = idx * (1 << base + 1); + if (k >= n) { + return; + } + + idata[k + (1 << base + 1) - 1] += idata[k + (1 << base) - 1]; + } + // referemce to book page algorithm 4 + __global__ void downSweep(int n, int base, int* idata) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx >= n) { + return; + } + int k = idx * (1 << base + 1); + if (k >= n) { + return; + } + + int t = idata[k + (1 << base) - 1]; + idata[k + (1 << base) - 1] = idata[k + (1 << base + 1) - 1]; + idata[k + (1 << base + 1) - 1] += t; + + } + + void processScan(int n, int ending, int* gpu_idata) { + + + for (int i = 0; i < ilog2ceil(n); i++) { + + dim3 fullBlocksPerGrid((ending / (1 << (i + 1)) + blockSize - 1) / blockSize); + upSweep <<>> (n, i, gpu_idata); + } + cudaMemset(&gpu_idata[ending - 1], 0, sizeof(int)); + checkCUDAError("error in loop 0"); + + for (int i = ilog2ceil(n) -1; i>=0; i--) { + + dim3 fullBlocksPerGrid((ending / (1 << (i + 1)) + blockSize - 1) / blockSize); + downSweep <<>> (n, i, gpu_idata); + + + } + checkCUDAError("error in loop 0111"); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + int* gpu_odataa; + int* gpu_idataa; + int ending = 1 << ilog2ceil(n); + + cudaMalloc((void**)&gpu_odataa, ending * sizeof(int)); + cudaMalloc((void**)&gpu_idataa, ending * sizeof(int)); + checkCUDAError("memory error 0101!!!!!"); + cudaMemset(gpu_odataa, 0, ending * sizeof(int)); + checkCUDAError("memory error 0102!!!!!"); + cudaMemset(gpu_idataa, 0, ending * sizeof(int)); + checkCUDAError("memory error 0103!!!!!"); + cudaMemcpy(gpu_idataa, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + checkCUDAError("memory error 01!!!!!"); + + + timer().startGpuTimer(); + processScan(n, ending, gpu_idataa); timer().endGpuTimer(); + + + checkCUDAError("error in loop final process!!!!!"); + int* temp = gpu_odataa; + gpu_odataa = gpu_idataa; + gpu_idataa = temp; + + cudaMemcpy(odata, gpu_odataa, sizeof(int) * n, cudaMemcpyDeviceToHost); + + checkCUDAError("memory error 02!!!!!"); + + cudaFree(gpu_odataa); + cudaFree(gpu_idataa); + + } /** @@ -31,10 +114,65 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + //timer().startGpuTimer(); + + + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + //dim3 numBlocks((n - 1 + blockSize - 1) / blockSize); + + int* gpu_odata; + int* gpu_idata; + + int ending = 1 << ilog2ceil(n); + int* gpu_bool; + int* gup_sum; + + + cudaMalloc((void**)&gpu_odata, n * sizeof(int)); + cudaMalloc((void**)&gpu_idata, n * sizeof(int)); + cudaMalloc((void**)&gpu_bool, n * sizeof(int)); + cudaMalloc((void**)&gup_sum, n * sizeof(int)); + + cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + checkCUDAError("memory error 01!!!!!"); + + Common::kernMapToBoolean << > > (n, gpu_bool, gpu_idata); + + scan(n, gup_sum, gpu_bool); + + checkCUDAError("memory error 02!!!!!"); + + Common::kernScatter << > > (n, gpu_odata, gpu_idata, gpu_bool, gup_sum); + + + + // TODO - timer().endGpuTimer(); - return -1; + //timer().endGpuTimer(); + + int counter = -1; + + + cudaMemcpy(odata, gpu_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaMemcpy(&counter, &gup_sum[n-1], sizeof(int), cudaMemcpyDeviceToHost); + if (idata[n - 1] != 0) { + counter += 1; + } + + + checkCUDAError("memory error 023!!!!!"); + cudaFree(gpu_odata); + cudaFree(gpu_idata); + + cudaFree(gpu_bool); + cudaFree(gup_sum); + + + + return counter; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..6c0b86e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,9 @@ #include "common.h" #include "naive.h" + + +#define blockSize 8 namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +15,98 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void scanOperation(int n, int base, int* odata, int* idata) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if(idx >= n){ + return; + + } + + int curr_pos = 1 << (base-1); + + if (idx < curr_pos) { + + odata[idx] = idata[idx]; + } + else { + odata[idx] = idata[idx - curr_pos] + idata[idx]; + + } + + + + + } + + __global__ void convertToExclusive(int n, int* odata, int* idata) { + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (idx >= n) { + return; + + } + if (idx == 0) { + + odata[idx] = 0; + return; + } + + odata[idx] = idata[idx - 1]; + + + + } + + + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + int* gpu_odata; + int* gpu_idata; + + cudaMalloc((void**)&gpu_odata, n * sizeof(int)); + cudaMalloc((void**)&gpu_idata, n * sizeof(int)); + cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + checkCUDAError("memory error!!!!!"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + dim3 numBlocks((n -1 + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + for (int i = 1; i <= ilog2ceil(n); i++) { + scanOperation <<>> (n, i, gpu_odata, gpu_idata); + + checkCUDAError("error in loop!!!!!"); + int* temp = gpu_odata; + gpu_odata = gpu_idata; + gpu_idata = temp; + + + } + + + convertToExclusive << > > (n, gpu_odata, gpu_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, gpu_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + checkCUDAError("memory error!!!!!"); + + cudaFree(gpu_odata); + cudaFree(gpu_idata); + + + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..9204f59 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,33 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + int* gpu_odata; + int* gpu_idata; + + cudaMalloc((void**)&gpu_odata, n * sizeof(int)); + cudaMalloc((void**)&gpu_idata, n * sizeof(int)); + cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + thrust::device_ptrthrust_dv_in(gpu_idata); + thrust::device_ptrthrust_dv_out(gpu_odata); + + timer().startGpuTimer(); + thrust::exclusive_scan(thrust_dv_in, thrust_dv_in+n, thrust_dv_out); + timer().endGpuTimer(); + + + cudaMemcpy(odata, gpu_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(gpu_odata); + cudaFree(gpu_idata); + + + } } }