diff --git a/README.md b/README.md index 0e38ddb..35f2f3c 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,102 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xiaoxiao Zou + * [LinkedIn](https://www.linkedin.com/in/xiaoxiao-zou-23482a1b9/) +* Tested on: Windows 11, AMD Ryzen 9 7940HS @ 4.00 GHz, RTX 4060 Laptop -### (TODO: Your README) +### Implementations: +I implemented basic CPU scan & compact, GPU naive scan, GPU work efficient scan & compact, Thrust scan. In addition to those, I also implemented GPU work efficient scan Upgrade, GPU work efficient scan Upgrade with Shared Memory. + +The four basic implementations just followed the instructions from slide. + +GPU Work efficient: Benchmark with all modulo operations and multiply operations converted to bitwise operations (give fair amount of speedup). + +GPU Work efficient scan Upgrade: I calculated actual number of blocks will be needed will be needed for each round of up sweep and down sweep in order to reduce number of blocks (total number of threads) need to be launched each time. This gives around up to 5x speedup. + +GPU Work efficient scan Upgrade with Shared Memory: I used shared memory to do block-wise scan for each block, then, I do scan on the increments. At last, I add increments back to block. Here, I made a design choice for the scan on increments, for this scan, I use GPU Work efficient scan Upgrade method instead of GPU Work efficient scane Upgrade with shared memory. By implementing GPU Work efficient scan Upgrade with shared memory on increments array will result in recursive looping on increments array. (I tried to do it just by appending new increments array to old one). However, I found that actually slow the performance somehow due to the need to addition from new increments array to old arrays. I found just using simple GPU Work efficient scan Upgrade is not that bad. This overall give up to 16x speedup. + +Blocksize limitation: by doing shared memory, my block size will be limited to block size 64, (starting at 128, I think there is some memory conflict inside each block, which resulting in error). For other methods, blocksize does not influence performance that much starting at blocksize 32. (if block size too small, will slow down performance project 1) + +### Performance Analysis +![](img/p1.png) +![](img/p2.png) +![](img/p3.png) + +The one thing I noticed first is my CPU is way stronger than I thought. Only when it reachs array=2^24, it starts to show up slowdown on performance. But right after 2^28, my CPU is no longer compatible of doing this arithematics. + +For general GPU side performance, it starts to showing slowing down when it reachs 2^20. For thrust, it starts to slow down on 2^28. (I personally think it will 2^28 is the bottleneck, since at 2^29, 50ms implies 20fps and this only counts the calculation for scan not including those memory operations). My Work efficient method is not effiecient at all, however, the upgrade one gives fairly good opitimization compared to naive one. The one with upgrade SM gives fairly good optimization compared to upgrade especially at 2^28. + +Some potential opitimization: by observing thrust, I found there is some insufficient threads usage for my SM method. In upgrade method, there is a way to just not lauching the threads in kernel. However, for SM one, although I am only launching blocksize/2 threads for each block, but when they are sweeping, most time there is only part of threads are working in the block. I dont know is there any more wise way to use those threads (probably just do mutiple additions at once, like two or three layers all together when downsweep). Another opitimization I would think of, swapping is not essentially needed if there is a wise way to just caculated the index to do the computation. + +#### Output for arraysize=2^26 +``` + +**************** +** SCAN TESTS ** +**************** + [ 2 10 43 45 10 38 5 10 13 25 24 17 9 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 30.0245ms (std::chrono Measured) + [ 0 2 12 55 100 110 148 153 163 176 201 225 242 ... 1643506275 1643506308 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 41.5261ms (std::chrono Measured) + [ 0 2 12 55 100 110 148 153 163 176 201 225 242 ... 1643506220 1643506227 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 85.8092ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 82.2282ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 135.767ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 130.076ms (CUDA Measured) + passed +==== work-efficient scan upgrade, power-of-two ==== + elapsed time: 31.5261ms (CUDA Measured) + passed +==== work-efficient scan upgrade, non-power-of-two ==== + elapsed time: 31.4493ms (CUDA Measured) + passed +==== work-efficient scan upgrade with SM, power-of-two ==== + elapsed time: 11.6919ms (CUDA Measured) + passed +==== work-efficient scan upgrade with SM, non-power-of-two ==== + elapsed time: 12.0757ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 5.33914ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 5.62893ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 3 2 2 3 3 3 0 3 1 1 0 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 139.038ms (std::chrono Measured) + [ 3 3 2 2 3 3 3 3 1 1 2 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 129.736ms (std::chrono Measured) + [ 3 3 2 2 3 3 3 3 1 1 2 1 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 318.162ms (std::chrono Measured) + [ 3 3 2 2 3 3 3 3 1 1 2 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 42.9237ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 43.177ms (CUDA Measured) + passed +Press any key to continue . . . +``` -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/p1.png b/img/p1.png new file mode 100644 index 0000000..1d879f0 Binary files /dev/null and b/img/p1.png differ diff --git a/img/p2.png b/img/p2.png new file mode 100644 index 0000000..83e39d4 Binary files /dev/null and b/img/p2.png differ diff --git a/img/p3.png b/img/p3.png new file mode 100644 index 0000000..233b6e9 Binary files /dev/null and b/img/p3.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..52e9cdc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 29; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -69,13 +69,41 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::oldscan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::oldscan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan upgrade, power-of-two"); + StreamCompaction::Efficient::scanupgrade(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan upgrade, non-power-of-two"); + StreamCompaction::Efficient::scanupgrade(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan upgrade with SM, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan upgrade with SM, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..abf9a3d 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,16 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } } /** @@ -33,6 +43,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..750bc83 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 64 /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..dabdb04 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 0; i < n-1; i++) { + odata[i+1] = idata[i]+odata[i]; + } timer().endCpuTimer(); } @@ -31,8 +35,14 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int j = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[j++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return j; } /** @@ -43,8 +53,35 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + int* temp=(int*)malloc(n * sizeof(int)); + int* temp2 = (int*)malloc(n * sizeof(int)); + + //mapping + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + temp[i] = 0; + } + else { + temp[i] = 1; + } + } + //scan + temp2[0] = 0; + for (int i = 0; i < n - 1; i++) { + temp2[i + 1] = temp[i] + temp2[i]; + } + //scatter + for (int i = 0; i < n; i++) { + if (temp[i]==1) { + odata[temp2[i]] = idata[i]; + } + } + int cnt = temp2[n - 1]; + free(temp); + free(temp2); timer().endCpuTimer(); - return -1; + return cnt; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..64a3fd6 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,7 @@ #include "common.h" #include "efficient.h" +bool upgrade = true; namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -11,14 +12,207 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + __global__ void kernEffUpSweep(int n, int division, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (((index+1) &(division-1)) == 0) { + int div = index-(int)(division >> 1); + idata[index] += idata[div]; + } + } + + __global__ void kernEffDownSweep(int n, int division, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (((index + 1) & (division - 1)) == 0) { + int div = index-(int)(division >> 1); + int temp = idata[index]; + idata[index] += idata[div]; + idata[div] = temp; + } + } + __global__ void kernEffUpSweepNew(int n, int division,int iter, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n/division) { + return; + } + //int current = (index + 1) * division - 1; + int current= (int)((index + 1) <>1)]; + } + + __global__ void kernEffDownSweepNew(int n, int division, int iter, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n / division) { + return; + } + //int current = (index + 1) * division - 1; + int current = (int)((index + 1) <> 1); + int temp = idata[current]; + idata[current] += idata[div]; + idata[div] = temp; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + __global__ void kernSMWorkEfficient(int blockIter,int n, int* increments, int* idata) { + __shared__ int offset; + offset = (blockIdx.x * blockSize); + __syncthreads(); + + int thisIdx = (int)(threadIdx.x << 1); + __shared__ int TMshared[blockSize]; + TMshared[thisIdx] = idata[thisIdx + offset]; + TMshared[thisIdx +1] = idata[thisIdx +1 + offset]; + __syncthreads(); + int division = 2; + for (int i = 1; i < blockIter; i++) { + if (threadIdx.x < (int)(1 << (blockIter - i))) { + int current = (int)((threadIdx.x + 1) << i) - 1; + TMshared[current] += TMshared[current - (int)(division >> 1)]; + } + division = division << 1; + __syncthreads(); + } + TMshared[blockSize - 1]=0; + //__syncthreads(); + for (int i = blockIter; i >= 1; i--) { + // 1<<(blockIter -i-1) + if (threadIdx.x < (int)(1 << (blockIter - i))) { + int current = (int)((threadIdx.x + 1) << i) - 1; + int temp = TMshared[current]; + TMshared[current] += TMshared[current - (int)(division >> 1)]; + TMshared[current - (int)(division >> 1)] = temp; + } + division = division >> 1; + __syncthreads(); + } + + idata[thisIdx + offset] = TMshared[thisIdx + 1]; + idata[thisIdx +1+ offset] = (thisIdx + 1== blockSize -1) ? TMshared[thisIdx + 1] + idata[thisIdx + 1 + offset] : TMshared[thisIdx + 2]; + __syncthreads(); + increments[blockIdx.x]= idata[blockSize - 1 + offset]; + + } + + __global__ void kernSMAddition(int n, int* increments, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (blockIdx.x >= n) { + return; + } + __shared__ int incre; + incre= increments[blockIdx.x]; + idata[index] += incre; + } + + void oldscan(int n, int* odata, const int* idata) { + // TODO + + + dim3 threadsPerBlock(blockSize); + int numOfblock = (n + blockSize - 1) / blockSize; + int* buffer1; + + int iter = ilog2ceil(n); + int newsize = 1 << iter; + cudaMalloc((void**)&buffer1, newsize * sizeof(int)); + cudaMemset(buffer1, 0, newsize * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + for (int i = 2; i < newsize; i = i << 1) { + kernEffUpSweep << > > (newsize, i, buffer1); + } + cudaMemset(&buffer1[newsize - 1], 0, sizeof(int)); + for (int i = newsize; i >= 2; i = i >> 1) { + kernEffDownSweep << > > (newsize, i, buffer1); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + + + } + void scanupgrade(int n, int* odata, const int* idata) { + + dim3 threadsPerBlock(blockSize); + + int* buffer1; + + int iter = ilog2ceil(n); + int newsize = 1 << iter; + cudaMalloc((void**)&buffer1, newsize * sizeof(int)); + int numOfblock = (newsize + blockSize - 1) / blockSize; + cudaMemset(buffer1, 0, newsize * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + int idx = 1; + timer().startGpuTimer(); + for (int i = 2; i < newsize; i = i << 1) { + numOfblock = (newsize / i + blockSize - 1) / blockSize; + kernEffUpSweepNew << > > (newsize, i, idx++, buffer1); + } + cudaMemset(&buffer1[newsize - 1], 0, sizeof(int)); + for (int i = newsize; i >= 2; i = i >> 1) { + numOfblock = (newsize / i + blockSize - 1) / blockSize; + kernEffDownSweepNew << > > (newsize, i, idx--, buffer1); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + //cudaFree(buffer2); + + } + + void scan(int n, int* odata, const int* idata) { + dim3 threadsPerBlock(blockSize/2); + dim3 threadsPerBlockl(blockSize); + + int* buffer1; + int* increments; + int newnumblock; + int idx = 1; + + int blockIter = ilog2ceil(blockSize); + int numOfblock = (n + blockSize - 1) / blockSize; + int newsize = numOfblock * blockSize; + cudaMalloc((void**)&buffer1, newsize * sizeof(int)); + cudaMalloc((void**)&increments, numOfblock * sizeof(int)); + + cudaMemset(buffer1, 0, newsize * sizeof(int)); + //cudaMemset(increments, 0, numOfblock * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + numOfblock = (n + blockSize - 1) / blockSize; + + timer().startGpuTimer(); + kernSMWorkEfficient << > > (blockIter, newsize, increments, buffer1); + + for (int i = 2; i < numOfblock; i = i << 1) { + newnumblock = (numOfblock / i + blockSize - 1) / blockSize; + kernEffUpSweepNew << > > (numOfblock, i, idx++, increments); + } + cudaMemset(&increments[numOfblock - 1], 0, sizeof(int)); + for (int i = numOfblock; i >= 2; i = i >> 1) { + newnumblock = (newsize / i + blockSize - 1) / blockSize; + kernEffDownSweepNew << > > (numOfblock, i, idx--, increments); + } + kernSMAddition << > > (numOfblock, increments, buffer1); timer().endGpuTimer(); + + odata[0] = 0; + cudaMemcpy(&odata[1], buffer1, (n-1) * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&(odata[n]), increments, (2) * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); } /** @@ -31,10 +225,100 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); - return -1; + if (upgrade) { + dim3 threadsPerBlock(blockSize); + + int* buffer1; + int* bools; + int* indices; + + int iter = ilog2ceil(n); + int newsize = 1 << iter; + cudaMalloc((void**)&buffer1, newsize * sizeof(int)); + cudaMalloc((void**)&bools, newsize * sizeof(int)); + cudaMalloc((void**)&indices, newsize * sizeof(int)); + cudaMemset(buffer1, 0, newsize * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + int* outbuffer; + cudaMalloc((void**)&outbuffer, n * sizeof(int)); + int idx = 1; + int numOfblock = (newsize + blockSize - 1) / blockSize; + timer().startGpuTimer(); + Common::kernMapToBoolean << > > (newsize, bools, buffer1); + cudaMemcpy(indices, bools, newsize * sizeof(int), cudaMemcpyDeviceToDevice); + + for (int i = 2; i < newsize; i = i << 1) { + numOfblock = (newsize / i + blockSize - 1) / blockSize; + kernEffUpSweepNew << > > (newsize, i, idx++,indices); + } + cudaMemset(&indices[newsize - 1], 0, sizeof(int)); + for (int i = newsize; i >= 2; i = i >> 1) { + numOfblock = (newsize / i + blockSize - 1) / blockSize; + kernEffDownSweepNew << > > (newsize, i, idx--, indices); + } + numOfblock = (newsize + blockSize - 1) / blockSize; + //(int n, int *odata,const int* idata, const int* bools, const int* indices) + Common::kernScatter << > > (newsize, outbuffer, buffer1, bools, indices); + + timer().endGpuTimer(); + + int outputsize; + cudaMemcpy(&outputsize, &indices[newsize - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, outbuffer, outputsize * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + cudaFree(bools); + cudaFree(indices); + cudaFree(outbuffer); + + return outputsize; + } + else { + dim3 threadsPerBlock(blockSize); + int numOfblock = (n + blockSize - 1) / blockSize; + int* buffer1; + int* bools; + int* indices; + + int iter = ilog2ceil(n); + int newsize = 1<> > (newsize, bools, buffer1); + cudaMemcpy(indices, bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + + for (int i = 2; i > > (newsize, i, indices); + } + cudaMemset(&indices[newsize - 1], 0, sizeof(int)); + for (int i = newsize; i >= 2; i = i >> 1) { + kernEffDownSweep << > > (newsize, i, indices); + } + + //(int n, int *odata,const int* idata, const int* bools, const int* indices) + Common::kernScatter << > > (newsize, outbuffer,buffer1,bools,indices); + + timer().endGpuTimer(); + + int outputsize; + cudaMemcpy(&outputsize, &indices[newsize - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, outbuffer, outputsize * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + cudaFree(bools); + cudaFree(indices); + cudaFree(outbuffer); + + return outputsize; + + } } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..d5caf9b 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,8 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void oldscan(int n, int* odata, const int* idata); + void scanupgrade(int n, int* odata, const int* idata); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..830147f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,7 @@ #include "common.h" #include "naive.h" + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +13,49 @@ namespace StreamCompaction { return timer; } // TODO: __global__ - + __global__ void kernNaiveScan(int n,int size, int* odata, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (index >= size) { + odata[index] = idata[index - size] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + int numOfblock = (n + blockSize - 1) / blockSize; + dim3 threadsPerBlock(blockSize); + + int *buffer1; + int *buffer2; + int zerobuffer=0; + cudaMalloc((void**)&buffer1, n * sizeof(int)); + cudaMalloc((void**)&buffer2, n * sizeof(int)); + + cudaMemcpy(&(buffer1[1]), idata, (n-1) * sizeof(int),cudaMemcpyHostToDevice); + cudaMemcpy(buffer1, &zerobuffer, sizeof(int), cudaMemcpyHostToDevice); + int iter = ilog2ceil(n); + const int size = 1 << iter; + timer().startGpuTimer(); + for (int i = 1; i >> (n,i,buffer2,buffer1); + int *temp = buffer2; + buffer2 = buffer1; + buffer1 = temp; + } timer().endGpuTimer(); + cudaMemcpy(odata, buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + cudaFree(buffer2); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..ff0f1ed 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector dev_thrust_in(idata, idata + n); + thrust::device_vector dev_thrust_out(n); + timer().startGpuTimer(); + thrust::exclusive_scan(dev_thrust_in.begin(), dev_thrust_in.end(), dev_thrust_out.begin()); timer().endGpuTimer(); + int* dev_out = thrust::raw_pointer_cast(dev_thrust_out.data()); + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + //cudaFree(dev_out); } } }