From b6fd5c3dfdd9215fb0f5ef7b5e62f0a151e4bedb Mon Sep 17 00:00:00 2001 From: DeltaCube23 Date: Tue, 2 Jun 2020 12:26:53 +0530 Subject: [PATCH] assignment 5 added --- 96447859_adithya/blelloch.cu | 122 ++++++++++++++++++++++++++++++ 96447859_adithya/hillis-steele.cu | 64 ++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 96447859_adithya/blelloch.cu create mode 100644 96447859_adithya/hillis-steele.cu diff --git a/96447859_adithya/blelloch.cu b/96447859_adithya/blelloch.cu new file mode 100644 index 0000000..4ff139f --- /dev/null +++ b/96447859_adithya/blelloch.cu @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +using namespace std; +#define ll long long int +const int Block_Size = 1024; + +// This GPU kernel does blockwise in-place scan +__global__ void Blelloch_Exclusive_Scan(ll *d_in, ll* d_out) +{ + __shared__ ll sh_array[Block_Size]; + int id = blockIdx.x * blockDim.x + threadIdx.x; + int tid = threadIdx.x; + int bid = blockIdx.x; + // Copying data from global to shared memory + sh_array[tid] = d_in[id]; + __syncthreads(); + /** Performing block-wise in-place Blelloch scan **/ + // First step of Blelloch scan : REDUCTION + for(int k=2; k <= Block_Size; k *= 2) + { + if((tid+1) % k == 0) + { + sh_array[tid] = max(sh_array[tid - (k/2)], sh_array[tid]); + } + __syncthreads(); + } + // At the end of reduction, the last element of each block conatins the sum of all elements in that block + // We store these block-wise sums in d_out + if(tid == (Block_Size - 1)) + { + d_out[bid] = sh_array[tid]; + sh_array[tid] = 0; + } + __syncthreads(); + + // Second step of Blelloch scan : DOWNSWEEP + // This is structurally the exact reverse of the reduction step + for(int k = Block_Size; k >= 2; k /= 2) + { + if((tid+1) % k == 0) + { + ll temp = sh_array[tid - (k/2)]; + sh_array[tid - (k/2)] = sh_array[tid]; + sh_array[tid] = max(temp, sh_array[tid]); + } + __syncthreads(); + } + // Copying the scan result back into global memory + d_in[id] = sh_array[tid]; + // d_in now contains blockwise scan result + __syncthreads(); +} + +// This GPU kernel adds the value d_out[id] to all values in the (id)th block of d_in +__global__ void Max(ll* d_in, ll* d_out) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + int bid = blockIdx.x; + d_in[id] = max(d_out[bid], d_in[id]); + __syncthreads(); +} + +int main() +{ + ll *h_in, *h_scan; + int Size; + cout << "Enter size of the array.\n"; + cin >> Size; + //append extra element to cover up for exclusive scan + Size=Size+1; + int Reduced_Size = (int)ceil(1.0*Size/Block_Size); + int Array_Bytes = Size * sizeof(ll); + int Reduced_Array_Bytes = Reduced_Size * sizeof(ll); + h_in = (ll*)malloc(Array_Bytes); + h_scan = (ll*)malloc(Array_Bytes); + // Populating array with random numbers + srand(time(0)); + for(ll i=0; i>> (d_in, d_out); + + // After first kernel call, d_in has the blockwise scan results and d_out is an auxiliary array that has the blockwise max + // Second kernel call is done to scan the blockwise max array + // Then the ith value in the resultant scanned blockwise max array is checked with every value in the ith block + // This addition step is done in the Max() kernel + // This is required only if size of the array is greater than the block size + if(Size > Block_Size) + { + Blelloch_Exclusive_Scan <<< 1, Block_Size >>> (d_out, d_max); + Max <<< Reduced_Size, Block_Size >>> (d_in, d_out); + } + + // Copying the result back to the CPU + cudaMemcpy(h_scan, d_in, Array_Bytes, cudaMemcpyDeviceToHost); + cudaFree(d_in); + cudaFree(d_out); + cout << "Exclusive Scan Array : \n"; + for(ll i=1; i +#include +#include +#include +#include +#include +using namespace std; +#define ll long long int +__global__ void Inclusive_Scan(ll *d_in, ll* d_out, ll Size, ll i) +{ + ll id = blockIdx.x * blockDim.x + threadIdx.x; + ll step = 1 << i; + if(id < Size) + { + if(id >= step) + { + if(d_in[id]> Size; + ll Array_Bytes = Size * sizeof(ll); + h_in = (ll*)malloc(Array_Bytes); + h_out = (ll*)malloc(Array_Bytes); + + // Populating input array with random numbers + srand(time(0)); + for(ll i=0; i>> (d_in, d_out, Size, i); + cudaMemcpy(d_in, d_out, Array_Bytes, cudaMemcpyDeviceToDevice); + } + cudaMemcpy(h_out, d_out, Array_Bytes, cudaMemcpyDeviceToHost); + cudaFree(d_in); + cudaFree(d_out); + cout << "Inclusive Scan Array : \n"; + for(ll i=0; i