diff --git a/96224494_Sherwin/assignment4_a.cu b/96224494_Sherwin/assignment4_a.cu new file mode 100644 index 0000000..e44e19c --- /dev/null +++ b/96224494_Sherwin/assignment4_a.cu @@ -0,0 +1,153 @@ +#include +#include +using namespace std; +__global__ void Array_max(int* d_out, int* d_array, int Size) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + int tid = threadIdx.x; + int bid = blockIdx.x; + __shared__ int sh_array[1024]; + // Shared memory that is exclusive for a block. + // An array of size 1024 declared for common access to all the threads in a block + // Each block has its own shared memory + + // Copy data from global to shared memory + if(id < Size) + sh_array[tid] = d_array[id]; + __syncthreads(); + + // Perform parallel reduction in shared memory + for(int s = 512; s>0; s = s/2) + { + __syncthreads(); + if(id>=Size || id+s>=Size) + continue; + if(tid0; s = s/2) + { + __syncthreads(); + if(id>=Size || id+s>=Size) + continue; + if(tid sh_array[tid + s]) + sh_array[tid]= sh_array[tid + s]; + } + // Each iteration reduces size of active array by half + } + __syncthreads(); + // Only thread 0 of each block writes back the result of that block into global memory + if(tid==0) + d_out[bid] = sh_array[tid]; +} +int Find_max_GPU(int h_array[], int Size) +{ + int* d_array, *d_out, *d_sum; + cudaMalloc((void**)&d_array, Size*sizeof(int)); + cudaMalloc((void**)&d_out, ceil(Size*1.0/1024)*sizeof(int)); + cudaMalloc((void**)&d_sum, sizeof(int)); + cudaMemcpy(d_array, h_array, sizeof(int) * Size, cudaMemcpyHostToDevice); + int h_sum; + Array_max <<>> (d_out, d_array, Size); + Array_max <<<1, 1024>>> (d_sum, d_out, ceil(Size*1.0/1024)); + cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_array); + cudaFree(d_out); + cudaFree(d_sum); + return h_sum; +} + +int Find_min_GPU(int h_array[], int Size) +{ + int* d_array, *d_out, *d_sum; + cudaMalloc((void**)&d_array, Size*sizeof(int)); + cudaMalloc((void**)&d_out, ceil(Size*1.0/1024)*sizeof(int)); + cudaMalloc((void**)&d_sum, sizeof(int)); + cudaMemcpy(d_array, h_array, sizeof(int) * Size, cudaMemcpyHostToDevice); + int h_sum; + Array_min <<>> (d_out, d_array, Size); + Array_min <<<1, 1024>>> (d_sum, d_out, ceil(Size*1.0/1024)); + cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_array); + cudaFree(d_out); + cudaFree(d_sum); + return h_sum; +} + + + +int Find_min_CPU(int h_array[], int Size) +{ + int naive_min = h_array[0] ; + for(int i=0; ih_array[i+1]) + naive_min=h_array[i+1]; + } + return naive_min; +} + + + +int Find_max_CPU(int h_array[], int Size) +{ + int naive_max = h_array[0]; + for(int i=0; i + +__global__ void mulArray(int* d_a,int* d_b, int* d_c,int size) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i 0; s = s/2) + { + __syncthreads(); + if(id>=Size || id+s>=Size) + continue; + if(tid>> (d_out, d_array, Size); + Array_Add <<<1, 1024>>> (d_sum, d_out, ceil(Size*1.0/1024)); + cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_array); + cudaFree(d_out); + cudaFree(d_sum); + return h_sum; +} + + + + + + +int main() +{ + int size; + printf("enter array size"); + scanf("%d",&size); + + + int h_a[size],h_b[size],h_c[size]; + int Array_Bytes = size* sizeof(int); + for(int i=0; i>>(d_a,d_b,d_c,size); + // Copy the resulting array from GPU (d_out) to the CPU (h_out) + cudaMemcpy(h_c, d_c, Array_Bytes, cudaMemcpyDeviceToHost); + int h_sum = Find_Sum_GPU(h_c, size); + printf("dot product sum is %d",h_sum); + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +} diff --git a/96224494_Sherwin/asssignment_1.cu b/96224494_Sherwin/asssignment_1.cu deleted file mode 100644 index 799c8bc..0000000 --- a/96224494_Sherwin/asssignment_1.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -using namespace std; - -__global__ void AddArray(int* d_a,int* d_b, int* d_c,int Array_Size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < Array_Size) - d_c[id] = d_a[id] + d_b[id]; -} -int main() -{ - int Array_Size; - cout << "Enter the array size : "; - cin >> Array_Size; - int h_a[Array_Size],h_b[Array_Size], h_c[Array_Size]; - int Array_Bytes = Array_Size * sizeof(int); - for(int i=0; i>>(d_a,d_b,d_c,Array_Size); - // Copy the resulting array from GPU (d_out) to the CPU (h_out) - cudaMemcpy(h_c, d_c, Array_Bytes, cudaMemcpyDeviceToHost); - for(int i=0; i