// Shreesh/A4/DotN/DotProd.cu
//
// Dot product of two integer vectors, computed once on the CPU and once on
// the GPU (element-wise product kernel followed by a shared-memory tree
// reduction) so that the two results can be compared.

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <utility>
#include <vector>
using namespace std;

// Threads per block for every launch in this file. The reduction kernel
// halves its active range on each step, so this must be a power of two and
// must not exceed the length of the kernel's shared array.
const int kThreads = 1024;

// Stops the program with file/line information when a CUDA runtime call
// fails, instead of silently continuing with garbage results.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

//Device code

// Element-wise product: C[i] = A[i] * B[i] for every i < size.
// Launch: 1D grid of 1D blocks with at least `size` threads in total.
// Threads past the end of the arrays do nothing.
__global__ void dotProduct(int* A, int* B, int* C, int size){
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;
    if(abs_id < size){
        //term wise product in global memory
        C[abs_id] = A[abs_id] * B[abs_id];
    }
}

// Per-block sum: d_out[blockIdx.x] receives the sum of the slice of d_C
// handled by that block.
// Launch: 1D grid of 1D blocks; blockDim.x must be a power of two and at
// most kThreads. Uses kThreads * sizeof(int) bytes of static shared memory.
__global__ void dotProdSum(int* d_C, int* d_out, int size){
    __shared__ int sh_data[kThreads];

    unsigned int t_id = threadIdx.x;
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;

    //global -> shared memory; slots past the end of the input get 0 (the
    //identity for addition) so every thread can run the same loop below
    sh_data[t_id] = (abs_id < size) ? d_C[abs_id] : 0;
    __syncthreads();

    //reduce operation; the barrier is reached by every thread of the block
    //on every iteration, never from inside a thread-dependent branch
    for(unsigned int s = blockDim.x/2; s > 0; s = s/2){
        if(t_id < s){
            sh_data[t_id] += sh_data[t_id + s];
        }
        __syncthreads();
    }

    if(t_id == 0){
        //each shared memory (per block) -> global array
        d_out[blockIdx.x] = sh_data[0];
    }
}

//Host code
int cpuDot(int* h_A, int* h_B, int size);
int gpuDot(int* h_A, int* h_B, int size);
void populateRandom(int* h_in, int size, int seed);
void printArray(int* arr, int size);

// Reads N, builds two pseudo-random vectors of length N and prints their dot
// product as computed by the CPU and by the GPU.
int main(int argc, char const *argv[])
{
    int size = 0;
    cout << "Enter N: ";
    if(!(cin >> size) || size < 0){
        cerr << "N must be a non-negative integer." << endl;
        return 1;
    }

    // Heap storage: a stack array of user-chosen length is not standard C++
    // and overflows the stack for large N.
    vector<int> h_A(size); populateRandom(h_A.data(), size, 0);
    vector<int> h_B(size); populateRandom(h_B.data(), size, 5);

    // Short arrays are always shown; long ones only on request.
    bool show = true;
    if(size > 10){
        cout << "Size of arrays too large." << endl;
        cout << "Do you still want me to display? (1/0):";
        cin >> show;
    }
    if(show){
        cout << "Array A: ";
        printArray(h_A.data(), size);
        cout << endl;
        cout << "Array B: ";
        printArray(h_B.data(), size);
    }

    int naive = cpuDot(h_A.data(), h_B.data(), size);
    cout << "\n\nNaive dot: " << naive << endl;
    int parallel = gpuDot(h_A.data(), h_B.data(), size);
    cout << "Parallel dot: " << parallel << endl;
    return 0;
}

// Sequential reference: returns the sum of h_A[i] * h_B[i] for i in [0, size).
int cpuDot(int* h_A, int* h_B, int size){
    int naive = 0;
    for (int i = 0; i < size; ++i)
    {
        naive += h_A[i]*h_B[i];
    }
    return naive;
}

// GPU dot product of two host arrays of length `size`.
// Returns 0 for size <= 0 without touching the device. Exits the program
// (through CUDA_CHECK) if any CUDA call fails.
int gpuDot(int* h_A, int* h_B, int size){
    if(size <= 0){
        // Empty sum; also avoids zero-byte allocations and zero-block launches.
        return 0;
    }

    int* d_A = NULL;
    int* d_B = NULL;
    int* d_in = NULL;   // values consumed by the current reduction pass
    int* d_out = NULL;  // per-block partial sums produced by that pass
    int parallel = 0;

    size_t array_bytes = (size_t)size * sizeof(int);
    // ceil(size / kThreads) in integer arithmetic; size > 0 here.
    int blocks = (size - 1) / kThreads + 1;
    size_t reduced_bytes = (size_t)blocks * sizeof(int);

    CUDA_CHECK(cudaMalloc((void**)&d_A, array_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, array_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_in, array_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, reduced_bytes));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, array_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, array_bytes, cudaMemcpyHostToDevice));

    //kernel call - product then sum
    dotProduct<<<blocks, kThreads>>>(d_A, d_B, d_in, size);
    CUDA_CHECK(cudaGetLastError());

    // Each pass shrinks the data by a factor of kThreads. Repeat, swapping
    // the two buffers, until one value is left; a fixed two-pass scheme
    // would drop partial sums as soon as the first pass yields more than
    // kThreads of them.
    int remaining = size;
    do {
        int grid = (remaining - 1) / kThreads + 1;
        dotProdSum<<<grid, kThreads>>>(d_in, d_out, remaining);
        CUDA_CHECK(cudaGetLastError());
        std::swap(d_in, d_out);   // the output of this pass feeds the next
        remaining = grid;
    } while(remaining > 1);

    // After the final swap the result sits in d_in[0]. The blocking copy
    // also reports any error raised while the kernels were running.
    CUDA_CHECK(cudaMemcpy(&parallel, d_in, sizeof(int), cudaMemcpyDeviceToHost));

    //free gpu memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    return parallel;
}

// Fills h_in[0..size) with values in [0, 9] drawn from rand() after seeding
// it with `seed`, so a given seed always produces the same sequence.
void populateRandom(int* h_in, int size, int seed){
    srand(seed);
    for (int i = 0; i < size; ++i)
    {
        h_in[i] = rand() % 10;
    }
}

// Prints the first `size` elements of arr, each followed by ", ".
void printArray(int* arr,int size){
    for (int i = 0; i < size; ++i)
    {
        cout << arr[i] << ", ";
    }
}
// Shreesh/A4/ReduceMinMax/MinMax.cu
//
// Minimum and maximum of an integer array, computed once on the CPU and once
// on the GPU with a shared-memory tree reduction.

#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <utility>
#include <vector>

// No `using namespace std;` here: together with the global variable `size`
// below it makes every unqualified use of `size` ambiguous with std::size
// when the file is compiled as C++17 or later.

int size = 1024*1024; //2^20 elements

// Threads per block for every launch in this file. The reduction halves its
// active range on each step, so this must be a power of two and must not
// exceed the length of the shared array used by the kernels.
const int kThreads = 1024;

// Stops the program with file/line information when a CUDA runtime call
// fails, instead of silently continuing with garbage results.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

//Device code

// Shared body of findMin / findMax: reduces the slice of d_in handled by this
// block to a single value and stores it in d_out[blockIdx.x].
// kWantMin selects minimum (true) or maximum (false).
// Requires blockDim.x to be a power of two and at most kThreads; uses
// kThreads * sizeof(int) bytes of static shared memory.
template <bool kWantMin>
__device__ void reduceBlockExtreme(int* d_out, const int* d_in, int size){
    //there is one sdata array for every block
    __shared__ int sdata[kThreads];

    const unsigned int t_id = threadIdx.x;
    const int abs_id = threadIdx.x + blockDim.x * blockIdx.x;

    //Copying data; global --> shared. Slots past the end of the input get a
    //value that can never win the comparison, so every thread can run the
    //same loop below.
    sdata[t_id] = (abs_id < size) ? d_in[abs_id]
                                  : (kWantMin ? INT_MAX : INT_MIN);
    __syncthreads();

    //parallel reduce in shared memory; each iteration halves the active
    //range, and the barrier is reached by every thread of the block on every
    //iteration, never from inside a thread-dependent branch
    for(unsigned int s = blockDim.x/2; s > 0; s = s/2){
        if(t_id < s){
            const int other = sdata[t_id + s];
            const bool take = kWantMin ? (other < sdata[t_id])
                                       : (other > sdata[t_id]);
            if(take){
                sdata[t_id] = other;
            }
        }
        __syncthreads(); //All half comparisons are completed
    }

    if(t_id == 0){
        //d_out in global memory is populated by the first element of the
        //sdata array associated with each block
        d_out[blockIdx.x] = sdata[0];
    }
}

// d_out[blockIdx.x] = minimum of this block's slice of d_in[0..size).
// Launch: 1D grid of 1D blocks covering at least `size` threads.
__global__ void findMin(int* d_out, int* d_in, int size){
    reduceBlockExtreme<true>(d_out, d_in, size);
}

// d_out[blockIdx.x] = maximum of this block's slice of d_in[0..size).
// Launch: 1D grid of 1D blocks covering at least `size` threads.
__global__ void findMax(int* d_out, int* d_in, int size){
    reduceBlockExtreme<false>(d_out, d_in, size);
}

//Host code
void populateRandom(int* arr);
void printArray(int* arr);
void cpuMinMax(int* arr);
void gpuMinMax(int* h_in);
void compareResult(int* gpu, int* cpu); // declared only; no definition in this file

// Signature shared by findMin and findMax.
typedef void (*ReduceKernel)(int*, int*, int);

// Runs `kernel` repeatedly until `count` device values are reduced to one and
// returns that value. d_in is only read. d_scratch_a needs room for
// ceil(count / kThreads) ints and d_scratch_b for ceil of that / kThreads.
// count must be positive.
static int reduceOnDevice(ReduceKernel kernel, int* d_in,
                          int* d_scratch_a, int* d_scratch_b, int count){
    int* src = d_in;
    int* dst = d_scratch_a;
    int* spare = d_scratch_b;
    int remaining = count;
    do {
        int grid = (remaining - 1) / kThreads + 1;
        kernel<<<grid, kThreads>>>(dst, src, remaining);
        CUDA_CHECK(cudaGetLastError());
        src = dst;              // the output of this pass feeds the next
        std::swap(dst, spare);  // alternate scratch buffers; d_in is never written
        remaining = grid;
    } while(remaining > 1);

    int result = 0;
    // The blocking copy also reports any error raised while the kernels ran.
    CUDA_CHECK(cudaMemcpy(&result, src, sizeof(int), cudaMemcpyDeviceToHost));
    return result;
}

//Driver function
int main(int argc, char const *argv[])
{
    // Heap storage: 2^20 ints (4 MB) do not reliably fit on the stack.
    std::vector<int> h_in(size > 0 ? size : 0);
    populateRandom(h_in.data());
    cpuMinMax(h_in.data());
    gpuMinMax(h_in.data());
    return 0;
}

// Copies the `size` elements of h_in to the device, reduces them to their
// minimum and maximum there and prints both. Prints a notice instead when
// `size` is not positive. Exits the program (through CUDA_CHECK) if any CUDA
// call fails.
void gpuMinMax(int* h_in){
    if(size <= 0){
        std::cout << "\nReducing using GPU" << std::endl;
        std::cout << "Array is empty." << std::endl;
        return;
    }

    size_t array_bytes = (size_t)size * sizeof(int);
    // ceil divisions in integer arithmetic; size > 0 here.
    int reduced_size = (size - 1) / kThreads + 1;
    int reduced_again = (reduced_size - 1) / kThreads + 1;
    int* d_in = NULL;        //input array
    int* d_out = NULL;       //reduced array (first scratch buffer)
    int* d_out2 = NULL;      //second scratch buffer for passes beyond the first
    int min, max;

    CUDA_CHECK(cudaMalloc((void**)&d_in, array_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, (size_t)reduced_size * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_out2, (size_t)reduced_again * sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice));

    //find min
    min = reduceOnDevice(findMin, d_in, d_out, d_out2, size);
    //find max
    max = reduceOnDevice(findMax, d_in, d_out, d_out2, size);

    //result
    std::cout << "\nReducing using GPU" << std::endl;
    std::cout << "Min: " << min << " | Max: " << max << std::endl;

    //free gpu memory
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    CUDA_CHECK(cudaFree(d_out2));
}

// Fills h_in[0..size) with rand() values, seeded from the current time.
void populateRandom(int* h_in){
    unsigned int t = time(NULL);
    srand(t);
    for (int i = 0; i < size; ++i)
    {
        h_in[i] = rand();
    }
}

// Prints the `size` elements of arr, each followed by ", ".
void printArray(int* arr){
    for (int i = 0; i < size; ++i)
    {
        std::cout << arr[i] << ", ";
    }
}

// Sequential reference: scans the `size` elements of arr and prints their
// minimum and maximum. Prints a notice instead when `size` is not positive.
void cpuMinMax(int* arr){
    std::cout << "\nReducing using CPU" << std::endl;
    if(size <= 0){
        // arr[0] does not exist, so there is nothing to seed the scan with.
        std::cout << "Array is empty." << std::endl;
        return;
    }

    int min = arr[0];
    int max = arr[0];
    for (int i = 1; i < size; ++i)
    {
        if(min > arr[i])
            min = arr[i];
        if(max < arr[i])
            max = arr[i];
    }

    std::cout << "Min: " << min << " | Max: " << max << std::endl;
}