Skip to content
This repository was archived by the owner on Mar 14, 2023. It is now read-only.
Open

Fix #79

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions Shreesh/A4/DotN/DotProd.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#include<iostream>
#include <stdlib.h>
using namespace std;

//Device code
/*
 * Element-wise product kernel: C[i] = A[i] * B[i] for every i in [0, size).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
 * contain more threads than elements; surplus threads do nothing.
 *
 * A, B, C : arrays of at least `size` ints, dereferenced by the kernel.
 * size    : number of valid elements.
 */
__global__ void dotProduct(int* A, int* B, int* C, int size){
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;
    // The grid is rounded up to a whole number of blocks, so the tail threads
    // of the last block must not touch memory beyond the arrays.
    if(abs_id < size){
        //term wise product in global memory
        C[abs_id] = A[abs_id] * B[abs_id];
    }
}

/*
 * Per-block sum reduction: d_out[blockIdx.x] receives the sum of the slice of
 * d_C covered by this block, restricted to indices below `size`.
 *
 * Launch layout: 1D grid of 1D blocks. blockDim.x must be a power of two and
 * at most 1024 (the capacity of the static shared buffer); otherwise the
 * halving loop would skip elements or overrun the buffer.
 *
 * d_C   : input array of at least `size` ints.
 * d_out : output array with one int per launched block.
 * size  : number of valid elements in d_C.
 */
__global__ void dotProdSum(int* d_C, int* d_out, int size){

    int t_id = threadIdx.x;
    int b_id = blockIdx.x;
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;
    __shared__ int sh_data[1024];

    //global -> shared memory
    // Threads past the end of the input contribute 0 (the additive identity),
    // so the reduction below never reads an uninitialized slot.
    sh_data[t_id] = (abs_id < size) ? d_C[abs_id] : 0;
    __syncthreads();

    //reduce operation
    // Every thread of the block executes every iteration and reaches the
    // barrier; only the work is predicated. Skipping the barrier for some
    // threads (as an early `continue` would) is undefined behaviour.
    for(int s = blockDim.x/2; s > 0; s = s/2){
        if(t_id < s){
            sh_data[t_id] += sh_data[t_id + s];
        }
        __syncthreads();
    }

    if(t_id == 0){
        //each shared memory (per block) -> global array
        d_out[b_id] = sh_data[0];
    }

}
//Host code
// Sequential reference: returns the sum of h_A[i]*h_B[i] for i in [0, size).
int cpuDot(int* h_A, int* h_B, int size);
// Computes the dot product on the device via the dotProduct and dotProdSum kernels.
int gpuDot(int* h_A, int* h_B, int size);
// Calls srand(seed) and fills h_in[0..size) with rand() % 10.
void populateRandom(int* h_in, int size, int seed);
// Writes each of the first `size` elements of arr to cout, each followed by ", ".
void printArray(int* arr, int size);

/*
 * Driver: reads N, builds two pseudo-random int arrays of length N, and prints
 * their dot product computed on the CPU and on the GPU.
 *
 * Returns 0 on success, 1 when N is not a positive integer.
 */
int main(int argc, char const *argv[])
{
    int size = 0;
    std::cout << "Enter N: ";
    // Reject unreadable or non-positive sizes before allocating anything.
    if(!(std::cin >> size) || size <= 0){
        std::cout << "N must be a positive integer." << std::endl;
        return 1;
    }

    // Heap storage: a variable-length stack array is non-standard C++ and
    // overflows the stack for large N.
    int* h_A = new int[size];
    int* h_B = new int[size];
    populateRandom(h_A, size, 0);
    populateRandom(h_B, size, 5);

    // Small arrays are shown by default; large ones only on request.
    bool ans = (size <= 10);
    if(size > 10){
        std::cout << "Size of arrays too large." << std::endl;
        std::cout << "Do you still want me to display? (1/0):";
        std::cin >> ans;
    }
    if(ans==1){
        std::cout << "Array A: ";
        printArray(h_A, size);
        std::cout << std::endl;
        std::cout << "Array B: ";
        printArray(h_B, size);
    }

    int naive = cpuDot(h_A, h_B, size);
    std::cout << "\n\nNaive dot: " << naive << std::endl;
    int parallel = gpuDot(h_A, h_B, size);
    std::cout << "Parallel dot: " << parallel << std::endl;

    delete[] h_A;
    delete[] h_B;
    return 0;
}

/*
 * Sequential reference dot product.
 *
 * h_A, h_B : host arrays holding at least `size` ints each.
 * size     : number of element pairs to multiply and accumulate.
 * Returns the sum of h_A[i] * h_B[i] over i in [0, size); 0 when size <= 0.
 */
int cpuDot(int* h_A, int* h_B, int size){
    int acc = 0;
    int idx = 0;
    while(idx < size){
        const int term = h_A[idx] * h_B[idx];
        acc = acc + term;
        idx++;
    }
    return acc;
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool dotCudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess){
        return true;
    }
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Dot product of two host arrays computed on the device.
 *
 * Steps: copy both inputs to the device, form the element-wise products with
 * dotProduct, then apply dotProdSum repeatedly until one value remains.
 *
 * h_A, h_B : host arrays holding at least `size` ints each.
 * size     : number of elements.
 * Returns the dot product; 0 when size <= 0 or when any CUDA call fails (the
 * failure is reported on stderr).
 */
int gpuDot(int* h_A, int* h_B, int size){
    if(size <= 0){
        return 0;   // nothing to multiply; also avoids a zero-block launch
    }

    const int threads = 1024;   // must match the shared buffer in dotProdSum
    // Integer ceil-div, written so it cannot overflow near INT_MAX.
    const int blocks = size / threads + (size % threads != 0 ? 1 : 0);
    const size_t array_bytes = (size_t)size * sizeof(int);
    const size_t reduced_bytes = (size_t)blocks * sizeof(int);

    int* d_A = NULL;
    int* d_B = NULL;
    int* d_C = NULL;     // element-wise products, later reused as scratch
    int* d_out = NULL;   // per-block partial sums
    int parallel = 0;

    bool ok = dotCudaOk(cudaMalloc((void**)&d_A, array_bytes), "cudaMalloc d_A")
           && dotCudaOk(cudaMalloc((void**)&d_B, array_bytes), "cudaMalloc d_B")
           && dotCudaOk(cudaMalloc((void**)&d_C, array_bytes), "cudaMalloc d_C")
           && dotCudaOk(cudaMalloc((void**)&d_out, reduced_bytes), "cudaMalloc d_out")
           && dotCudaOk(cudaMemcpy(d_A, h_A, array_bytes, cudaMemcpyHostToDevice), "copy A to device")
           && dotCudaOk(cudaMemcpy(d_B, h_B, array_bytes, cudaMemcpyHostToDevice), "copy B to device");

    //kernel call - product then sum
    if(ok){
        dotProduct<<<blocks, threads>>>(d_A, d_B, d_C, size);
        ok = dotCudaOk(cudaGetLastError(), "dotProduct launch");
    }

    // Each pass shrinks `remaining` values to one partial sum per block. The
    // two buffers swap roles between passes so a pass never reads and writes
    // the same array. Looping until one value is left removes the limit of
    // 1024 partial sums that a fixed second pass would impose.
    int* src = d_C;
    int* dst = d_out;
    int remaining = size;
    while(ok){
        const int pass_blocks = remaining / threads + (remaining % threads != 0 ? 1 : 0);
        dotProdSum<<<pass_blocks, threads>>>(src, dst, remaining);
        ok = dotCudaOk(cudaGetLastError(), "dotProdSum launch");
        int* swap_tmp = src;
        src = dst;
        dst = swap_tmp;
        remaining = pass_blocks;
        if(remaining == 1){
            break;
        }
    }

    if(ok){
        // Blocking copy: also surfaces any error raised while the kernels ran.
        ok = dotCudaOk(cudaMemcpy(&parallel, src, sizeof(int), cudaMemcpyDeviceToHost),
                       "copy result to host");
    }

    // Release device memory on every path; cudaFree(NULL) is a no-op.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_out);

    return ok ? parallel : 0;
}

/*
 * Fills h_in[0..size) with pseudo-random digits.
 *
 * Reseeds the C library generator with `seed`, then stores rand() % 10 in each
 * slot in index order, so the same seed always yields the same contents.
 */
void populateRandom(int* h_in, int size, int seed){
    srand(seed);
    int slot = 0;
    while(slot < size){
        h_in[slot] = rand() % 10;
        ++slot;
    }
}

/*
 * Writes the first `size` elements of arr to standard output, each followed
 * by ", ". No newline is emitted.
 */
void printArray(int* arr,int size){
    int* cursor = arr;
    int remaining = size;
    while(remaining > 0){
        std::cout << *cursor << ", ";
        ++cursor;
        --remaining;
    }
}

184 changes: 184 additions & 0 deletions Shreesh/A4/ReduceMinMax/MinMax.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#include <iostream>
#include <ctime>
#include <stdlib.h>
using namespace std;

// Element count of the input array; a file-level global read directly by
// populateRandom, printArray, cpuMinMax, gpuMinMax and main.
int size = 1024*1024; //2^20 elements

//Device code
/*
 * Per-block minimum: d_out[blockIdx.x] receives the smallest value in the
 * slice of d_in covered by this block, restricted to indices below `size`.
 *
 * Launch layout: 1D grid of 1D blocks. blockDim.x must be a power of two and
 * at most 1024 (the capacity of the static shared buffer).
 *
 * d_out : output array with one int per launched block.
 * d_in  : input array of at least `size` ints.
 * size  : number of valid elements in d_in.
 */
__global__ void findMin(int* d_out, int* d_in, int size){
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;
    int t_id = threadIdx.x;
    int b_id = blockIdx.x;

    //there is one sdata array for every block
    __shared__ int sdata[1024];

    //Copying data; global --> shared
    if(abs_id < size){
        sdata[t_id] = d_in[abs_id];
    }
    __syncthreads();

    //parallel reduce in shared memory
    //each iteration reduces size of active array by half
    for(int s = blockDim.x/2; s > 0; s = s/2){
        // A thread works only when its partner slot t_id + s lies inside the
        // input, so slots that were never loaded are never read. The barrier
        // sits outside the condition: every thread must reach it in every
        // iteration, and skipping it with `continue` is undefined behaviour.
        if(t_id < s && abs_id + s < size){
            if(sdata[t_id] > sdata[t_id + s]){
                //if +s is smaller then replace
                sdata[t_id] = sdata[t_id + s];
            }
        }
        __syncthreads(); //All half comparisons are completed
    }

    // d_out in global memory is populated by the first element of each
    // block's sdata. A block that holds no input element has nothing valid
    // to publish and leaves its slot untouched.
    if(t_id == 0 && abs_id < size){
        d_out[b_id] = sdata[0];
    }
}

/*
 * Per-block maximum: d_out[blockIdx.x] receives the largest value in the
 * slice of d_in covered by this block, restricted to indices below `size`.
 *
 * Launch layout: 1D grid of 1D blocks. blockDim.x must be a power of two and
 * at most 1024 (the capacity of the static shared buffer).
 *
 * d_out : output array with one int per launched block.
 * d_in  : input array of at least `size` ints.
 * size  : number of valid elements in d_in.
 */
__global__ void findMax(int* d_out, int* d_in, int size){
    int abs_id = threadIdx.x + blockDim.x * blockIdx.x;
    int t_id = threadIdx.x;
    int b_id = blockIdx.x;

    //there is one sdata array for every block
    __shared__ int sdata[1024];

    //Copying data; global --> shared
    if(abs_id < size){
        sdata[t_id] = d_in[abs_id];
    }
    __syncthreads();

    //parallel reduce in shared memory
    //each iteration reduces size of active array by half
    for(int s = blockDim.x/2; s > 0; s = s/2){
        // A thread works only when its partner slot t_id + s lies inside the
        // input, so slots that were never loaded are never read. The barrier
        // sits outside the condition: every thread must reach it in every
        // iteration, and skipping it with `continue` is undefined behaviour.
        if(t_id < s && abs_id + s < size){
            if(sdata[t_id] < sdata[t_id + s]){
                //if +s is greater then replace
                sdata[t_id] = sdata[t_id + s];
            }
        }
        __syncthreads(); //All half comparisons are completed
    }

    // A block that holds no input element has nothing valid to publish and
    // leaves its slot untouched.
    if(t_id == 0 && abs_id < size){
        d_out[b_id] = sdata[0];
    }
}

//Host code
// Fills the first `size` (file-level global) slots of arr with rand() values.
void populateRandom(int* arr);
// Writes the first `size` elements of arr to cout, each followed by ", ".
void printArray(int* arr);
// Scans arr on the host and prints its minimum and maximum.
void cpuMinMax(int* arr);
// Finds the minimum and maximum with the findMin/findMax kernels and prints them.
void gpuMinMax(int* h_in);
// NOTE(review): no definition or call of compareResult is visible in this
// view of the file; confirm whether the declaration is still needed.
void compareResult(int* gpu, int* cpu);

//Driver function
/*
 * Driver: fills an array of `size` random ints, then prints its minimum and
 * maximum as computed on the CPU and on the GPU.
 */
int main(int argc, char const *argv[])
{
    // Heap storage: `size` ints (4 MiB at the default 2^20) as a stack
    // variable-length array is non-standard C++ and can overflow the stack.
    int* h_in = new int[size];
    populateRandom(h_in);
    cpuMinMax(h_in);
    gpuMinMax(h_in);
    delete[] h_in;
    return 0;
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool minMaxCudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess){
        return true;
    }
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Finds the minimum and maximum of h_in[0..size) on the device and prints
 * them. `size` is the file-level global element count.
 *
 * Each search is two passes: the first leaves one candidate per block in
 * d_out, the second reduces those candidates with a single block. This
 * handles at most 1024 * 1024 elements; larger inputs are rejected with a
 * message. CUDA failures are reported on stderr and no result is printed.
 */
void gpuMinMax(int* h_in){

    const int threads = 1024;   // must match the shared buffer in the kernels
    if(size <= 0){
        std::cerr << "gpuMinMax: nothing to reduce (size <= 0)" << std::endl;
        return;
    }
    // Integer ceil-div: number of first-pass blocks, and so of candidates.
    const int reduced_size = size / threads + (size % threads != 0 ? 1 : 0);
    if(reduced_size > threads){
        // The single-block second pass can only look at `threads` candidates.
        std::cerr << "gpuMinMax: input too large for the two-pass reduction" << std::endl;
        return;
    }

    const size_t array_bytes = (size_t)size * sizeof(int);
    const size_t reduced_bytes = (size_t)reduced_size * sizeof(int);
    int* d_in = NULL;  //input array
    int* d_out = NULL; //reduced array
    int* d_min = NULL; //min
    int* d_max = NULL; //max
    int min_val = 0, max_val = 0;

    bool ok = minMaxCudaOk(cudaMalloc((void**)&d_in, array_bytes), "cudaMalloc d_in")
           && minMaxCudaOk(cudaMalloc((void**)&d_out, reduced_bytes), "cudaMalloc d_out")
           && minMaxCudaOk(cudaMalloc((void**)&d_min, sizeof(int)), "cudaMalloc d_min")
           && minMaxCudaOk(cudaMalloc((void**)&d_max, sizeof(int)), "cudaMalloc d_max")
           && minMaxCudaOk(cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice), "copy input to device");

    // The kernels declare their shared buffer statically, so no dynamic
    // shared memory is requested in the launch configuration.

    //find min
    if(ok){
        findMin<<<reduced_size, threads>>>(d_out, d_in, size);
        findMin<<<1, threads>>>(d_min, d_out, reduced_size);
        ok = minMaxCudaOk(cudaGetLastError(), "findMin launch")
          && minMaxCudaOk(cudaMemcpy(&min_val, d_min, sizeof(int), cudaMemcpyDeviceToHost), "copy min to host");
    }
    //find max
    if(ok){
        findMax<<<reduced_size, threads>>>(d_out, d_in, size);
        findMax<<<1, threads>>>(d_max, d_out, reduced_size);
        ok = minMaxCudaOk(cudaGetLastError(), "findMax launch")
          && minMaxCudaOk(cudaMemcpy(&max_val, d_max, sizeof(int), cudaMemcpyDeviceToHost), "copy max to host");
    }
    //result
    if(ok){
        std::cout << "\nReducing using GPU" << std::endl;
        std::cout << "Min: " << min_val << " | Max: " << max_val << std::endl;
    }

    //free gpu memory (on every path; cudaFree(NULL) is a no-op)
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_max);
    cudaFree(d_min);

}

/*
 * Fills h_in[0..size) with raw rand() values, where `size` is the file-level
 * global. The generator is seeded from the current time, so contents differ
 * from run to run.
 */
void populateRandom(int* h_in){
    const unsigned int seed = time(NULL);
    srand(seed);
    int slot = 0;
    while(slot < size){
        h_in[slot] = rand();
        ++slot;
    }
}

/*
 * Writes the first `size` (file-level global) elements of arr to standard
 * output, each followed by ", ". No newline is emitted.
 */
void printArray(int* arr){
    int idx = 0;
    while(idx < size){
        std::cout << arr[idx] << ", ";
        ++idx;
    }
}

/*
 * Sequential reference: scans arr[0..size) once, where `size` is the
 * file-level global, and prints the smallest and largest values found.
 * Both trackers start at arr[0], so arr must hold at least one element.
 */
void cpuMinMax(int* arr){
    int lowest = arr[0];
    int highest = arr[0];
    for (int k = 0; k < size; k++)
    {
        const int value = arr[k];
        lowest = (value < lowest) ? value : lowest;
        highest = (value > highest) ? value : highest;
    }

    std::cout << "\nReducing using CPU" << std::endl;
    std::cout << "Min: " << lowest << " | Max: " << highest << std::endl;

}