Skip to content
This repository was archived by the owner on Mar 14, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions 96224494_Sherwin/assignment4_a.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#include<iostream>
#include<stdio.h>
using namespace std;
// Block-level maximum reduction.
//
// Each block loads up to one element per thread from d_array into shared
// memory, folds the block's slice down to its largest value, and thread 0
// stores that value in d_out[blockIdx.x].
//
//   d_out   : one result slot per block (written by thread 0 of each block)
//   d_array : input values, Size elements
//   Size    : number of valid elements in d_array
//
// The reduction starts at a stride of 512 and the scratch buffer holds 1024
// ints, so the kernel is written for 1024 threads per block.
__global__ void Array_max(int* d_out, int* d_array, int Size)
{
    // Scratch buffer shared by the threads of one block; every block gets
    // its own copy.
    __shared__ int sh_array[1024];

    const int lane = threadIdx.x;
    const int gidx = blockIdx.x * blockDim.x + lane;

    // Stage this thread's element (if it exists) from global into shared memory.
    if (gidx < Size)
    {
        sh_array[lane] = d_array[gidx];
    }
    __syncthreads();

    // Tree reduction: the active range halves on every pass.
    for (int half = 512; half > 0; half >>= 1)
    {
        // Reached by every thread on every pass, so the barrier is safe.
        __syncthreads();

        // A partner slot is only meaningful when it maps to a real input
        // element; slots past Size were never written and must be ignored.
        const bool in_range = (gidx < Size) && (gidx + half < Size);
        if (in_range && lane < half && sh_array[lane] < sh_array[lane + half])
        {
            sh_array[lane] = sh_array[lane + half];
        }
    }
    __syncthreads();

    // Thread 0 publishes the block's result to global memory.
    if (lane == 0)
    {
        d_out[blockIdx.x] = sh_array[lane];
    }
}
// Block-level minimum reduction.
//
// Each block loads up to one element per thread from d_array into shared
// memory, folds the block's slice down to its smallest value, and thread 0
// stores that value in d_out[blockIdx.x].
//
//   d_out   : one result slot per block (written by thread 0 of each block)
//   d_array : input values, Size elements
//   Size    : number of valid elements in d_array
//
// The reduction starts at a stride of 512 and the scratch buffer holds 1024
// ints, so the kernel is written for 1024 threads per block.
__global__ void Array_min(int* d_out, int* d_array, int Size)
{
    // Scratch buffer shared by the threads of one block; every block gets
    // its own copy.
    __shared__ int sh_array[1024];

    const int lane = threadIdx.x;
    const int gidx = blockIdx.x * blockDim.x + lane;

    // Stage this thread's element (if it exists) from global into shared memory.
    if (gidx < Size)
    {
        sh_array[lane] = d_array[gidx];
    }
    __syncthreads();

    // Tree reduction: the active range halves on every pass.
    for (int half = 512; half > 0; half >>= 1)
    {
        // Reached by every thread on every pass, so the barrier is safe.
        __syncthreads();

        // A partner slot is only meaningful when it maps to a real input
        // element; slots past Size were never written and must be ignored.
        const bool in_range = (gidx < Size) && (gidx + half < Size);
        if (in_range && lane < half && sh_array[lane] > sh_array[lane + half])
        {
            sh_array[lane] = sh_array[lane + half];
        }
    }
    __syncthreads();

    // Thread 0 publishes the block's result to global memory.
    if (lane == 0)
    {
        d_out[blockIdx.x] = sh_array[lane];
    }
}
// Returns the largest value in h_array[0 .. Size-1], computed on the GPU.
//
//   h_array : host array holding the input values
//   Size    : number of elements in h_array
//
// The input is copied to the device and reduced with Array_max in repeated
// passes of 1024-thread blocks. Each pass shrinks the data to one partial
// result per block, and passes continue until a single value remains, so the
// result is correct for any positive Size (not only Size <= 1024 * 1024).
//
// Returns 0 when Size <= 0 (there is nothing to reduce) or when a CUDA call
// fails; in the failure case the CUDA error string is printed to stderr.
int Find_max_GPU(int h_array[], int Size)
{
    const int kThreads = 1024;   // Array_max is written for 1024-thread blocks
    int result = 0;

    if (Size <= 0)
    {
        return result;
    }

    int* d_src = NULL;           // input of the current pass
    int* d_dst = NULL;           // output of the current pass
    const int first_blocks = (Size - 1) / kThreads + 1;

    cudaError_t status = cudaMalloc((void**)&d_src, (size_t)Size * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_dst, (size_t)first_blocks * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_src, h_array, (size_t)Size * sizeof(int), cudaMemcpyHostToDevice);

    int remaining = Size;
    bool done = false;
    while (status == cudaSuccess && !done)
    {
        // Integer ceil-div; avoids the float round trip through ceil().
        const int blocks = (remaining - 1) / kThreads + 1;
        Array_max <<<blocks, kThreads>>> (d_dst, d_src, remaining);
        status = cudaGetLastError();

        if (blocks == 1)
        {
            done = true;         // d_dst[0] now holds the final answer
        }
        else
        {
            // The partials become the next pass's input. The old input
            // buffer is at least as large as the next output, so the two
            // buffers can simply trade roles.
            int* tmp = d_src;
            d_src = d_dst;
            d_dst = tmp;
            remaining = blocks;
        }
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(&result, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Find_max_GPU: %s\n", cudaGetErrorString(status));
        result = 0;
    }

    cudaFree(d_src);
    cudaFree(d_dst);
    return result;
}

// Returns the smallest value in h_array[0 .. Size-1], computed on the GPU.
//
//   h_array : host array holding the input values
//   Size    : number of elements in h_array
//
// The input is copied to the device and reduced with Array_min in repeated
// passes of 1024-thread blocks. Each pass shrinks the data to one partial
// result per block, and passes continue until a single value remains, so the
// result is correct for any positive Size (not only Size <= 1024 * 1024).
//
// Returns 0 when Size <= 0 (there is nothing to reduce) or when a CUDA call
// fails; in the failure case the CUDA error string is printed to stderr.
int Find_min_GPU(int h_array[], int Size)
{
    const int kThreads = 1024;   // Array_min is written for 1024-thread blocks
    int result = 0;

    if (Size <= 0)
    {
        return result;
    }

    int* d_src = NULL;           // input of the current pass
    int* d_dst = NULL;           // output of the current pass
    const int first_blocks = (Size - 1) / kThreads + 1;

    cudaError_t status = cudaMalloc((void**)&d_src, (size_t)Size * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_dst, (size_t)first_blocks * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_src, h_array, (size_t)Size * sizeof(int), cudaMemcpyHostToDevice);

    int remaining = Size;
    bool done = false;
    while (status == cudaSuccess && !done)
    {
        // Integer ceil-div; avoids the float round trip through ceil().
        const int blocks = (remaining - 1) / kThreads + 1;
        Array_min <<<blocks, kThreads>>> (d_dst, d_src, remaining);
        status = cudaGetLastError();

        if (blocks == 1)
        {
            done = true;         // d_dst[0] now holds the final answer
        }
        else
        {
            // The partials become the next pass's input. The old input
            // buffer is at least as large as the next output, so the two
            // buffers can simply trade roles.
            int* tmp = d_src;
            d_src = d_dst;
            d_dst = tmp;
            remaining = blocks;
        }
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(&result, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Find_min_GPU: %s\n", cudaGetErrorString(status));
        result = 0;
    }

    cudaFree(d_src);
    cudaFree(d_dst);
    return result;
}



// Serial reference: returns the smallest value in h_array[0 .. Size-1].
//
//   h_array : input values
//   Size    : number of elements; must be at least 1 because h_array[0]
//             seeds the running minimum
//
// Each element is compared against the running minimum. Comparing adjacent
// elements instead would give the wrong answer for inputs such as {4, 8, 6}.
int Find_min_CPU(int h_array[], int Size)
{
    int naive_min = h_array[0];
    for (int i = 1; i < Size; i++)
    {
        if (h_array[i] < naive_min)
            naive_min = h_array[i];
    }
    return naive_min;
}



// Serial reference: returns the largest value in h_array[0 .. Size-1].
//
//   h_array : input values
//   Size    : number of elements; must be at least 1 because h_array[0]
//             seeds the running maximum
//
// Each element is compared against the running maximum. Comparing adjacent
// elements instead would give the wrong answer for inputs such as {8, 4, 6}.
int Find_max_CPU(int h_array[], int Size)
{
    int naive_max = h_array[0];
    for (int i = 1; i < Size; i++)
    {
        if (h_array[i] > naive_max)
            naive_max = h_array[i];
    }
    return naive_max;
}

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a mistake in your serial code to find the maximum and minimum.
Consider the array {4, 8, 6}. Then your Find_min_CPU algorithm returns 6 as the minimum, and not 4.

@SherwinBryan SherwinBryan May 22, 2020

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I will make the changes.




// Driver: reads an array size, fills the array with 1..Size, computes the
// maximum and minimum on both the GPU and the CPU, and reports whether the
// two agree.
//
// Returns 0 after a completed run and 1 when the size read from stdin is
// missing, non-numeric, or not positive.
int main()
{
    int Size = 0;
    printf("Enter the array size\n");
    if (scanf("%d", &Size) != 1 || Size <= 0)
    {
        fprintf(stderr, "Array size must be a positive integer\n");
        return 1;
    }

    // Heap allocation: a user-sized stack array is not standard C++ and can
    // overflow the stack for large sizes.
    int* h_array = new int[Size];
    for (int i = 0; i < Size; i++)
        h_array[i] = i + 1;

    int max = Find_max_GPU(h_array, Size);
    int min = Find_min_GPU(h_array, Size);
    int naive_min = Find_min_CPU(h_array, Size);
    int naive_max = Find_max_CPU(h_array, Size);

    printf("max no is %d\n", max);
    printf("min no is %d\n", min);
    if (max == naive_max && min == naive_min)
        printf("Result computed correctly\n");
    else
        printf("Result wrong!");

    delete[] h_array;
    return 0;
}
97 changes: 97 additions & 0 deletions 96224494_Sherwin/assignment4_b.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#include<stdio.h>

// Element-wise product: d_c[k] = d_a[k] * d_b[k] for every k in [0, size).
//
// Each thread handles the single element at its flattened 1-D global index;
// threads whose index falls at or beyond size do nothing.
__global__ void mulArray(int* d_a,int* d_b, int* d_c,int size)
{
    const int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= size)
    {
        return;   // past the end of the arrays
    }
    d_c[k] = d_a[k] * d_b[k];
}



// Block-level sum reduction.
//
// Each block loads up to one element per thread from d_array into shared
// memory, adds the block's slice together, and thread 0 stores the block's
// total in d_out[blockIdx.x].
//
//   d_out   : one result slot per block (written by thread 0 of each block)
//   d_array : input values, Size elements
//   Size    : number of valid elements in d_array
//
// The reduction starts at a stride of 512 and the scratch buffer holds 1024
// ints, so the kernel is written for 1024 threads per block.
__global__ void Array_Add(int* d_out, int* d_array, int Size)
{
    // Scratch buffer shared by the threads of one block; every block gets
    // its own copy.
    __shared__ int sh_array[1024];

    const int lane = threadIdx.x;
    const int gidx = blockIdx.x * blockDim.x + lane;

    // Stage this thread's element (if it exists) from global into shared memory.
    if (gidx < Size)
    {
        sh_array[lane] = d_array[gidx];
    }
    __syncthreads();

    // Tree reduction: the active range halves on every pass.
    for (int half = 512; half > 0; half >>= 1)
    {
        // Reached by every thread on every pass, so the barrier is safe.
        __syncthreads();

        // Only fold in a partner slot that maps to a real input element;
        // slots past Size were never written and must be ignored.
        const bool in_range = (gidx < Size) && (gidx + half < Size);
        if (in_range && lane < half)
        {
            sh_array[lane] += sh_array[lane + half];
        }
    }
    __syncthreads();

    // Thread 0 publishes the block's total to global memory.
    if (lane == 0)
    {
        d_out[blockIdx.x] = sh_array[lane];
    }
}
// Returns the sum of h_array[0 .. Size-1], computed on the GPU.
//
//   h_array : host array holding the input values
//   Size    : number of elements in h_array
//
// The input is copied to the device and reduced with Array_Add in repeated
// passes of 1024-thread blocks. Each pass shrinks the data to one partial sum
// per block, and passes continue until a single value remains, so the result
// covers every element for any positive Size (not only Size <= 1024 * 1024).
// The sum is accumulated in int, so it wraps if the true total does not fit.
//
// Returns 0 when Size <= 0 (the empty sum) or when a CUDA call fails; in the
// failure case the CUDA error string is printed to stderr.
int Find_Sum_GPU(int h_array[], int Size)
{
    const int kThreads = 1024;   // Array_Add is written for 1024-thread blocks
    int result = 0;

    if (Size <= 0)
    {
        return result;
    }

    int* d_src = NULL;           // input of the current pass
    int* d_dst = NULL;           // output of the current pass
    const int first_blocks = (Size - 1) / kThreads + 1;

    cudaError_t status = cudaMalloc((void**)&d_src, (size_t)Size * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_dst, (size_t)first_blocks * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_src, h_array, (size_t)Size * sizeof(int), cudaMemcpyHostToDevice);

    int remaining = Size;
    bool done = false;
    while (status == cudaSuccess && !done)
    {
        // Integer ceil-div; avoids the float round trip through ceil().
        const int blocks = (remaining - 1) / kThreads + 1;
        Array_Add <<<blocks, kThreads>>> (d_dst, d_src, remaining);
        status = cudaGetLastError();

        if (blocks == 1)
        {
            done = true;         // d_dst[0] now holds the final total
        }
        else
        {
            // The partial sums become the next pass's input. The old input
            // buffer is at least as large as the next output, so the two
            // buffers can simply trade roles.
            int* tmp = d_src;
            d_src = d_dst;
            d_dst = tmp;
            remaining = blocks;
        }
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(&result, d_dst, sizeof(int), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Find_Sum_GPU: %s\n", cudaGetErrorString(status));
        result = 0;
    }

    cudaFree(d_src);
    cudaFree(d_dst);
    return result;
}






// Driver: reads an array size, builds two constant vectors (all 2s and all
// 1s), multiplies them element-wise on the GPU with mulArray, then sums the
// products with Find_Sum_GPU and prints the resulting dot product.
//
// Returns 0 on success and 1 when the size read from stdin is missing,
// non-numeric, or not positive, or when a CUDA call fails.
int main()
{
    int size = 0;
    printf("enter array size");
    if (scanf("%d", &size) != 1 || size <= 0)
    {
        fprintf(stderr, "array size must be a positive integer\n");
        return 1;
    }

    // size_t keeps the byte count from overflowing int for large sizes.
    const size_t Array_Bytes = (size_t)size * sizeof(int);

    // Heap allocation: user-sized stack arrays are not standard C++ and can
    // overflow the stack for large sizes.
    int* h_a = new int[size];
    int* h_b = new int[size];
    int* h_c = new int[size];
    for (int i = 0; i < size; i++)
    {
        h_a[i] = 2;
        h_b[i] = 1;
    }

    printf("hello\n");

    int* d_a = NULL;
    int* d_b = NULL;
    int* d_c = NULL;
    cudaError_t status = cudaMalloc((void**)&d_b, Array_Bytes);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_a, Array_Bytes);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_c, Array_Bytes);

    // Copy both operands from the host (h_a, h_b) to the device (d_a, d_b).
    if (status == cudaSuccess)
        status = cudaMemcpy(d_b, h_b, Array_Bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_a, h_a, Array_Bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
    {
        // 256 threads per block with a ceil-div grid; mulArray bounds-checks
        // its index, so the tail block is safe.
        const int threads = 256;
        const int blocks = (size - 1) / threads + 1;
        mulArray<<<blocks, threads>>>(d_a, d_b, d_c, size);
        status = cudaGetLastError();
    }

    // Copy the element-wise products from the device (d_c) to the host (h_c).
    if (status == cudaSuccess)
        status = cudaMemcpy(h_c, d_c, Array_Bytes, cudaMemcpyDeviceToHost);

    int exit_code = 0;
    if (status == cudaSuccess)
    {
        int h_sum = Find_Sum_GPU(h_c, size);
        printf("dot product sum is %d",h_sum);
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exit_code = 1;
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return exit_code;
}
37 changes: 0 additions & 37 deletions 96224494_Sherwin/asssignment_1.cu

This file was deleted.