diff --git a/96224494_Sherwin/assignment4_a.cu b/96224494_Sherwin/assignment4_a.cu
new file mode 100644
index 0000000..e44e19c
--- /dev/null
+++ b/96224494_Sherwin/assignment4_a.cu
@@ -0,0 +1,153 @@
+#include<iostream>
+#include<stdio.h>
+using namespace std;
+// Block-wise maximum. Block b reduces its slice of d_array (blockDim.x elements starting at
+// b*blockDim.x, clipped to Size) in shared memory and stores the result in d_out[b].
+// Launch 1-D with blockDim.x <= 1024, the capacity of the shared buffer.
+__global__ void Array_max(int* d_out, int* d_array, int Size)
+{
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    int first = bid * blockDim.x;   // global index of this block's first element
+    __shared__ int sh_array[1024];  // one slot per thread, private to the block
+
+    // Number of real elements in this block's slice: the last block may be partial and a
+    // block starting at or beyond Size has none.
+    int count = Size - first;
+    if(count > (int)blockDim.x)
+        count = blockDim.x;
+
+    // Copy data from global to shared memory
+    if(tid < count)
+        sh_array[tid] = d_array[first + tid];
+    __syncthreads();
+
+    // Tree reduction; each iteration halves the active range. The partner index is checked
+    // against count, so a slot that was never loaded is never read, whatever the block size.
+    for(int s = 512; s > 0; s = s/2)
+    {
+        if(tid < s && tid + s < count && sh_array[tid] < sh_array[tid + s])
+            sh_array[tid] = sh_array[tid + s];
+        __syncthreads();   // outside the branch, so every thread of the block reaches it
+    }
+    // Only thread 0 of a non-empty block writes the block result back to global memory
+    if(tid == 0 && count > 0)
+        d_out[bid] = sh_array[0];
+}
+// Block-wise minimum. Block b reduces its slice of d_array (blockDim.x elements starting at
+// b*blockDim.x, clipped to Size) in shared memory and stores the result in d_out[b].
+// Launch 1-D with blockDim.x <= 1024, the capacity of the shared buffer.
+__global__ void Array_min(int* d_out, int* d_array, int Size)
+{
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    int first = bid * blockDim.x;   // global index of this block's first element
+    __shared__ int sh_array[1024];  // one slot per thread, private to the block
+
+    // Number of real elements in this block's slice: the last block may be partial and a
+    // block starting at or beyond Size has none.
+    int count = Size - first;
+    if(count > (int)blockDim.x)
+        count = blockDim.x;
+
+    // Copy data from global to shared memory
+    if(tid < count)
+        sh_array[tid] = d_array[first + tid];
+    __syncthreads();
+
+    // Tree reduction; each iteration halves the active range. The partner index is checked
+    // against count, so a slot that was never loaded is never read, whatever the block size.
+    for(int s = 512; s > 0; s = s/2)
+    {
+        if(tid < s && tid + s < count && sh_array[tid] > sh_array[tid + s])
+            sh_array[tid] = sh_array[tid + s];
+        __syncthreads();   // outside the branch, so every thread of the block reaches it
+    }
+    // Only thread 0 of a non-empty block writes the block result back to global memory
+    if(tid == 0 && count > 0)
+        d_out[bid] = sh_array[0];
+}
+// GPU max of h_array[0..Size). Prints to stderr and returns 0 if Size <= 0 or a CUDA call fails.
+int Find_max_GPU(int h_array[], int Size)
+{
+    int *d_buf[2] = {NULL, NULL}, src = 0, h_max = 0, blocks = Size / 1024 + (Size % 1024 != 0);
+    cudaError_t err = Size > 0 ? cudaMalloc((void**)&d_buf[0], Size*sizeof(int)) : cudaErrorInvalidValue;
+    if(err == cudaSuccess) err = cudaMalloc((void**)&d_buf[1], blocks*sizeof(int));
+    if(err == cudaSuccess) err = cudaMemcpy(d_buf[0], h_array, Size*sizeof(int), cudaMemcpyHostToDevice);
+    for(int n = Size; err == cudaSuccess && n > 1; n = blocks, blocks = (n + 1023) / 1024, src ^= 1)
+    {   // one pass: the n values in d_buf[src] become ceil(n/1024) block maxima in the other buffer
+        Array_max <<<blocks, 1024>>> (d_buf[src ^ 1], d_buf[src], n);
+        err = cudaGetLastError();   // a rejected launch is reported only here
+    }
+    if(err == cudaSuccess) err = cudaMemcpy(&h_max, d_buf[src], sizeof(int), cudaMemcpyDeviceToHost);
+    if(err != cudaSuccess) fprintf(stderr, "Find_max_GPU: %s\n", cudaGetErrorString(err));
+    cudaFree(d_buf[0]); cudaFree(d_buf[1]);
+    return err == cudaSuccess ? h_max : 0;
+}
+// GPU min of h_array[0..Size). Prints to stderr and returns 0 if Size <= 0 or a CUDA call fails.
+// Passes repeat until a single value is left, so more than 1024 block results are handled too.
+int Find_min_GPU(int h_array[], int Size)
+{
+    int *d_buf[2] = {NULL, NULL}, src = 0, h_min = 0, blocks = Size / 1024 + (Size % 1024 != 0);
+    cudaError_t err = Size > 0 ? cudaMalloc((void**)&d_buf[0], Size*sizeof(int)) : cudaErrorInvalidValue;
+    if(err == cudaSuccess) err = cudaMalloc((void**)&d_buf[1], blocks*sizeof(int));
+    if(err == cudaSuccess) err = cudaMemcpy(d_buf[0], h_array, Size*sizeof(int), cudaMemcpyHostToDevice);
+    for(int n = Size; err == cudaSuccess && n > 1; n = blocks, blocks = (n + 1023) / 1024, src ^= 1)
+    {   // one pass: the n values in d_buf[src] become ceil(n/1024) block minima in the other buffer
+        Array_min <<<blocks, 1024>>> (d_buf[src ^ 1], d_buf[src], n);
+        err = cudaGetLastError();   // a rejected launch is reported only here
+    }
+    if(err == cudaSuccess) err = cudaMemcpy(&h_min, d_buf[src], sizeof(int), cudaMemcpyDeviceToHost);
+    if(err != cudaSuccess) fprintf(stderr, "Find_min_GPU: %s\n", cudaGetErrorString(err));
+    cudaFree(d_buf[0]); cudaFree(d_buf[1]);
+    return err == cudaSuccess ? h_min : 0;
+}
+
+// Sequential reference: returns the smallest of h_array[0..Size). Every element is compared
+// with the running minimum. Requires Size >= 1 because h_array[0] seeds that minimum.
+int Find_min_CPU(int h_array[], int Size)
+{
+    int naive_min = h_array[0];
+    for(int i = 1; i < Size; i++)
+    {
+        if(h_array[i] < naive_min)
+            naive_min = h_array[i];
+    }
+    return naive_min;
+}
+
+// Sequential reference: returns the largest of h_array[0..Size). Every element is compared
+// with the running maximum. Requires Size >= 1 because h_array[0] seeds that maximum.
+int Find_max_CPU(int h_array[], int Size)
+{
+    int naive_max = h_array[0];
+    for(int i = 1; i < Size; i++)
+    {
+        if(h_array[i] > naive_max)
+            naive_max = h_array[i];
+    }
+    return naive_max;
+}
+
+
+// Compares the GPU min/max of the array 1..Size with the CPU reference; returns 0 only if they agree.
+int main()
+{
+    int Size;
+    printf("Enter the array size\n");
+    if(scanf("%d",&Size) != 1 || Size <= 0)
+    {
+        fprintf(stderr, "Array size must be a positive integer\n");
+        return 1;
+    }
+    int* h_array = new int[Size];   // heap: a stack array of user-chosen size can overflow the stack
+    for(int i=0; i<Size; i++)
+        h_array[i] =i+1;
+    int max = Find_max_GPU(h_array, Size);
+    int min = Find_min_GPU(h_array, Size);
+    bool same = max==Find_max_CPU(h_array, Size) && min==Find_min_CPU(h_array, Size);
+    delete[] h_array;
+    printf("max no is %d\nmin no is %d\n", max, min);
+    printf("%s\n", same ? "Result computed correctly" : "Result wrong!");
+    return same ? 0 : 1;
+}
diff --git a/96224494_Sherwin/assignment4_b.cu b/96224494_Sherwin/assignment4_b.cu
new file mode 100644
index 0000000..13e8866
--- /dev/null
+++ b/96224494_Sherwin/assignment4_b.cu
@@ -0,0 +1,97 @@
+#include<stdio.h>
+
+// Element-wise product: the thread with global index k stores d_a[k] * d_b[k] in d_c[k].
+__global__ void mulArray(int* d_a,int* d_b, int* d_c,int size)
+{
+    const int k = blockIdx.x * blockDim.x + threadIdx.x;
+    if(k >= size)
+        return;   // this thread lies past the end of the arrays
+    d_c[k] = d_a[k] * d_b[k];
+}
+
+// Block-wise sum. Block b adds up its slice of d_array (blockDim.x elements starting at
+// b*blockDim.x, clipped to Size) in shared memory and stores the total in d_out[b].
+// Launch 1-D with blockDim.x <= 1024, the capacity of the shared buffer.
+__global__ void Array_Add(int* d_out, int* d_array, int Size)
+{
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    int first = bid * blockDim.x;   // global index of this block's first element
+    __shared__ int sh_array[1024];  // one slot per thread, private to the block
+    // Real elements in this slice: the last block may be partial, a block past Size has none
+    int count = Size - first;
+    if(count > (int)blockDim.x)
+        count = blockDim.x;
+    // Copy data from global to shared memory
+    if(tid < count)
+        sh_array[tid] = d_array[first + tid];
+    __syncthreads();
+
+    // Tree reduction; each iteration halves the active range. The partner index is checked
+    // against count, so a slot that was never loaded is never read, whatever the block size.
+    for(int s = 512; s > 0; s = s/2)
+    {
+        if(tid < s && tid + s < count)
+            sh_array[tid] += sh_array[tid + s];
+        __syncthreads();   // outside the branch, so every thread of the block reaches it
+    }
+    // Only thread 0 of a non-empty block writes the block result back to global memory
+    if(tid == 0 && count > 0)
+        d_out[bid] = sh_array[0];
+}
+// GPU sum of h_array[0..Size). Prints to stderr and returns 0 if Size <= 0 or a CUDA call fails.
+// Passes repeat until a single value is left, so more than 1024 block results are handled too.
+// The total is accumulated in int, exactly like the element type.
+int Find_Sum_GPU(int h_array[], int Size)
+{
+    int *d_buf[2] = {NULL, NULL}, src = 0, h_sum = 0, blocks = Size / 1024 + (Size % 1024 != 0);
+    cudaError_t err = Size > 0 ? cudaMalloc((void**)&d_buf[0], Size*sizeof(int)) : cudaErrorInvalidValue;
+    if(err == cudaSuccess) err = cudaMalloc((void**)&d_buf[1], blocks*sizeof(int));
+    if(err == cudaSuccess) err = cudaMemcpy(d_buf[0], h_array, Size*sizeof(int), cudaMemcpyHostToDevice);
+    for(int n = Size; err == cudaSuccess && n > 1; n = blocks, blocks = (n + 1023) / 1024, src ^= 1)
+    {   // one pass: the n values in d_buf[src] become ceil(n/1024) block sums in the other buffer
+        Array_Add <<<blocks, 1024>>> (d_buf[src ^ 1], d_buf[src], n);
+        err = cudaGetLastError();   // a rejected launch is reported only here
+    }
+    if(err == cudaSuccess) err = cudaMemcpy(&h_sum, d_buf[src], sizeof(int), cudaMemcpyDeviceToHost);
+    if(err != cudaSuccess) fprintf(stderr, "Find_Sum_GPU: %s\n", cudaGetErrorString(err));
+    cudaFree(d_buf[0]); cudaFree(d_buf[1]);
+    return err == cudaSuccess ? h_sum : 0;
+}
+
+
+
+// Dot product demo: mulArray multiplies a[i] = 2 by b[i] = 1 on the GPU and Find_Sum_GPU adds
+// up the products. Returns 0 on success, 1 on bad input or a CUDA failure.
+int main()
+{
+    int size;
+    printf("enter array size");
+    if(scanf("%d",&size) != 1 || size <= 0)
+    {
+        fprintf(stderr, "array size must be a positive integer\n");
+        return 1;
+    }
+    size_t Array_Bytes = size * sizeof(int);
+    int *h_a = new int[size], *h_b = new int[size], *h_c = new int[size];   // heap, not stack arrays
+    for(int i=0; i<size; i++)
+    {
+        h_a[i]= 2;
+        h_b[i]= 1;
+    }
+    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
+    cudaError_t err = cudaMalloc((void**)&d_a, Array_Bytes);
+    if(err == cudaSuccess) err = cudaMalloc((void**)&d_b, Array_Bytes);
+    if(err == cudaSuccess) err = cudaMalloc((void**)&d_c, Array_Bytes);
+    if(err == cudaSuccess) err = cudaMemcpy(d_a, h_a, Array_Bytes, cudaMemcpyHostToDevice);
+    if(err == cudaSuccess) err = cudaMemcpy(d_b, h_b, Array_Bytes, cudaMemcpyHostToDevice);
+    // 256 threads per block instead of 1; mulArray's bounds check covers the partial last block
+    if(err == cudaSuccess) mulArray<<<(size - 1) / 256 + 1, 256>>>(d_a,d_b,d_c,size);
+    if(err == cudaSuccess) err = cudaGetLastError();
+    if(err == cudaSuccess) err = cudaMemcpy(h_c, d_c, Array_Bytes, cudaMemcpyDeviceToHost);
+    if(err == cudaSuccess) printf("dot product sum is %d",Find_Sum_GPU(h_c, size));
+    else fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
+    delete[] h_a; delete[] h_b; delete[] h_c;
+    return err == cudaSuccess ? 0 : 1;
+}
diff --git a/96224494_Sherwin/asssignment_1.cu b/96224494_Sherwin/asssignment_1.cu
deleted file mode 100644
index 799c8bc..0000000
--- a/96224494_Sherwin/asssignment_1.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include<iostream>
-using namespace std;
-
-__global__ void AddArray(int* d_a,int* d_b, int* d_c,int Array_Size)
-{
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
-    if(id < Array_Size)
-        d_c[id] = d_a[id] + d_b[id];
-}
-int main()
-{
-    int Array_Size;
-    cout << "Enter the array size : ";
-    cin >> Array_Size;
-    int h_a[Array_Size],h_b[Array_Size], h_c[Array_Size];
-    int Array_Bytes = Array_Size * sizeof(int);  
-    for(int i=0; i<Array_Size; i++)
-    {
-        h_a[i] = i;
-        h_b[i] = i;
-    }
-    int *d_a,*d_b, *d_c;
-    cudaMalloc((void**)&d_b, Array_Bytes);
-    cudaMalloc((void**)&d_a, Array_Bytes);
-    cudaMalloc((void**)&d_c, Array_Bytes);
-    // Copy the array from CPU (h_in) to the GPU (d_in)
-    cudaMemcpy(d_b, h_b, Array_Bytes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_a, h_a, Array_Bytes, cudaMemcpyHostToDevice);
-    AddArray<<<ceil(1.0*Array_Size/1024), 1024>>>(d_a,d_b,d_c,Array_Size);
-    // Copy the resulting array from GPU (d_out) to the CPU (h_out)
-    cudaMemcpy(h_c, d_c, Array_Bytes, cudaMemcpyDeviceToHost);
-    for(int i=0; i<Array_Size; i++)
-        cout << h_c[i] << " ";
-    cudaFree(d_a);
-    cudaFree(d_b);
-    cudaFree(d_c);
-}