diff --git a/Assignment2 b/96752745_Amal/assignment2.cu
similarity index 91%
rename from Assignment2
rename to 96752745_Amal/assignment2.cu
index aa25e10..9ace3b7 100644
--- a/Assignment2
+++ b/96752745_Amal/assignment2.cu
@@ -1,57 +1,57 @@
-#include<iostream>
-using namespace std;
-
-__global__ void Add(float *d_a,float *d_b,float *d_c,int r,int c){
-    
-    int i =blockIdx.x*blockDim.x+threadIdx.x;
-    int j =blockIdx.y*blockDim.y+threadIdx.y;
-    int k = i+j*c;
-    //i is defined for horizontal traversal
-    if(i<c && j<r){
-        d_c[k]=d_a[k]+d_b[k];
-    }    
-}
-
-
-int main()
-{
-    int r,c,i,j;
-    cout<<"Enter the rows and columns\n";
-    cin>>r>>c;
-    float h_a[r][c],h_b[r][c],h_c[r][c];
-    for(i=0;i<r;i++)
-    {
-        for(j=0;j<c;j++)
-        {
-            h_a[i][j]=i+j+3;
-            h_b[i][j]=i*j;
-        }
-    }
-    float *d_a,*d_b,*d_c;
-    cudaMalloc((void**)&d_a, (r*c)*sizeof(float));
-    cudaMalloc((void**)&d_b, (r*c)*sizeof(float));
-    cudaMalloc((void**)&d_c, (r*c)*sizeof(float));
-
-    cudaMemcpy(d_a, h_a, r*c*sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_b, h_b, r*c*sizeof(float), cudaMemcpyHostToDevice);
-
-    dim3 dimBlock(32, 32);
-    dim3 dimGrid((int)ceil(1.0*r/dimBlock.x),(int)ceil(1.0*c/dimBlock.y));
-    Add<<<dimGrid,dimBlock>>>(d_a,d_b,d_c,r,c);
-    cudaMemcpy(h_c, d_c, (r*c)*sizeof(float), cudaMemcpyDeviceToHost);
-
-    cout<<"Sum of the 2 matrices is:\n";
-    for(i=0;i<r;i++)
-    {
-        for(j=0;j<c;j++)
-        {
-            printf("%.2f ",h_c[i][j]);
-        }
-        cout<<"\n";
-    }
-
-    cudaFree(d_a);
-    cudaFree(d_b);
-    cudaFree(d_c);
-    return 0;
-}
+#include<iostream>
+using namespace std;
+
+// Element-wise sum of two r x c row-major float matrices: d_c = d_a + d_b.
+// The x index selects the column and the y index the row of the element handled.
+__global__ void Add(float *d_a,float *d_b,float *d_c,int r,int c){
+    const int col = blockIdx.x*blockDim.x+threadIdx.x;
+    const int row = blockIdx.y*blockDim.y+threadIdx.y;
+    // Threads that fall outside the matrix have nothing to do.
+    if(col>=c || row>=r) return;
+    const int offset = row*c+col;
+    d_c[offset]=d_a[offset]+d_b[offset];
+}
+
+
+// Adds two r x c matrices on the GPU and prints the sum; exits with 1 on bad input or a CUDA error.
+int main()
+{
+    int r,c,i,j;
+    cout<<"Enter the rows and columns\n";
+    if(!(cin>>r>>c) || r<=0 || c<=0){
+        cerr<<"Rows and columns must be positive integers\n";
+        return 1;
+    }
+    // Flat heap buffers (element (i,j) at i*c+j): stack VLAs overflow for large matrices.
+    size_t n=(size_t)r*c, bytes=n*sizeof(float);
+    float *h_a=new float[n],*h_b=new float[n],*h_c=new float[n];
+    for(i=0;i<r;i++)
+        for(j=0;j<c;j++){
+            h_a[(size_t)i*c+j]=i+j+3;
+            h_b[(size_t)i*c+j]=i*j;
+        }
+    float *d_a=NULL,*d_b=NULL,*d_c=NULL;
+    // Each step runs only if every earlier one succeeded, so err holds the first failure.
+    cudaError_t err=cudaMalloc((void**)&d_a, bytes);
+    if(err==cudaSuccess) err=cudaMalloc((void**)&d_b, bytes);
+    if(err==cudaSuccess) err=cudaMalloc((void**)&d_c, bytes);
+    if(err==cudaSuccess) err=cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
+    if(err==cudaSuccess) err=cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
+    if(err==cudaSuccess){
+        dim3 dimBlock(32, 32);
+        dim3 dimGrid((c+dimBlock.x-1)/dimBlock.x,(r+dimBlock.y-1)/dimBlock.y);
+        Add<<<dimGrid,dimBlock>>>(d_a,d_b,d_c,r,c);
+        err=cudaGetLastError(); // a bad launch configuration is only reported here
+    }
+    // The blocking copy back also reports faults raised while the kernel ran.
+    if(err==cudaSuccess) err=cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+    if(err!=cudaSuccess) cerr<<"CUDA error: "<<cudaGetErrorString(err)<<"\n";
+    else cout<<"Sum of the 2 matrices is:\n";
+    for(i=0;err==cudaSuccess && i<r;i++){
+        for(j=0;j<c;j++) printf("%.2f ",h_c[(size_t)i*c+j]);
+        cout<<"\n";
+    }
+    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
+    delete[] h_a; delete[] h_b; delete[] h_c;
+    return err==cudaSuccess?0:1;
+}
\ No newline at end of file
diff --git a/96752745_Amal/assignment3.cu b/96752745_Amal/assignment3.cu
new file mode 100644
index 0000000..8a0795a
--- /dev/null
+++ b/96752745_Amal/assignment3.cu
@@ -0,0 +1,53 @@
+#include<iostream>
+using namespace std;
+
+// Transposes, in place, the square max x max int matrix stored contiguously in d_a.
+// Launch with a 2D grid covering at least max x max threads; extra threads do nothing.
+// Each off-diagonal pair (i,j)/(j,i) is swapped by exactly one thread (the one with
+// i < j), so no two threads touch the same element and no barrier is needed.
+__global__ void Transpose(int *d_a,int max){
+	int i = blockIdx.x*blockDim.x+threadIdx.x;
+	int j = blockIdx.y*blockDim.y+threadIdx.y;
+
+	// j < max together with i < j also bounds i; the diagonal (i == j) never moves.
+	if(i<j && j<max)
+	{
+		int id1 = i+max*j;
+		int id2 = j+max*i;
+		int t = d_a[id1];
+		d_a[id1]=d_a[id2];
+		d_a[id2]=t;
+	}
+}
+
+// Zero-pads an r x c matrix to max x max, transposes it on the GPU and prints the c x r result.
+int main()
+{
+	int r,c,i,j,max;
+	cout<<"Enter the number of rows and columns:\n";
+	if(!(cin>>r>>c) || r<=0 || c<=0){
+		cerr<<"Rows and columns must be positive integers\n";
+		return 1;
+	}
+	max=r>c?r:c;
+	size_t n=(size_t)max*max, bytes=n*sizeof(int);
+	int *h_a=new int[n](), *d_a=NULL; // zeroed heap buffer, (i,j) at i*max+j; no stack VLA
+	for(i=0;i<r;i++)
+		for(j=0;j<c;j++) h_a[(size_t)i*max+j]=2*i+j;
+	cudaError_t err=cudaMalloc((void**)&d_a, bytes);
+	if(err==cudaSuccess) err=cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
+	if(err==cudaSuccess){
+		dim3 dimBlock(32, 32), dimGrid((max+dimBlock.x-1)/dimBlock.x, (max+dimBlock.y-1)/dimBlock.y);
+		Transpose<<<dimGrid,dimBlock>>>(d_a,max);
+		err=cudaGetLastError(); // a bad launch configuration is only reported here
+	}
+	// The blocking copy back also reports faults raised while the kernel ran.
+	if(err==cudaSuccess) err=cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost);
+	if(err!=cudaSuccess) cerr<<"CUDA error: "<<cudaGetErrorString(err)<<"\n";
+	else cout<<"The transpose matrix is:\n";
+	for(i=0;err==cudaSuccess && i<c;i++){
+		for(j=0;j<r;j++) cout<<h_a[(size_t)i*max+j]<<" ";
+		cout<<"\n";
+	}
+	cudaFree(d_a); delete[] h_a;
+	return err==cudaSuccess?0:1;
+}
\ No newline at end of file