From cb9cd7aad3a20aa9b0e4b3c2d3cbb26ac3e2f760 Mon Sep 17 00:00:00 2001
From: Asif Manzoor Hasan <hasan_asif1995@yahoo.co.in>
Date: Wed, 1 Dec 2021 18:32:47 +0100
Subject: [PATCH 1/2] Optimization for channel

---
 .gitignore                 |   2 +
 Makefile                   |   6 +-
 src/boundary.h             |  22 ++
 src/boundary_condition_y.h | 107 ++++++--
 src/boundary_condition_z.h |   8 +-
 src/calc_stress.cu         | 120 ++++++++-
 src/cuda_derivs.h          |  66 ++---
 src/cuda_functions.h       |  23 +-
 src/cuda_main.cu           | 203 ++++++++++----
 src/cuda_main.h            |  13 +
 src/cuda_rhs.cu            | 526 +++++++++++++++++++++++--------------
 src/cuda_utils.cu          |  24 +-
 src/globals.h              |  40 +--
 src/main.h                 |   2 +-
 14 files changed, 808 insertions(+), 354 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7f02fbe..4fd798e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.ncu-rep
 *.bin
 *.txt
 *.o
@@ -6,3 +7,4 @@ logfile
 *.xmf
 *.mp4
 ns
+tmp
diff --git a/Makefile b/Makefile
index 6485a4b..f5a4d34 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ GEN = /usr
 
 #MPI = /usr/local/openmpi-4.1.1
 
-CUDA = /usr/local/cuda-11.2
+CUDA = /usr/local/cuda-11.4
 
 GPU_ARCHITECTURE = 70
 
@@ -38,6 +38,7 @@ INC = -I$(CUDA)/include -I$(GEN)/include $(GLB)
 #INC += -I$(MPI)/include
 LIB = -L$(CUDA)/lib64 -L$(GEN)/lib -lc -lstdc++ -lcuda -lcudart -lcudadevrt 
 #LIB += -L$(MPI)/lib
+#NVCC = $(CUDA)/bin/nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # 
 NVCC = nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # 
 MAXREG = # --maxrregcount=82 --ptxas-options=-v
 ifeq ($(DBG),)
@@ -104,3 +105,6 @@ $(OBJ)cuda_link.o: $(OBJ_CUDA)
 
 clean:
 		rm -rf $(TARGET) $(OBJ)*.o
+
+
+
diff --git a/src/boundary.h b/src/boundary.h
index 3fb8755..8c9e4dc 100644
--- a/src/boundary.h
+++ b/src/boundary.h
@@ -12,7 +12,9 @@ extern __device__ __forceinline__ void perBCx(myprec *s_f, int si);
 extern __device__ __forceinline__ void perBCy(myprec *s_f, int si);
 extern __device__ __forceinline__ void perBCz(myprec *s_f, int si);
 extern __device__ __forceinline__ void perBCzBot(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void perBCyBot(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void perBCzTop(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void perBCyTop(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void haloBCy(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void haloBCz(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void haloBCzBot(myprec *s_f, myprec *f, int si, Indices id);
@@ -33,7 +35,9 @@ extern __device__ __forceinline__ void botBCxExt(myprec *s_f, int si, myprec Bcb
 extern __device__ __forceinline__ void topBCzExt(myprec *s_f, int si);
 extern __device__ __forceinline__ void botBCzExt(myprec *s_f, int si);
 extern __device__ __forceinline__ void botBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
 
 __device__ __forceinline__ __attribute__((always_inline)) void perBCx(myprec *s_f, int si) {
 	s_f[si-stencilSize]  = s_f[si+mx-stencilSize];
@@ -54,10 +58,20 @@ __device__ __forceinline__ __attribute__((always_inline)) void perBCzBot(myprec
 	s_f[si-stencilSize]  = f[idx(id.i,id.j,id.k+mz-stencilSize)];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void perBCyBot(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize]  = f[idx(id.i,id.j+my-stencilSize,id.k)];
+}
+
+
 __device__ __forceinline__ __attribute__((always_inline)) void perBCzTop(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si+mz/nDivZ]     = f[idx(id.i,id.j,si-stencilSize)];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void perBCyTop(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+my/nDivY]     = f[idx(id.i,si-stencilSize, id.k)];
+}
+
+
 __device__ __forceinline__ __attribute__((always_inline)) void haloBCy(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si-stencilSize]  = f[mx*my*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize];
 	s_f[si+my]           = f[mx*my*mz + stencilSize*mx*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize];
@@ -183,9 +197,17 @@ __device__ __forceinline__ __attribute__((always_inline)) void botBCzCpy(myprec
 s_f[si-stencilSize] = f[idx(id.i,id.j,id.k-stencilSize)];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
+s_f[si-stencilSize] = f[idx(id.i,id.j-stencilSize,id.k)];
+}
+
 __device__ __forceinline__ __attribute__((always_inline)) void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id) {
 s_f[si+mz/nDivZ] = f[idx(id.i,id.j,id.k+mz/nDivZ)];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
+s_f[si+my/nDivY] = f[idx(id.i,id.j+my/nDivY,id.k)];
+}
+
 
 #endif /* BOUNDARY_H_ */
diff --git a/src/boundary_condition_y.h b/src/boundary_condition_y.h
index 419b18c..cade0f3 100644
--- a/src/boundary_condition_y.h
+++ b/src/boundary_condition_y.h
@@ -8,52 +8,125 @@ extern __device__ __forceinline__ void BCyNumber1(myprec *s_u, myprec *s_v, mypr
 												  myprec *s_m, myprec *s_dil,
 												  myprec *u, myprec *v, myprec *w,
 												  myprec *m, myprec *dil,
-												  Indices id, int si, int n);
+												  Indices id, int si, int n, int jNum);
 
-extern __device__ __forceinline__ void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m);
+extern __device__ __forceinline__ void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m, int jNum);
 
 extern __device__ __forceinline__ void BCyNumber3(myprec *s_f, myprec *s_g,
 												  myprec *f,   myprec *g,
-												  Indices id, int si, int n);
+												  Indices id, int si, int n, int jNum);
 
 __device__ __forceinline__ __attribute__((always_inline)) void BCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
 																		  myprec *s_m, myprec *s_dil,
 																		  myprec *u, myprec *v, myprec *w,
 																		  myprec *m, myprec *dil,
-																		  Indices id, int si, int n) {
-	if (id.j < stencilSize) {
-		if(multiGPU) {
+																		  Indices id, int si, int n, int jNum) {
+	if (id.tiy < stencilSize) {
+	    if(nDivY==1) {
+		     if(multiGPU) {
 			haloBCy(s_u,u,si,id); haloBCy(s_v,v,si,id); haloBCy(s_w,w,si,id);
 			haloBCy(s_m,m,si,id); haloBCy(s_dil,dil,si,id);
-		} else {
+		     } else {
 			perBCy(s_u,si);	perBCy(s_v,si); perBCy(s_w,si);
 			perBCy(s_m,si); perBCy(s_dil,si);
 		}
+	   } 
+           else {
+			if(jNum==0) {
+				topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id);
+				topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id);
+				
+						perBCyBot(s_u,u,si,id);
+						perBCyBot(s_v,v,si,id);
+						perBCyBot(s_w,w,si,id);
+						perBCyBot(s_m,m,si,id);
+						perBCyBot(s_dil,dil,si,id);
+				
+				
+			} else if (jNum==nDivY-1) {
+				botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id);
+				botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id);
+				
+						perBCyTop(s_u,u,si,id);
+						perBCyTop(s_v,v,si,id);
+						perBCyTop(s_w,w,si,id);
+						perBCyTop(s_m,m,si,id);
+						perBCyTop(s_dil,dil,si,id);
+				
+			} else {
+				topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id);
+				topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id);
+				botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id);
+				botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id);
+			}
+		}
 	}
+
+
+
+
 	__syncthreads();
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n) {
-	if (id.j < stencilSize) {
-		if(multiGPU) {
+__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int jNum) {
+	if (id.tiy < stencilSize) {
+	    if(nDivY==1) {
+		     if(multiGPU) {
 			haloBCy(s_p,p,si,id);
-		} else {
+		     } else {
 			perBCy(s_p,si);
 		}
+	   } 
+           else {
+			if(jNum==0) {
+				topBCyCpy(s_p,p,si,id); 
+			        perBCyBot(s_p,p,si,id);
+				
+				
+			} else if (jNum==nDivY-1) {
+				botBCyCpy(s_p,p,si,id); 
+						perBCyTop(s_p,p,si,id);
+				
+			} else {
+				topBCyCpy(s_p,p,si,id); botBCyCpy(s_p,p,si,id);
+			}
+		}
 	}
+
+
+
+
 	__syncthreads();
+
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void BCyNumber3(myprec *s_f, myprec *s_g,
 																		  myprec *f,   myprec *g,
-																		  Indices id, int si, int n) {
-	if (id.j < stencilSize) {
-		if(multiGPU) {
-			haloBCy(s_f,f,si,id); haloBCy(s_g,g,si,id);
-		} else {
-			perBCy(s_f,si); perBCy(s_g,si);
+																		  Indices id, int si, int n, int jNum) {
+	if (id.tiy < stencilSize) {
+	    if(nDivY==1) {
+		     if(multiGPU) {
+			haloBCy(s_f,f,si,id); haloBCy(s_g,g,si,id); 
+		     } else {
+			perBCy(s_f,si);	perBCy(s_g,si); 
+		}
+	   } 
+           else {
+			if(jNum==0) {
+				topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); 						perBCyBot(s_f,f,si,id);
+						perBCyBot(s_g,g,si,id);
+				
+				
+			} else if (jNum==nDivY-1) {
+				botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id); 						perBCyTop(s_f,f,si,id);
+						perBCyTop(s_g,g,si,id);
+				
+			} else {
+				topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); 				botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id);
+			}
 		}
 	}
+
 	__syncthreads();
 }
 
diff --git a/src/boundary_condition_z.h b/src/boundary_condition_z.h
index 75cff94..11f0769 100644
--- a/src/boundary_condition_z.h
+++ b/src/boundary_condition_z.h
@@ -83,7 +83,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber1(myprec
 																		  myprec *m, myprec *dil,
 																		  Indices id, int si, int n, int kNum) {
 
-	if (id.tix < stencilSize) {
+	if (id.tiy < stencilSize) {
 		if(nDivZ==1) {
 			if(multiGPU) {
 				haloBCz(s_u,u,si,id); haloBCz(s_v,v,si,id); haloBCz(s_w,w,si,id);
@@ -164,7 +164,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber1(myprec
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int kNum) {
-	if (id.tix < stencilSize) {
+	if (id.tiy < stencilSize) {
 		if(nDivZ==1) {
 			if(multiGPU) {
 				haloBCz(s_p,p,si,id);
@@ -212,7 +212,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber2(myprec
 __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber3(myprec *s_l, myprec *s_t,
 																		  myprec *l,   myprec *t,
 																		  Indices id, int si, int n, int kNum) {
-	if (id.tix < stencilSize) {
+	if (id.tiy < stencilSize) {
 		if(nDivZ==1) {
 			if(multiGPU) {
 				haloBCz(s_l,l,si,id); haloBCz(s_t,t,si,id);
@@ -273,7 +273,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber3(myprec
 __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber4(myprec *s_r, myprec *s_h,
 																		  myprec *r,   myprec *h,
 																		  Indices id, int si, int n, int kNum) {
-	if (id.tix < stencilSize) {
+	if (id.tiy < stencilSize) {
 		if(nDivZ==1) {
 			if(multiGPU) {
 				haloBCz(s_h,h,si,id); haloBCz(s_r,r,si,id);
diff --git a/src/calc_stress.cu b/src/calc_stress.cu
index 9cf9a49..5fcff95 100644
--- a/src/calc_stress.cu
+++ b/src/calc_stress.cu
@@ -6,6 +6,8 @@
 #include "cuda_globals.h"
 #include "cuda_math.h"
 #include "boundary_condition_x.h"
+#include "boundary_condition_y.h"
+#include "boundary_condition_z.h"
 #include "comm.h"
 #include "sponge.h"
 
@@ -14,7 +16,7 @@ __global__ void deviceAdvanceTime(myprec *dt) {
 }
 
 __global__ void deviceCalcPress(myprec *a, myprec *b, myprec *c) {
-	*a = 0.99*(*b) - 0.5*(*a/(*c)-1);
+	*a = 0.99*(*b) - 0.5*( *a/(*c) - 1 ); //*a/(*c)-1 //a here is rw_bulk // changed from rw_bulk/r_bulk to rw_bulk
 }
 
 __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *dvdx, myprec *dwdx) {
@@ -46,20 +48,105 @@ __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *d
 
 __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy) {
 
-	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-
-	derDevV1yL(dudy,u,id);
+       Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+       int jNum = blockIdx.z;
+
+       id.mkidYFlx(jNum);
+
+       int tid = id.tix + id.bdx*id.tiy;
+
+       int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+       int sj = id.tix;                   // local j for shared memory access
+       int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+       int sj1 = tid/id.bdy;
+
+       Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+       id1.mkidYFlx(jNum);
+
+       myprec wrk1=0;
+       myprec wrk2=0;
+       //myprec wrk3=0;
+
+      __shared__ myprec s_u[mx/nDivX][my/nDivY+stencilSize*2];
+      __shared__ myprec s_v[mx/nDivX][my/nDivY+stencilSize*2];
+      //__shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2];
+      s_u[sj][si] = u[id.g];
+      s_v[sj][si] = v[id.g];
+      //s_w[sj][si] = w[id.g];
+      __syncthreads();
+      BCyNumber2(s_u[sj],u,id,si,my,jNum);
+      BCyNumber2(s_v[sj],v,id,si,my,jNum);
+      //BCyNumber2(s_w[sj],w,id,si,my,jNum);
+      __syncthreads();
+// COMP Tile sj1 and si1
+      derDevSharedV1y(&wrk1,s_u[sj1],si1);
+      derDevSharedV1y(&wrk2,s_v[sj1],si1);
+      //derDevSharedV1y(&wrk3,s_w[sj1],si1);
+      __syncthreads();      
+      s_u[sj1][si1] = wrk1;
+      s_v[sj1][si1] = wrk2;
+     // s_w[sj1][si1] = wrk3;
+      __syncthreads();       
+//MEM tile sj si      
+      dudy[id.g] = s_u[sj][si]; 
+      dvdy[id.g] = s_v[sj][si];
+      //dwdy[id.g] = s_w[sj][si];
+      __syncthreads();
+
+/*	derDevV1yL(dudy,u,id);
 	derDevV1yL(dvdy,v,id);
-	derDevV1yL(dwdy,w,id);
+	//derDevV1yL(dwdy,w,id);*/
 }
 
-__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int kNum) {
-
-	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-
-	derDevV1zL(dudz,u,uInit,id,kNum);
+__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz) {
+
+       Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+       int kNum = blockIdx.z;
+
+       id.mkidZFlx(kNum);
+
+       int tid = id.tix + id.bdx*id.tiy;
+
+       int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+       int sj = id.tix;                   // local j for shared memory access
+       int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+       int sj1 = tid/id.bdy;
+
+       Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+       id1.mkidZFlx(kNum);
+
+       myprec wrk1=0;
+       myprec wrk2=0;
+       myprec wrk3=0;
+
+      __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2];
+      __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2];
+      __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2];
+      s_u[sj][si] = u[id.g];
+      s_v[sj][si] = v[id.g];
+      s_w[sj][si] = w[id.g];
+      __syncthreads();
+      BCzNumber2(s_u[sj],u,id,si,mz,kNum);
+      BCzNumber2(s_v[sj],v,id,si,mz,kNum);
+      BCzNumber2(s_w[sj],w,id,si,mz,kNum);
+      __syncthreads();
+// COMP Tile sj1 and si1
+      derDevSharedV1z(&wrk1,s_u[sj1],si1);
+      derDevSharedV1z(&wrk2,s_v[sj1],si1);
+      derDevSharedV1z(&wrk3,s_w[sj1],si1);
+      __syncthreads();      
+      s_u[sj1][si1] = wrk1;
+      s_v[sj1][si1] = wrk2;
+      s_w[sj1][si1] = wrk3;
+      __syncthreads();       
+//MEM tile sj si      
+      dudz[id.g] = s_u[sj][si]; 
+      dvdz[id.g] = s_v[sj][si];
+      dwdz[id.g] = s_w[sj][si];
+      __syncthreads();
+	/*derDevV1zL(dudz,u,uInit,id,kNum);
 	derDevV1zL(dvdz,v,vInit,id,kNum);
-	derDevV1zL(dwdz,w,wInit,id,kNum);
+	derDevV1zL(dwdz,w,wInit,id,kNum);*/
 }
 
 __global__ void derVelYBC(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy, int direction) {
@@ -112,7 +199,7 @@ void calcPressureGrad(myprec *dpdz, myprec *r, myprec *w, Communicator rk) {
 	deviceMul<<<gr0,bl0>>>(workA,r,w);
 	hostVolumeIntegral(dpdz,workA,rk);
 
-	deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk);
+	deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk); //I changed here from rbulk to workA. // i CHANGED TO 1
 
 	checkCuda( cudaFree(workA) );
 	checkCuda( cudaFree(dpdz_prev) );
@@ -159,15 +246,17 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v,
 
 }
 
-void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Communicator rk) {
+void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *dtC, myprec *dpdz, int file, int istep, Communicator rk) {
 
 	cudaSetDevice(rk.nodeRank);
 
 	myprec *workA, *rbulk;
 	myprec *hostWork = (myprec*)malloc(sizeof(myprec));
+
 	dim3 gr0  = dim3(my / sPencils, mz, 1);
 	dim3 bl0 = dim3(mx, sPencils, 1);
 
+
 	checkCuda( cudaMalloc((void**)&workA,mx*my*mz*sizeof(myprec)) );
 	checkCuda( cudaMalloc((void**)&rbulk    ,     sizeof(myprec)) );
 
@@ -176,6 +265,8 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre
 		checkCuda( cudaMemcpy(hostWork, rbulk, sizeof(myprec), cudaMemcpyDeviceToHost) );
 		allReduceSum(hostWork,1);
 		checkCuda( cudaMemcpy(rbulk, hostWork, sizeof(myprec), cudaMemcpyHostToDevice) );
+		if(rk.rank==0) printf("step number %d with %le %le %3.10le\n",nsteps*(file-1) + istep ,*dtC,*dpdz, *hostWork);
+
 
 		deviceMul<<<gr0,bl0>>>(workA,r,w);
 		hostVolumeIntegral(par1,workA,rk);
@@ -198,4 +289,7 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre
 	free(hostWork);
 	checkCuda( cudaFree(workA) );
 	checkCuda( cudaFree(rbulk) );
+
+
+
 }
diff --git a/src/cuda_derivs.h b/src/cuda_derivs.h
index fe8670f..ad8040f 100644
--- a/src/cuda_derivs.h
+++ b/src/cuda_derivs.h
@@ -35,12 +35,10 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedx(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1]);
-		}
-
+	for (int lt=1; lt<stencilSize+1; lt++) {
+                        flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
+        }
 	*df = 0.5*d_dx*(flxm - flxp);
 
 #if nonUniformX
@@ -59,12 +57,11 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedx(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt])*(s_h[si-mt]+s_h[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1])*(s_h[si-mt-1]+s_h[si-mt+lt-1]);
-		}
+	for (int lt=1; lt<stencilSize+1; lt++) {
 
+			flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt])*(s_h[si]+s_h[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt])*(s_h[si]+s_h[si-lt]);
+        }
 	*df = 0.25*d_dx*(flxm - flxp);
 
 #if nonUniformX
@@ -83,12 +80,10 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedy(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1]);
-		}
-
+	for (int lt=1; lt<stencilSize+1; lt++) {
+                        flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
+        }
 	*df = 0.5*d_dy*(flxm - flxp);
 
 	__syncthreads();
@@ -103,12 +98,11 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedy(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt])*(s_h[si-mt]+s_h[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1])*(s_h[si-mt-1]+s_h[si-mt+lt-1]);
-		}
+	for (int lt=1; lt<stencilSize+1; lt++) {
 
+			flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt])*(s_h[si]+s_h[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt])*(s_h[si]+s_h[si-lt]);
+        }
 	*df = 0.25*d_dy*(flxm - flxp);
 
 	__syncthreads();
@@ -123,12 +117,15 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedz(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1]);
-		}
+	for (int lt=1; lt<stencilSize+1; lt++) {
+		//for (int mt=0; mt<lt; mt++) {
+			//flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt]);
+			//flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1]);
+                        flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
 
+		//}
+                 }
 	*df = 0.5*d_dz*(flxm - flxp);
 
 	__syncthreads();
@@ -143,12 +140,17 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedz(m
 	flxm = 0.0;
 	__syncthreads();
 
-	for (int lt=1; lt<stencilSize+1; lt++)
-		for (int mt=0; mt<lt; mt++) {
-			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt])*(s_h[si-mt]+s_h[si-mt+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1])*(s_h[si-mt-1]+s_h[si-mt+lt-1]);
-		}
+	for (int lt=1; lt<stencilSize+1; lt++) {
+//		for (int mt=0; mt<lt; mt++) {
+//			flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt])*(s_h[si-mt]+s_h[si-mt+lt]);
+//			flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1])*(s_h[si-mt-1]+s_h[si-mt+lt-1]);
+
+			flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt])*(s_h[si]+s_h[si+lt]);
+			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt])*(s_h[si]+s_h[si-lt]);
+
 
+//		}
+                }  
 	*df = 0.25*d_dz*(flxm - flxp);
 
 	__syncthreads();
diff --git a/src/cuda_functions.h b/src/cuda_functions.h
index 589e497..a961cae 100644
--- a/src/cuda_functions.h
+++ b/src/cuda_functions.h
@@ -39,18 +39,21 @@ class Indices {
        g = i + j*mx + k*mx*my;
     }
 
-    __device__ __host__ void mkidYFlx() {
-        i  = bix*bdy + tiy;
-        j  = tix;
+    __device__ __host__ void mkidYFlx(int jNum) {
+        //CHANGED
+        i  = bix*bdx + tix;
+        j  = tiy + jNum*my/nDivY;
         k  = biy;
         g = i + j*mx + k*mx*my;
      }
 
     __device__ __host__ void mkidZFlx(int kNum) {
-        i  = bix*bdy + tiy;
+  //changed
+        i  = bix*bdx + tix;
         j  = biy;
-        k  = tix + kNum*mz/nDivZ;
+        k  = tiy + kNum*mz/nDivZ;
         g = i + j*mx + k*mx*my;
+
      }
 
     __device__ __host__ void mkidYBC(int dir) {
@@ -76,7 +79,7 @@ class Indices {
 
 void calcTimeStep(myprec *dt, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *mu, Communicator rk);
 void calcPressureGrad(myprec *dpdx, myprec *r, myprec *w, Communicator rk);
-void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Communicator rk);
+void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *dtC , myprec *dpdz ,int file, int istep , Communicator rk);
 
 #include "boundary_condition_z.h"
 #include "cuda_derivs.h"
@@ -86,21 +89,21 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
-		myprec *dil, myprec *dpdz, int iNum);
+		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz, int iNum);
 __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY,
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz,
-		myprec *dil, myprec *dpdz, int jNum);
+		myprec *dil, myprec *dpdz);
 __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ,
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
-		myprec *dil, myprec *dpdz, int kNum);
+		myprec *dil, myprec *dpdz);
 
 __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *dvdx, myprec *dwdx);
 __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy);
-__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int kNum);
+__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz);
 __global__ void derVelYBC(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy, int direction);
 __global__ void derVelZBC(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int direction);
 __global__ void calcDil(myprec *dil, myprec *dudx, myprec *dvdy, myprec *dwdz);
diff --git a/src/cuda_main.cu b/src/cuda_main.cu
index 5f413c1..4f04436 100644
--- a/src/cuda_main.cu
+++ b/src/cuda_main.cu
@@ -7,6 +7,7 @@
 #include "sponge.h"
 
 
+
 __device__ __constant__ myprec alpha[] = {0.    , -17./60., -5./12.};
 __device__ __constant__ myprec beta[]  = {8./15.,   5./12.,  3./4. };
 
@@ -14,10 +15,10 @@ cudaStream_t s[8+nDivZ];
 
 inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk) {
 	calcState<<<grid0,block0,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,0); //here 0 means interior points
-	derVelX<<<d_grid[0],d_block[0],0,s[1]>>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]);
+	//derVelX<<<d_grid[0],d_block[0],0,s[1]>>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]);
 	derVelY<<<d_grid[3],d_block[3],0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]);
-	for (int kNum=0; kNum<nDivZ; kNum++)
-		derVelZ<<<d_grid[4],d_block[4],0,s[8+kNum]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],kNum);
+	//for (int kNum=0; kNum<1; kNum++) //CHANGED!!!!!
+	derVelZ<<<d_grid[4],d_block[4],0,s[8]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]);
 	if(multiGPU) {
 		updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); cudaDeviceSynchronize();
 		calcState<<<gridHalo,blockHalo,0,s[4]>>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,1); //here 1 means halo points
@@ -27,33 +28,34 @@ inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, mypr
 		derVelZBC<<<gridHaloZ,blockHaloZ,0,s[3]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1);	//here 1 means upper boundary (mz-index)
 	}
 	cudaDeviceSynchronize();
-	calcDil<<<grid0,block0>>>(d_dil,gij[0],gij[4],gij[8]);
+	//calcDil<<<grid0,block0>>>(d_dil,gij[0],gij[4],gij[8]);
 	cudaDeviceSynchronize();
 	if(multiGPU) deviceBlocker<<<grid0,block0,0,s[0]>>>();   //in order to hide the halo update with deviceRHSX (on stream s[0])
-	deviceRHSX<<<d_grid[0],d_block[0],0,s[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],d_dil,dpdz,0);
+	deviceRHSX<<<d_grid[0],d_block[0],0,s[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz,0);
 	if(multiGPU) updateHalo(d_dil,rk);
 	cudaDeviceSynchronize();
-	deviceRHSY<<<d_grid[1],d_block[1]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz,0);
-	for (int kNum=0; kNum<nDivZ; kNum++)
-		deviceRHSZ<<<d_grid[2],d_block[2],0,s[kNum]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz,kNum);
+	deviceRHSY<<<d_grid[1],d_block[1]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz);
+        cudaDeviceSynchronize();	
+        deviceRHSZ<<<d_grid[2],d_block[2],0,s[8]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz);
 	cudaDeviceSynchronize();
 	if(boundaryLayer) addSponge<<<d_grid[0],d_block[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e);
 	cudaDeviceSynchronize();
 }
 
-void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) {
-
-	cudaSetDevice(rk.nodeRank);
-
-	myprec h_dt,h_dpdz;
-
-	for (int istep = 0; istep < nsteps; istep++) {
-		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
-		if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
-		if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
-		deviceAdvanceTime<<<1,1>>>(dtC);
-		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);
-
+void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communicator rk) {
+
+          cudaSetDevice(rk.nodeRank);
+          myprec h_dt,h_dpdz;
+  
+          for (int istep = 0; istep < nsteps; istep++) {
+  
+          	if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
+          	if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
+          	if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
+          	deviceAdvanceTime<<<1,1>>>(dtC);
+          	if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk);
+		
+		
 		deviceMul<<<grid0,block0,0,s[0]>>>(d_uO,d_r,d_u);
 		deviceMul<<<grid0,block0,0,s[1]>>>(d_vO,d_r,d_v);
 		deviceMul<<<grid0,block0,0,s[2]>>>(d_wO,d_r,d_w);
@@ -64,12 +66,15 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) {
 
 		//runge kutta step 1
 		calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
-		eulerSum<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,dtC);
-		eulerSum<<<grid0,block0>>>(d_e,d_eO,d_rhse1,dtC);
-		cudaDeviceSynchronize();
-		eulerSumR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_r,dtC);
-		eulerSumR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_r,dtC);
-		eulerSumR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_r,dtC);
+		//eulerSum<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,dtC);
+		//eulerSum<<<grid0,block0>>>(d_e,d_eO,d_rhse1,dtC);
+
+                eulerSumAll<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
+                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC);
+		//cudaDeviceSynchronize();
+		//eulerSumR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_r,dtC);
+		//eulerSumR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_r,dtC);
+		//eulerSumR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_r,dtC);
 		cudaDeviceSynchronize();
 
 		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
@@ -80,12 +85,14 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) {
 
 		//runge kutta step 2
 		calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk);
-		eulerSum3<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC);
-		eulerSum3<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,dtC);
-		cudaDeviceSynchronize();
-		eulerSum3R<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC);
-		eulerSum3R<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC);
-		eulerSum3R<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC);
+		//eulerSum3<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC);
+		//eulerSum3<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,dtC);
+		eulerSumAll2<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
+                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC);
+                //cudaDeviceSynchronize();
+		//eulerSum3R<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC);
+		//eulerSum3R<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC);
+		//eulerSum3R<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC);
 		cudaDeviceSynchronize();
 
 		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
@@ -96,12 +103,16 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) {
 
 		//runge kutta step 3
 		calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk);
-		rk3final<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC);
-		rk3final<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC);
-		cudaDeviceSynchronize();
-		rk3finalR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC);
-		rk3finalR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC);
-		rk3finalR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC);
+//		rk3final<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC);
+//		rk3final<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC);
+		eulerSumAll3<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
+                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2,
+                            d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC);
+
+//		cudaDeviceSynchronize();
+//		rk3finalR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC);
+//		rk3finalR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC);
+//		rk3finalR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC);
 		cudaDeviceSynchronize();
 	}
 }
@@ -113,12 +124,11 @@ void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communica
 	myprec h_dt,h_dpdz;
 
 	for (int istep = 0; istep < nsteps; istep++) {
-		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
+/*		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
 		if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
 		if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
 		deviceAdvanceTime<<<1,1>>>(dtC);
-		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);
-
+		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);*/
 
 		//Starting the Runge-Kutta Steps
 
@@ -185,6 +195,55 @@ void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communica
 	}
 }
 
+__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1
+                            ,myprec *dt) {
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+        id.mkidX();
+        myprec tmp1 = 0;
+	tmp1 = (r0[id.g] + rhsr1[id.g]*(*dt)/2.0);
+        r[id.g] = tmp1;
+	u[id.g] = (u0[id.g] + rhsu1[id.g]*(*dt)/2.0)/tmp1;
+	v[id.g] = (v0[id.g] + rhsv1[id.g]*(*dt)/2.0)/tmp1;
+	w[id.g] = (w0[id.g] + rhsw1[id.g]*(*dt)/2.0)/tmp1;
+        e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0);
+
+}
+
+
+__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) {
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+        id.mkidX();
+        myprec tmp1 = 0;
+	tmp1 =    ( r0[id.g] + (2*rhsr2[id.g] - rhsr1[id.g])*(*dt) );
+        r[id.g] = tmp1;
+	u[id.g] = ( u0[id.g] + (2*rhsu2[id.g] - rhsu1[id.g])*(*dt) )/tmp1;
+	v[id.g] = ( v0[id.g] + (2*rhsv2[id.g] - rhsv1[id.g])*(*dt) )/tmp1;
+	w[id.g] = ( w0[id.g] + (2*rhsw2[id.g] - rhsw1[id.g])*(*dt) )/tmp1;
+        e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) );
+
+}
+
+
+__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2,
+                            myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) {
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+        id.mkidX();
+        myprec tmp1 = 0;
+	tmp1    =    r0[id.g] + (*dt)*( rhsr1[id.g] + 4*rhsr2[id.g] + rhsr3[id.g])/6.;
+        r[id.g] = tmp1;
+	u[id.g] = (  u0[id.g] + (*dt)*( rhsu1[id.g] + 4*rhsu2[id.g] + rhsu3[id.g])/6.)/tmp1;
+	v[id.g] = (  v0[id.g] + (*dt)*( rhsv1[id.g] + 4*rhsv2[id.g] + rhsv3[id.g])/6.)/tmp1;
+	w[id.g] = (  w0[id.g] + (*dt)*( rhsw1[id.g] + 4*rhsw2[id.g] + rhsw3[id.g])/6.)/tmp1;
+        e[id.g] = (  e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.);
+
+}
+
+
 __global__ void eulerSum(myprec *a, myprec *b, myprec *c, myprec *dt) {
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
 	a[id.g] = (b[id.g] + c[id.g]*(*dt)/2.0);
@@ -217,7 +276,7 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec *
 
 __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc) {
 
-	int threadsPerBlock  = blockDim.x * blockDim.y;
+/*	int threadsPerBlock  = blockDim.x * blockDim.y;
 	int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
 	int blockNumInGrid   = blockIdx.x  + gridDim.x  * blockIdx.y;
 
@@ -237,6 +296,34 @@ __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel,
     myprec suth = pow(tem[gl],viscexp);
     mu[gl]      = suth/Re;
     lam[gl]     = suth/Re/Pr/Ec;
+    __syncthreads();*/
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidX();
+    int gl = id.g;
+    if(bc==1) gl += mx*my*mz;
+
+    myprec cvInv = (gam - 1.0)/Rgas;
+    myprec r = rho[gl];
+    myprec u = uvel[gl];
+    myprec v = vvel[gl];
+    myprec w = wvel[gl];
+    myprec etot = ret[gl];
+
+    myprec invrho = 1.0/r;
+    myprec en = etot*invrho - 0.5*(u*u + v*v + w*w);
+
+    myprec  t   = cvInv*en;
+    myprec  p   = r*Rgas*t; //tmp1 is temperature // tmp2 is pressure    
+    tem[gl]     = t;
+    pre[gl]     = p;
+
+    ht[gl]      = (etot + p)*invrho;
+
+    myprec suth = pow(t,viscexp);
+    mu[gl]      = suth/Re;
+    lam[gl]     = suth/Re/Pr/Ec;
+
     __syncthreads();
 
 }
@@ -255,13 +342,13 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m
 	mpiBarrier();
 	cudaMemcpy(dtC , h_dt  , sizeof(myprec), cudaMemcpyHostToDevice);
 	if(forcing) {
-		calcPressureGrad(dpdz,d_r,d_w,rk);
+		calcPressureGrad(dpdz,d_r,d_w,rk); //Changed here dpdz_new = dpdz_old - 0.5*(rw_bulk - 1 )
 		cudaMemcpy(h_dpdz, dpdz, sizeof(myprec), cudaMemcpyDeviceToHost);
 		allReduceArray(h_dpdz,1);
 		mpiBarrier();
 		cudaMemcpy(dpdz, h_dpdz, sizeof(myprec), cudaMemcpyHostToDevice);
 	}
-	if(rk.rank==0) printf("step number %d with %le %le\n",istep,*h_dt,*h_dpdz);
+	//if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file) + istep ,*h_dt,*h_dpdz);
 }
 
 void solverWrapper(Communicator rk) {
@@ -283,20 +370,36 @@ void solverWrapper(Communicator rk) {
     //check the memory usage of the GPU
     checkGpuMem(rk);
 
+
+	/*derVelX<<<d_grid[0],d_block[0],0,s[1]>>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]);
+	derVelY<<<d_grid[3],d_block[3],0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]);
+	derVelZ<<<d_grid[4],d_block[4],0,s[8]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]);
+	calcDil<<<grid0,block0>>>(d_dil,gij[0],gij[4],gij[8]);*/ // In Y and Z derivatives are turned off. so need to fix that or create a new kernel.
+
+
+
     if(restartFile<0) {
     	start=0;
     } else {
     	start=restartFile;
     }
+    for(int file = start+1; file<nfiles+start+1; file++) {  
+        
+        if(rk.rank==0) fp = fopen("solution.txt","w+");
 
-    if(rk.rank==0) fp = fopen("solution.txt","w+");
-    for(int file = start+1; file<nfiles+start+1; file++) {
+        /////Time----------------------------------------------------------------------------------
+	float tm;
+	cudaEvent_t start1, stop; checkCuda( cudaEventCreate(&start1) ); checkCuda( cudaEventCreate(&stop) ); checkCuda( cudaEventRecord(start1, 0) );
 
-    	if(lowStorage) {
+	if(lowStorage) {
         	runSimulationLowStorage(dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
     	} else {
-    		runSimulation(dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
-    	}
+    		runSimulation(file, dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
+          	}
+       
+	checkCuda( cudaEventRecord(stop, 0) );	checkCuda( cudaEventSynchronize(stop) );	checkCuda( cudaEventElapsedTime(&tm, start1, stop) );
+	printf("Time for runSimulation  %3.4f s \n", tm/nsteps/1000);
+        // EndTime---------------------------------------------------------------------------------
 
     	copyField(1,rk);					  //copying back partial results to CPU
 
diff --git a/src/cuda_main.h b/src/cuda_main.h
index 7978223..5091630 100644
--- a/src/cuda_main.h
+++ b/src/cuda_main.h
@@ -63,6 +63,19 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m
 __device__ myprec time_on_GPU;
 __device__ Communicator rkGPU;
 
+__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+                            myprec *dt);
+
+__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt);
+
+__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
+                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2,
+                            myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt);
+
 __global__ void eulerSum(myprec *a, myprec *b, myprec *c, myprec *dt);
 __global__ void eulerSumR(myprec *a, myprec *b, myprec *c, myprec *r, myprec *dt);
 
diff --git a/src/cuda_rhs.cu b/src/cuda_rhs.cu
index 0e1a015..225f99b 100644
--- a/src/cuda_rhs.cu
+++ b/src/cuda_rhs.cu
@@ -10,12 +10,12 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
-		myprec *dil, myprec *dpdz, int iNum) {
+		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz, int iNum) {
 
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
 	id.mkidX();
 
-	int si = id.i + stencilSize;       // local i for shared memory access + halo offset
+	int si = id.tix + stencilSize;       // local i for shared memory access + halo offset
 	int sj = id.tiy;                   // local j for shared memory access
 
 	myprec rXtmp=0;
@@ -26,14 +26,16 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 
 	myprec wrk1=0;
 	myprec wrk2=0;
+	myprec wrk3=0;
 
-	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_t[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_p[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_prop1[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_prop2[sPencils][mx+stencilSize*2];
+        
+	__shared__ myprec s_u[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_v[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_w[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_t[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_p[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_prop1[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_prop2[sPencils/2][mx+stencilSize*2];
 
 	s_u[sj][si] = u[id.g];
 	s_v[sj][si] = v[id.g];
@@ -49,27 +51,37 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 	BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
 	__syncthreads();
 
+	derDevSharedV1x(&wrk1,s_u[sj],si);
+	derDevSharedV1x(&wrk2,s_v[sj],si);
+	derDevSharedV1x(&wrk3,s_w[sj],si);
+
+        dudx[id.g] = wrk1;
+        dvdx[id.g] = wrk2;
+        dwdx[id.g] = wrk3;
+
+        rXtmp  = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable.
+        dil[id.g] = rXtmp; 
 	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
-	uXtmp = ( 2 * dudx[id.g] - 2./3.*dil[id.g] );
-	vXtmp = (     dvdx[id.g] + dudy[id.g]  );
-	wXtmp = (     dwdx[id.g] + dudz[id.g]  );
+	uXtmp = ( 2 * wrk1 - 2./3.*rXtmp ); //rXtmp stores dilatation
+	vXtmp = (     wrk2 + dudy[id.g]  );
+	wXtmp = (     wrk3 + dudz[id.g]  );
 
 	//adding the viscous dissipation part duidx*mu*six
-	eXtmp = s_prop1[sj][si]*(uXtmp*dudx[id.g] + vXtmp*dvdx[id.g] + wXtmp*dwdx[id.g]);
+	eXtmp = s_prop1[sj][si]*(uXtmp*wrk1 + vXtmp*wrk2 + wXtmp*wrk3);
 
 	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
 	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
-    uXtmp *= wrk2;
+        uXtmp *= wrk2;
 	vXtmp *= wrk2;
 	wXtmp *= wrk2;
 
 	// viscous fluxes derivative mu*d^2ui dx^2
 	derDevSharedV2x(&wrk1,s_u[sj],si);
 	uXtmp = uXtmp + wrk1*s_prop1[sj][si];
-	derDevSharedV2x(&wrk1,s_v[sj],si);
-	vXtmp = vXtmp + wrk1*s_prop1[sj][si];
-	derDevSharedV2x(&wrk1,s_w[sj],si);
-	wXtmp = wXtmp + wrk1*s_prop1[sj][si];
+	derDevSharedV2x(&wrk2,s_v[sj],si);
+	vXtmp = vXtmp + wrk2*s_prop1[sj][si];
+	derDevSharedV2x(&wrk3,s_w[sj],si);
+	wXtmp = wXtmp + wrk3*s_prop1[sj][si];
 
 	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
 	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
@@ -78,11 +90,11 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 	derDevSharedV2x(&wrk1,s_t[sj],si);
 	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
 	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
-	derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx
-	eXtmp = eXtmp + wrk1*wrk2;
+	derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx
+	eXtmp = eXtmp + wrk2*wrk3;
 
 	// pressure and dilation derivatives
-	s_prop2[sj][si] = dil[id.g];
+	s_prop2[sj][si] = rXtmp; // rXtmp stores dilatation temporarily
 	__syncthreads();
 
 	// these are the BCs for the dilatation
@@ -104,17 +116,17 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 	BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
 	__syncthreads();
 
-	fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si);
-	rXtmp = wrk1;
+	fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si);
+	rXtmp = wrk3;
 	__syncthreads();
 	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
 	uXtmp = uXtmp + wrk1;
 	__syncthreads();
-	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si);
-	vXtmp = vXtmp + wrk1;
+	fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si);
+	vXtmp = vXtmp + wrk2;
 	__syncthreads();
-	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si);
-	wXtmp = wXtmp + wrk1;
+	fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si);
+	wXtmp = wXtmp + wrk3;
 	__syncthreads();
 	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
 	eXtmp = eXtmp + wrk1;
@@ -131,13 +143,21 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz,
-		myprec *dil, myprec *dpdz, int jNum) {
+		myprec *dil, myprec *dpdz) {
 
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-	id.mkidYFlx();
+	int jNum = blockIdx.z;
+	id.mkidYFlx(jNum);
 
-	int si = id.j + stencilSize;       // local i for shared memory access + halo offset
-	int sj = id.tiy;                   // local j for shared memory access
+        int tid = id.tix + id.bdx*id.tiy;
+
+	int si  = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+	int sj  = id.tix;                   // local j for shared memory access
+        int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+        int sj1 = tid/id.bdy;
+
+       // Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+       // id1.mkidYFlx(jNum);
 
 	myprec rYtmp=0;
 	myprec uYtmp=0;
@@ -147,13 +167,16 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 
 	myprec wrk1=0;
 	myprec wrk2=0;
+	myprec wrk3=0;
 
-	__shared__ myprec s_u[sPencils][my+stencilSize*2];
-	__shared__ myprec s_v[sPencils][my+stencilSize*2];
-	__shared__ myprec s_w[sPencils][my+stencilSize*2];
-	__shared__ myprec s_prop[sPencils][my+stencilSize*2];
-	__shared__ myprec s_dil[sPencils][my+stencilSize*2];
+	__shared__ myprec s_u[mx/nDivX][my/nDivY+stencilSize*2];
+	__shared__ myprec s_v[mx/nDivX][my/nDivY+stencilSize*2];
+	__shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2];
+	__shared__ myprec s_prop[mx/nDivX][my/nDivY+stencilSize*2];
+	__shared__ myprec s_dil[mx/nDivX][my/nDivY+stencilSize*2];
+        __shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2];
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 	s_u[sj][si] = u[id.g];
 	s_v[sj][si] = v[id.g];
 	s_w[sj][si] = w[id.g];
@@ -163,87 +186,137 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 
 	// fill in periodic images in shared memory array
 	// these are boundary conditions for u,v,w,mu and dilatation
-	BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+
+	BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum);
 	__syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	derDevSharedV1y(&wrk1,s_u[sj1],si1); 
+	derDevSharedV1y(&wrk2,s_v[sj1],si1); 
+	derDevSharedV1y(&wrk3,s_w[sj1],si1); 
 
 	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
-	uYtmp = (    dudy[id.g] + dvdx[id.g]        );
-	vYtmp = (2 * dvdy[id.g] - 2./3.*s_dil[sj][si] );
-	wYtmp = (    dwdy[id.g] + dvdz[id.g]        );
+
+
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        s_tmp[sj][si] = dvdx[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+        uYtmp = (    wrk1 + s_tmp[sj1][si1]        );
+	vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] );
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        __syncthreads();
+        s_tmp[sj][si] = dvdz[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
+	wYtmp = (    wrk3 + s_tmp[sj1][si1]        );
+
+        /*__syncthreads();
+        s_tmp[sj1][si1] = wrk2;
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        dvdy[id.g] = s_tmp[sj][si];*/
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
+        __syncthreads();
+        s_tmp[sj1][si1] = wrk3;
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
+        dwdy[id.g] = s_tmp[sj][si];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
+      
+
 
 	//adding the viscous dissipation part duidy*mu*siy
-	eYtmp = s_prop[sj][si]*(uYtmp*dudy[id.g] + vYtmp*dvdy[id.g] + wYtmp*dvdz[id.g]);
+	eYtmp = s_prop[sj1][si1]*(uYtmp*wrk1 + vYtmp*wrk2 + wYtmp*wrk3);
 
 	//Adding here the terms d (mu) dy * syj;
-	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy
-    uYtmp *= wrk2;
+	derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dy
+        uYtmp *= wrk2;
 	vYtmp *= wrk2;
 	wYtmp *= wrk2;
 
 	// viscous fluxes derivative
-	derDevSharedV2y(&wrk1,s_u[sj],si);
-	uYtmp = uYtmp + wrk1*s_prop[sj][si];
-	derDevSharedV2y(&wrk1,s_v[sj],si);
-	vYtmp = vYtmp + wrk1*s_prop[sj][si];
-	derDevSharedV2y(&wrk1,s_w[sj],si);
-	wYtmp = wYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV2y(&wrk1,s_u[sj1],si1);
+	uYtmp = uYtmp + wrk1*s_prop[sj1][si1];
+	derDevSharedV2y(&wrk1,s_v[sj1],si1);
+	vYtmp = vYtmp + wrk1*s_prop[sj1][si1];
+	derDevSharedV2y(&wrk1,s_w[sj1],si1);
+	wYtmp = wYtmp + wrk1*s_prop[sj1][si1];
 
 	//adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy)
-	eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp;
+	eYtmp = eYtmp + s_u[sj1][si1]*uYtmp + s_v[sj1][si1]*vYtmp + s_w[sj1][si1]*wYtmp;
 
 	//dilation derivatives
-	derDevSharedV1y(&wrk2,s_dil[sj],si);
-	vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0;
-	eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si];
+	derDevSharedV1y(&wrk2,s_dil[sj1],si1);
+	vYtmp = vYtmp + s_prop[sj1][si1]*wrk2/3.0;
+	eYtmp = eYtmp + s_prop[sj1][si1]*wrk2/3.0*s_v[sj1][si1];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 
 	// pressure derivatives
+        __syncthreads();
 	s_dil[sj][si] = p[id.g];
 	__syncthreads();
 
 	// Boundary condition for pressure
-	BCyNumber2(s_dil[sj],p,id,si,my);
+	BCyNumber2(s_dil[sj],p,id,si,my, jNum);
 	__syncthreads();
 
-	derDevShared1y(&wrk1,s_dil[sj],si);
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	derDevShared1y(&wrk1,s_dil[sj1],si1);
 	vYtmp = vYtmp - wrk1;
 
+
 	// fourier terms
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        __syncthreads();   
 	s_prop[sj][si] = lam[id.g];
 	s_dil[sj][si]  = t[id.g];
 	__syncthreads();
 
 	// Boundary condition for temperature and thermal conductivity
-	BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my);
+	BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum);
 	__syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 
-	derDevSharedV2y(&wrk1,s_dil[sj],si);
-	eYtmp = eYtmp + wrk1*s_prop[sj][si];
-	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy
-	derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy
+	derDevSharedV2y(&wrk1,s_dil[sj1],si1);
+	eYtmp = eYtmp + wrk1*s_prop[sj1][si1];
+	derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dy
+	derDevSharedV1y(&wrk1,s_dil[sj1] ,si1); //wrk1 = d (t) dy
 	eYtmp = eYtmp + wrk1*wrk2;
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 	//Adding here the terms - d (ru phi) dy;
+	__syncthreads();
 	s_prop[sj][si] = r[id.g];
 	s_dil[sj][si]  = h[id.g];
 	__syncthreads();
 	// Boundary condition for density and enthalpy
-	BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my);
+	BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum);
 	__syncthreads();
-	fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si);
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	fluxQuadSharedy(&wrk1,s_prop[sj1],s_v[sj1],si1);
 	rYtmp = wrk1;
 	__syncthreads();
-	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si);
+	fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_u[sj1],si1);
 	uYtmp = uYtmp + wrk1;
 	__syncthreads();
-	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si);
+	fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_v[sj1],si1);
 	vYtmp = vYtmp + wrk1;
 	__syncthreads();
-	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si);
+	fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_w[sj1],si1);
 	wYtmp = wYtmp + wrk1;
 	__syncthreads();
-	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si);
+	fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_dil[sj1],si1);
 	eYtmp = eYtmp + wrk1;
 	__syncthreads();
+//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.      
+        s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ;
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 
 #if useStreams
 	rY[id.g] = rYtmp;
@@ -252,146 +325,209 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	wY[id.g] = wYtmp;
 	eY[id.g] = eYtmp;
 #else
-	rY[id.g] += rYtmp;
-	uY[id.g] += uYtmp;
-	vY[id.g] += vYtmp;
-	wY[id.g] += wYtmp;
-	eY[id.g] += eYtmp;
+	rY[id.g] += s_prop[sj][si];
+	uY[id.g] += s_u[sj][si];
+	vY[id.g] += s_v[sj][si];
+	wY[id.g] += s_w[sj][si];
+	eY[id.g] += s_dil[sj][si];
 #endif
 	__syncthreads();
 }
 
 __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ,
-		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
-		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
-		myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
-		myprec *dil, myprec *dpdz, int kNum) {
-
-	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-	id.mkidZFlx(kNum);
-
-	int si = id.tix + stencilSize;       // local i for shared memory access + halo offset
-	int sj = id.tiy;                   // local j for shared memory access
-
-	myprec rZtmp=0;
-	myprec uZtmp=0;
-	myprec vZtmp=0;
-	myprec wZtmp=0;
-	myprec eZtmp=0;
-
-	myprec wrk1=0;
-	myprec wrk2=0;
-
-	__shared__ myprec s_u[sPencils][mz/nDivZ+stencilSize*2];
-	__shared__ myprec s_v[sPencils][mz/nDivZ+stencilSize*2];
-	__shared__ myprec s_w[sPencils][mz/nDivZ+stencilSize*2];
-	__shared__ myprec s_prop[sPencils][mz/nDivZ+stencilSize*2];
-	__shared__ myprec s_dil[sPencils][mz/nDivZ+stencilSize*2];
-
-	s_u[sj][si] = u[id.g];
-	s_v[sj][si] = v[id.g];
-	s_w[sj][si] = w[id.g];
-	s_prop[sj][si] = mu[id.g];
-	s_dil[sj][si] = dil[id.g];
-	__syncthreads();
-
-	// fill in periodic images in shared memory array
-	// these are boundary conditions for u,v,w,mu and dilatation
-	BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum);
-	__syncthreads();
-
-	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
-	uZtmp = (    dudz[id.g] + dwdx[id.g]        );
-	vZtmp = (    dvdz[id.g] + dwdy[id.g]        );
-	wZtmp = (2 * dwdz[id.g] - 2./3.*s_dil[sj][si] );
-
-	//adding the viscous dissipation part duidz*mu*siz
-	eZtmp = s_prop[sj][si]*(uZtmp*dudz[id.g] + vZtmp*dvdz[id.g] + wZtmp*dwdz[id.g]);
-
-	//Adding here the terms d (mu) dz * szj;
-	derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dz
+        myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+        myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+        myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
+        myprec *dil, myprec *dpdz) {
+
+    Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+    int kNum = blockIdx.z;
+    id.mkidZFlx(kNum);
+       
+    int tid = id.tix + id.bdx*id.tiy;
+
+    int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+    int sj = id.tix;                   // local j for shared memory access
+    int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+    int sj1 = tid/id.bdy;
+
+    //Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+    //id1.mkidZFlx(kNum);
+
+    myprec rZtmp=0;
+    myprec uZtmp=0;
+    myprec vZtmp=0;
+    myprec wZtmp=0;
+    myprec eZtmp=0;
+
+    myprec wrk1=0;
+    myprec wrk2=0;
+    myprec wrk3=0;
+
+        //myprec tmpreg1[stencilSize*2+1];
+        //myprec tmpreg2[stencilSize*2+1];
+        //myprec tmpreg3[stencilSize*2+1];
+
+    __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    s_u[sj][si] = u[id.g];
+    s_v[sj][si] = v[id.g];
+    s_w[sj][si] = w[id.g];
+    s_prop[sj][si] = mu[id.g];
+    s_dil[sj][si] = dil[id.g];
+    __syncthreads();
+
+    // fill in periodic images in shared memory array
+    // these are boundary conditions for u,v,w,mu and dilatation
+
+   
+        BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum);
+    __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+////BY COMPUTATION THREAD ARRANGEMENT
+    derDevSharedV1z(&wrk1,s_u[sj1],si1);
+    derDevSharedV1z(&wrk2,s_v[sj1],si1);
+    derDevSharedV1z(&wrk3,s_w[sj1],si1);
+    //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+
+// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT.
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        s_tmp[sj][si] = dwdx[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    uZtmp = (    wrk1 + s_tmp[sj1][si1]        );
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        __syncthreads();
+        s_tmp[sj][si] = dwdy[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    vZtmp = (    wrk2 + s_tmp[sj1][si1]        );
+    wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] );
+
+        /*__syncthreads();
+        s_tmp[sj1][si1] = wrk3;
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
+        dwdz[id.g] = s_tmp[sj][si];*/
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+
+
+
+    //adding the viscous dissipation part duidz*mu*siz
+    eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3);
+
+    //Adding here the terms d (mu) dz * szj;
+    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz
     uZtmp *= wrk2;
-	vZtmp *= wrk2;
-	wZtmp *= wrk2;
-
-	//viscous fluxes derivative
-	derDevSharedV2z(&wrk1,s_u[sj],si);
-	uZtmp = uZtmp + wrk1*s_prop[sj][si];
-	derDevSharedV2z(&wrk1,s_v[sj],si);
-	vZtmp = vZtmp + wrk1*s_prop[sj][si];
-	derDevSharedV2z(&wrk1,s_w[sj],si);
-	wZtmp = wZtmp + wrk1*s_prop[sj][si];
-
-	//adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz)
-	eZtmp = eZtmp + s_u[sj][si]*uZtmp + s_v[sj][si]*vZtmp + s_w[sj][si]*wZtmp;
-
-	//dilation derivatives
-	derDevSharedV1z(&wrk2,s_dil[sj],si);
-	wZtmp = wZtmp + s_prop[sj][si]*wrk2/3.0;
-	eZtmp = eZtmp + s_prop[sj][si]*wrk2/3.0*s_w[sj][si];
-
-	// pressure derivatives
-	s_dil[sj][si] = p[id.g];
-	__syncthreads();
-	BCzNumber2(s_dil[sj],p,id,si,mz,kNum);
-
-	__syncthreads();
-	derDevShared1z(&wrk1,s_dil[sj],si);
-	wZtmp = wZtmp - wrk1;
-
-	// fourier terms
-	s_prop[sj][si] = lam[id.g];
-	s_dil[sj][si]  = t[id.g];
-	__syncthreads();
+    vZtmp *= wrk2;
+    wZtmp *= wrk2;
+
+    //viscous fluxes derivative
+    derDevSharedV2z(&wrk1,s_u[sj1],si1);
+    uZtmp = uZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV2z(&wrk1,s_v[sj1],si1);
+    vZtmp = vZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV2z(&wrk1,s_w[sj1],si1);
+    wZtmp = wZtmp + wrk1*s_prop[sj1][si1];
+
+    //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz)
+    eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp;
+
+    //dilation derivatives
+    derDevSharedV1z(&wrk2,s_dil[sj1],si1);
+    wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0;
+    eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+//SHARED MEM LOADING OPERATION. - ID
+
+    // pressure derivatives
+    __syncthreads();
+    s_dil[sj][si] = p[id.g];
+    __syncthreads();
+    BCzNumber2(s_dil[sj],p,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    derDevShared1z(&wrk1,s_dil[sj1],si1);
+    wZtmp = wZtmp - wrk1;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    // fourier terms
+    __syncthreads();   
+        s_prop[sj][si] = lam[id.g];
+    s_dil[sj][si]  = t[id.g];
+    __syncthreads();
+    // Boundary conditions for thermal conductivity and temperature
+    BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+
+    derDevSharedV2z(&wrk1,s_dil[sj1],si1);
+    eZtmp = eZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz
+    derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz
+    eZtmp = eZtmp + wrk1*wrk2;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    //Adding here the terms - d (ru phi) dz;
+    __syncthreads();
+    s_prop[sj][si] = r[id.g];
+    s_dil[sj][si]  = h[id.g];
+    __syncthreads();
+    // Boundary conditions for denisty and enthalpy
+    BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1);
+    rZtmp = wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1);
+    uZtmp = uZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1);
+    vZtmp = vZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1);
+    wZtmp = wZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1);
+    eZtmp = eZtmp + wrk1;
+    __syncthreads();
+//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.      
+    s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ;
+    __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        
+    rZ[id.g] += s_prop[sj][si];
+    uZ[id.g] += s_u[sj][si];
+    vZ[id.g] += s_v[sj][si];
+    wZ[id.g] += s_tmp[sj][si] + *dpdz;
+    eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si];
+    //dil[id.g] = dudx[id.g] + dvdy[id.g] + dwdz[id.g];
+
+//#if useStreams
+//	rZ[id.g] = rZtmp;
+//	uZ[id.g] = uZtmp;
+//	vZ[id.g] = vZtmp;
+//	wZ[id.g] = wZtmp + *dpdz;
+//	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
+//#else
 
-	// Boundary conditions for thermal conductivity and temperature
-	BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum);
-	__syncthreads();
 
-	derDevSharedV2z(&wrk1,s_dil[sj],si);
-	eZtmp = eZtmp + wrk1*s_prop[sj][si];
-	derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dz
-	derDevSharedV1z(&wrk1,s_dil[sj],si); //wrk1 = d (t) dz
-	eZtmp = eZtmp + wrk1*wrk2;
 
-	//Adding here the terms - d (ru phi) dz;
-	s_prop[sj][si] = r[id.g];
-	s_dil[sj][si]  = h[id.g];
-	__syncthreads();
 
-	// Boundary conditions for denisty and enthalpy
-	BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum);
-	__syncthreads();
 
-	fluxQuadSharedz(&wrk1,s_prop[sj],s_w[sj],si);
-	rZtmp = wrk1;
-	__syncthreads();
-	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_u[sj],si);
-	uZtmp = uZtmp + wrk1;
-	__syncthreads();
-	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_v[sj],si);
-	vZtmp = vZtmp + wrk1;
-	__syncthreads();
-	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_w[sj],si);
-	wZtmp = wZtmp + wrk1;
-	__syncthreads();
-	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_dil[sj],si);
-	eZtmp = eZtmp + wrk1;
-	__syncthreads();
-
-#if useStreams
-	rZ[id.g] = rZtmp;
-	uZ[id.g] = uZtmp;
-	vZ[id.g] = vZtmp;
-	wZ[id.g] = wZtmp + *dpdz;
-	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
-#else
-	rZ[id.g] += rZtmp;
-	uZ[id.g] += uZtmp;
-	vZ[id.g] += vZtmp;
-	wZ[id.g] += wZtmp + *dpdz;
-	eZ[id.g] += eZtmp + *dpdz*s_w[sj][si] ;
-#endif
+//#endif
 	__syncthreads();
 }
 
diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu
index d612cd5..d8756d6 100644
--- a/src/cuda_utils.cu
+++ b/src/cuda_utils.cu
@@ -183,22 +183,22 @@ void copyThreadGridsToDevice(Communicator rk) {
 	sanityCheckThreadGrids(rk);
 
 	// X-grid
-	d_grid[0]  = dim3(my / sPencils, mz, 1);
-	d_block[0] = dim3(mx, sPencils, 1);
+	d_grid[0]  = dim3(2*my / sPencils, mz, 1);
+	d_block[0] = dim3(mx, sPencils/2, 1);
 
 	// Y-grid (1) for RHS calculation and (3) for velocity derivative
-	d_grid[3]  = dim3(mx / lPencils, mz, 1);
-	d_block[3] = dim3(lPencils, (my * sPencils) / lPencils, 1);
+	d_grid[3]  = dim3(nDivX, mz, nDivY);
+	d_block[3] = dim3(mx/nDivX, my/nDivY , 1);
 
-	d_grid[1]  = dim3(mx / sPencils, mz, 1);
-	d_block[1] = dim3(my , sPencils, 1);
+	d_grid[1]  = dim3(nDivX, mz, nDivY);
+	d_block[1] = dim3(mx/nDivX, my/nDivY , 1);
 
 	// Y-grid (2) for RHS calculation and (4) for velocity derivative
-	d_grid[4]  = dim3(mx / lPencils, my, 1);
-	d_block[4] = dim3(lPencils, (mz / nDivZ * sPencils) / lPencils, 1);
+	d_grid[4]  = dim3(nDivX, my, nDivZ);
+	d_block[4] = dim3(mx/nDivX, (mz / nDivZ) , 1);
 
-	d_grid[2]  = dim3(mx / sPencils, my, 1);
-	d_block[2] = dim3(mz / nDivZ , sPencils, 1); //if not using shared change!!
+	d_grid[2]  = dim3(nDivX, my, nDivZ);
+	d_block[2] = dim3(mx/nDivX, mz / nDivZ, 1); //if not using shared change!!
 
 	grid0 = d_grid[0];
 	block0= d_block[0];
@@ -231,7 +231,7 @@ void copyThreadGridsToDevice(Communicator rk) {
 }
 
 void sanityCheckThreadGrids(Communicator rk) {
-	if(mx%sPencils!=0) {
+/*	if(mx%sPencils!=0) {
 		if(rk.rank==0) {
 			printf("Error! -> mx mod sPencils!=0\n");
 		}
@@ -293,7 +293,7 @@ void sanityCheckThreadGrids(Communicator rk) {
 		}
 		mpiBarrier();
 		exit(1);
-	}
+	}*/
 }
 
 void copyField(int direction, Communicator rk) {
diff --git a/src/globals.h b/src/globals.h
index b374a59..f50b4da 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -18,36 +18,36 @@
 #define stencilSize 3  // the order is double the stencilSize (advective fluxes stencil)
 #define stencilVisc 2  // the order is double the stencilVisc (viscous fluxes stencil)
 
-#define Lx       (20.0)
-#define Ly       (7.0)
-#define Lz       (500.0)
-#define mx_tot   240
-#define my_tot   64
-#define mz_tot   2048
-#define nsteps   1001
-#define nfiles   100
+#define Lx       (2.0)
+#define Ly       (3.0)
+#define Lz       (10.0)
+#define mx_tot   176
+#define my_tot   216 // 246
+#define mz_tot   432
+#define nsteps   2000
+#define nfiles   1
 #define CFL      0.75f
-#define restartFile  -1
+#define restartFile -1
 
-#define lowStorage    (true)
-#define boundaryLayer (true)
-#define perturbed	  (true)
-#define forcing       (false)
+#define lowStorage    (false)
+#define boundaryLayer (false)
+#define perturbed     (false)
+#define forcing       (true)
 #define periodicX     (false)
 #define nonUniformX   (true)
 
 #define checkCFLcondition 100
 #define checkBulk 100
 
-#define Re       1500.0
-#define Pr       0.75
-#define Ma       0.35
-#define viscexp  1.5
+#define Re       7500.0
+#define Pr       0.70
+#define Ma       0.7
+#define viscexp  0.75
 #define gam      1.4
 #define Ec       ((gam - 1.f)*Ma*Ma)
 #define Rgas     (1.f/(gam*Ma*Ma))
 
-const double stretch  = 5.0;
+const double stretch  = 3.0;
 const myprec TwallTop = 1.0;
 const myprec TwallBot = 1.0;
 
@@ -55,7 +55,9 @@ const myprec TwallBot = 1.0;
 #define my (my_tot/pRow)
 #define mz (mz_tot/pCol)
 
-#define nDivZ 	(8) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nDivX 	(11) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nDivY 	(9) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nDivZ 	(18) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
 
 #define idx(i,j,k) \
 		({ ( k )*mx*my +( j )*mx + ( i ); })
diff --git a/src/main.h b/src/main.h
index 5b6b35a..bbc6a06 100644
--- a/src/main.h
+++ b/src/main.h
@@ -32,7 +32,7 @@ void setGPUParameters(Communicator rk);
 void setDevice(int device);
 void copyField(int direction, Communicator rk);
 void checkGpuMem(Communicator rk);
-void runSimulation(myprec *kin, myprec *enst, myprec *time, Communicator rk);
+void runSimulation(int file, myprec *kin, myprec *enst, myprec *time, Communicator rk);
 void initSolver(Communicator rk);
 void clearSolver(Communicator rk);
 void calculateSponge(Communicator rk);

From eccac7bc44a59dc0826e15968e8fabeacfd92b27 Mon Sep 17 00:00:00 2001
From: Asif Manzoor Hasan <hasan_asif1995@yahoo.co.in>
Date: Wed, 2 Feb 2022 17:22:12 +0100
Subject: [PATCH 2/2] Latest Version

---
 src/.cproject                       |  138 ++++
 src/.project                        |   27 +
 src/.settings/language.settings.xml |   44 ++
 src/backinit.cpp                    |  354 +++++++++
 src/boundary.h                      |  271 +++++--
 src/boundary_condition_x.h          |  314 +++++---
 src/boundary_condition_y.h          |  153 ++--
 src/boundary_condition_z.h          |  488 +++++--------
 src/calc_stress.cu                  |  154 +++-
 src/comm.cpp                        |  321 ++++++--
 src/comm.h                          |    7 +
 src/cuda_derivs.cu                  |    1 -
 src/cuda_derivs.h                   |   82 ++-
 src/cuda_functions.h                |   70 +-
 src/cuda_globals.h                  |   16 +-
 src/cuda_main.cu                    |  978 ++++++++++++++++++-------
 src/cuda_main.h                     |   29 +
 src/cuda_math.cu                    |   36 +-
 src/cuda_math.h                     |    3 +
 src/cuda_rhs.cu                     | 1046 ++++++++++++++++++++-------
 src/cuda_rhs_backup_0511.cu         |  554 ++++++++++++++
 src/cuda_utils.cu                   |   81 ++-
 src/globals.h                       |   86 ++-
 src/init.cpp                        |  225 ++++--
 src/main.cpp                        |    5 +-
 src/main.h                          |    2 +-
 src/nrbcX.h                         |  235 ++++++
 src/nrbcZ.h                         |  163 +++++
 src/temporary.cu                    |  608 ++++++++++++++++
 29 files changed, 5203 insertions(+), 1288 deletions(-)
 create mode 100644 src/.cproject
 create mode 100644 src/.project
 create mode 100644 src/.settings/language.settings.xml
 create mode 100644 src/backinit.cpp
 create mode 100644 src/cuda_rhs_backup_0511.cu
 create mode 100644 src/nrbcX.h
 create mode 100644 src/nrbcZ.h
 create mode 100644 src/temporary.cu

diff --git a/src/.cproject b/src/.cproject
new file mode 100644
index 0000000..c6790cb
--- /dev/null
+++ b/src/.cproject
@@ -0,0 +1,138 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+    	
+    <storageModule moduleId="org.eclipse.cdt.core.settings">
+        		
+        <cconfiguration id="com.nvidia.cuda.toolchain.linux.configuration.debug.1075183989">
+            			
+            <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.toolchain.linux.configuration.debug.1075183989" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+                				
+                <externalSettings/>
+                				
+                <extensions>
+                    					
+                    <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+                    					
+                    <extension id="com.nvidia.cuda.toolchain.nvccErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    				
+                </extensions>
+                			
+            </storageModule>
+            			
+            <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+                				
+                <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug" description="" id="com.nvidia.cuda.toolchain.linux.configuration.debug.1075183989" name="Debug" parent="com.nvidia.cuda.toolchain.linux.configuration.debug">
+                    					
+                    <folderInfo id="com.nvidia.cuda.toolchain.linux.configuration.debug.1075183989." name="/" resourcePath="">
+                        						
+                        <toolChain id="com.nvidia.cuda.toolchain.linux.debugToolchain.216174296" name="CUDA Linux toolchain" superClass="com.nvidia.cuda.toolchain.linux.debugToolchain">
+                            							
+                            <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="com.nvidia.cuda.toolchain.targetLinuxPlatform.159644105" isAbstract="false" osList="linux" superClass="com.nvidia.cuda.toolchain.targetLinuxPlatform"/>
+                            							
+                            <builder buildPath="${workspace_loc:/GPUcode}/Debug" id="com.nvidia.cuda.toolchain.builder.1934154701" managedBuildOn="true" name="CUDA Toolkit Builder.Debug" superClass="com.nvidia.cuda.toolchain.builder"/>
+                            							
+                            <tool id="nvcc.linker.561830360" name="NVCC linker" superClass="nvcc.linker"/>
+                            							
+                            <tool id="nvcc.archiver.1176671561" name="NVCC archiver" superClass="nvcc.archiver"/>
+                            							
+                            <tool id="nvcc.compiler.1155618356" name="NVCC Compiler" superClass="nvcc.compiler">
+                                								
+                                <option defaultValue="true" id="com.nvidia.cuda.toolchain.compiler.device-debug.277500635" superClass="com.nvidia.cuda.toolchain.compiler.device-debug" valueType="boolean"/>
+                                								
+                                <option defaultValue="true" id="com.nvidia.cuda.toolchain.compiler.debug.316999213" superClass="com.nvidia.cuda.toolchain.compiler.debug" valueType="boolean"/>
+                                								
+                                <option id="nvcc.compiler.pic.124300475" superClass="nvcc.compiler.pic"/>
+                                							
+                            </tool>
+                            						
+                        </toolChain>
+                        					
+                    </folderInfo>
+                    				
+                </configuration>
+                			
+            </storageModule>
+            			
+            <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+            		
+        </cconfiguration>
+        		
+        <cconfiguration id="com.nvidia.cuda.toolchain.linux.configuration.release.1901966558">
+            			
+            <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.toolchain.linux.configuration.release.1901966558" moduleId="org.eclipse.cdt.core.settings" name="Release">
+                				
+                <externalSettings/>
+                				
+                <extensions>
+                    					
+                    <extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+                    					
+                    <extension id="com.nvidia.cuda.toolchain.nvccErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+                    					
+                    <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                    				
+                </extensions>
+                			
+            </storageModule>
+            			
+            <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+                				
+                <configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" description="" id="com.nvidia.cuda.toolchain.linux.configuration.release.1901966558" name="Release" parent="com.nvidia.cuda.toolchain.linux.configuration.release">
+                    					
+                    <folderInfo id="com.nvidia.cuda.toolchain.linux.configuration.release.1901966558." name="/" resourcePath="">
+                        						
+                        <toolChain id="com.nvidia.cuda.toolchain.linux.releaseToolchain.266062711" name="CUDA Linux toolchain" superClass="com.nvidia.cuda.toolchain.linux.releaseToolchain">
+                            							
+                            <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="com.nvidia.cuda.toolchain.targetLinuxPlatform.1945747779" isAbstract="false" osList="linux" superClass="com.nvidia.cuda.toolchain.targetLinuxPlatform"/>
+                            							
+                            <builder buildPath="${workspace_loc:/GPUcode}/Release" id="com.nvidia.cuda.toolchain.builder.892787080" managedBuildOn="true" name="CUDA Toolkit Builder.Release" superClass="com.nvidia.cuda.toolchain.builder"/>
+                            							
+                            <tool id="nvcc.linker.1634129534" name="NVCC linker" superClass="nvcc.linker"/>
+                            							
+                            <tool id="nvcc.archiver.1918216400" name="NVCC archiver" superClass="nvcc.archiver"/>
+                            							
+                            <tool id="nvcc.compiler.1769340277" name="NVCC Compiler" superClass="nvcc.compiler">
+                                								
+                                <option defaultValue="com.nvidia.cuda.toolchain.compiler.optimization.level3" id="com.nvidia.cuda.toolchain.compiler.optimization.867747039" superClass="com.nvidia.cuda.toolchain.compiler.optimization" valueType="enumerated"/>
+                                								
+                                <option id="nvcc.compiler.pic.89348618" superClass="nvcc.compiler.pic"/>
+                                							
+                            </tool>
+                            						
+                        </toolChain>
+                        					
+                    </folderInfo>
+                    				
+                </configuration>
+                			
+            </storageModule>
+            		
+        </cconfiguration>
+        	
+    </storageModule>
+    	
+    <storageModule moduleId="scannerConfiguration">
+        		
+        <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+        	
+    </storageModule>
+    	
+    <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+        		
+        <project id="GPUcode.com.nvidia.cuda.toolchain.linux.build.exe.1772380411" name="Executable" projectType="com.nvidia.cuda.toolchain.linux.build.exe"/>
+        	
+    </storageModule>
+    	
+    <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+    
+</cproject>
diff --git a/src/.project b/src/.project
new file mode 100644
index 0000000..65be2a9
--- /dev/null
+++ b/src/.project
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>MPI_test foler</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
diff --git a/src/.settings/language.settings.xml b/src/.settings/language.settings.xml
new file mode 100644
index 0000000..4053a67
--- /dev/null
+++ b/src/.settings/language.settings.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<project>
+    	
+    <configuration id="com.nvidia.cuda.toolchain.linux.configuration.debug.1075183989" name="Debug">
+        		
+        <extension point="org.eclipse.cdt.core.LanguageSettingsProvider">
+            			
+            <provider copy-of="extension" id="org.eclipse.cdt.ui.UserLanguageSettingsProvider"/>
+            			
+            <provider-reference id="org.eclipse.cdt.core.ReferencedProjectsLanguageSettingsProvider" ref="shared-provider"/>
+            			
+            <provider-reference id="org.eclipse.cdt.managedbuilder.core.MBSLanguageSettingsProvider" ref="shared-provider"/>
+            			
+            <provider class="com.nvidia.cuda.toolchain.internal.NVCCBuiltinSpecsDetector" console="false" env-hash="-16186165351885" id="com.nvidia.cuda.toolchain.CUDACProvider" keep-relative-paths="false" name="Nvcc Builtins provider" parameter="${COMMAND} ${FLAGS} -E -Xcompiler -P -Xcompiler -v -Xcompiler -dD &quot;${INPUTS}&quot;" prefer-non-shared="true">
+                				
+                <language-scope id="com.nvidia.cuda.ui.CUDALanguage"/>
+                			
+            </provider>
+            		
+        </extension>
+        	
+    </configuration>
+    	
+    <configuration id="com.nvidia.cuda.toolchain.linux.configuration.release.1901966558" name="Release">
+        		
+        <extension point="org.eclipse.cdt.core.LanguageSettingsProvider">
+            			
+            <provider copy-of="extension" id="org.eclipse.cdt.ui.UserLanguageSettingsProvider"/>
+            			
+            <provider-reference id="org.eclipse.cdt.core.ReferencedProjectsLanguageSettingsProvider" ref="shared-provider"/>
+            			
+            <provider-reference id="org.eclipse.cdt.managedbuilder.core.MBSLanguageSettingsProvider" ref="shared-provider"/>
+            			
+            <provider class="com.nvidia.cuda.toolchain.internal.NVCCBuiltinSpecsDetector" console="false" env-hash="-16186165351885" id="com.nvidia.cuda.toolchain.CUDACProvider" keep-relative-paths="false" name="Nvcc Builtins provider" parameter="${COMMAND} ${FLAGS} -E -Xcompiler -P -Xcompiler -v -Xcompiler -dD &quot;${INPUTS}&quot;" prefer-non-shared="true">
+                				
+                <language-scope id="com.nvidia.cuda.ui.CUDALanguage"/>
+                			
+            </provider>
+            		
+        </extension>
+        	
+    </configuration>
+    
+</project>
diff --git a/src/backinit.cpp b/src/backinit.cpp
new file mode 100644
index 0000000..da1fdca
--- /dev/null
+++ b/src/backinit.cpp
@@ -0,0 +1,354 @@
+#include <iostream>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include "comm.h"
+#include "globals.h"
+#include "main.h"
+
+void derivGrid(double *d2f, double *df, double *f, double dx);
+
+void initField(int timestep, Communicator rk) {
+	readFileMPI('r',timestep,r,rk);
+	readFileMPI('u',timestep,u,rk);
+	readFileMPI('v',timestep,v,rk);
+	readFileMPI('w',timestep,w,rk);
+	readFileMPI('e',timestep,e,rk);
+    if(rk.rank==0) {
+    	printf("finished initializing field\n");
+    }
+}
+
+void writeField(int timestep, Communicator rk) {
+	saveFileMPI('r',timestep,r,rk);
+	saveFileMPI('u',timestep,u,rk);
+	saveFileMPI('v',timestep,v,rk);
+	saveFileMPI('w',timestep,w,rk);
+	saveFileMPI('e',timestep,e,rk);
+}
+
+void initGrid(Communicator rk) {
+
+	//constant grid in y and z and stretched in x
+
+	if (boundaryLayer) {
+        myprec tmp = 1/( (mxwr-1.0) - 1.0);
+		for (int i=0; i<mxwr; i++){
+			fact = (i)*tmp;
+		    x[i] = Lxwr*( sinh(stretch * fact)/sinh(stretch));
+		}
+		myprec dx1 = x[mxwr-1]-x[mxwr-2] ;
+		myprec Lxgp = Lx     - Lxwr;
+		int    mxgp = mx_tot - mxwr ;
+        myprec rgp  = 1.01;
+        myprec rgpold = 0;
+        while (abs(rgp-rgpold) < 1e-3) {
+        	rgpold = rgp;
+        	rgp = 1.0 - (1.0 - rgp)/dx1*Lxgp;
+        	rgp = pow(rgp, 1.0/mxgp);
+        }
+        for (int i = mxwr; i < mx_tot + stencilSize ; i++) {
+        	x[i] = x[i-1] + dx1*pow(rgp, i-mxwr);
+        }
+//what is the grid dimension in the halo points? Simone did not have in the channel grid I think because of the fact that Grid dimensions in the halo were never needed
+        // eqyations were never solved on the haldo points. The fact that we put BC as u[-1] = - u [0]; u[-2] = - u[1] and also for temperature automatically imples
+        //that the distance between the halo points is mirrored. x[-1] is as far ffrom the wall as x[0].   -2  -1 | 0 1 2 ....
+        //But now we will also need a halo grid as we will be solving for the cells (nrbc)
+
+
+	}
+
+
+	dx = Lx*(1.0)/(mx_tot);
+
+	double xn[mx_tot+1];
+	int denom = mx_tot;
+	myprec denom2 = 2.0;
+	if(boundaryLayer) {
+		denom *= 2;
+		denom2/= 2;
+	}
+
+	for (int i=0; i<mx_tot+1; i++)
+		xn[i] = tanh(stretch*((i*1.0)/denom-0.5))/tanh(stretch*0.5)	;
+
+	for (int i=0; i<mx_tot; i++)
+		x[i] = Lx * (1.0 + (xn[i] + xn[i+1])/2.0)/denom2;
+
+	derivGrid(xpp,xp,x,dx);
+
+	for (int i=0; i<mx_tot; i++)
+		xp[i] = 1.0/xp[i];
+
+#if !nonUniformX
+	for(int i=0;i<mx_tot;i++) {
+			x[i]=Lx*(0.5+i*1.0)/(mx_tot);  }
+
+	dx = x[1] - x[0];
+
+#endif
+
+	for(int j=0;j<my_tot;j++) {
+		y[j]=Ly*(0.5+j*1.0)/(my_tot);  }
+
+	for(int k=0;k<mz_tot;k++) {
+		z[k]=Lz*(0.5+k*1.0)/(mz_tot);  }
+
+	if(rk.rank==0) {
+
+		FILE *fp = fopen("Grid.txt","w+");
+		for (int i=0; i<mx; i++)
+			fprintf(fp,"%d %lf %lf %lf\n",i,x[i],xp[i],xpp[i]);
+		fclose(fp);
+
+		FILE *fb = fopen("fields/x.bin","wb");
+		fwrite(x, mx_tot , sizeof(double) , fb );
+		fclose(fb);
+
+		fb = fopen("fields/y.bin","wb");
+		fwrite(y , my_tot , sizeof(double) , fb );
+		fclose(fb);
+
+		fb = fopen("fields/z.bin","wb");
+		fwrite(z , mz_tot , sizeof(double) , fb );
+		fclose(fb);
+	}
+
+	mpiBarrier();
+}
+
+void initChannel(Communicator rk) {
+
+	double U0;
+	double T0 = 1.0;
+	double P0 = T0*Rgas;
+	double R0 = 1.0;
+
+	U0 = pow(gam,0.5)*Ma;
+
+	for (int i=0; i<mx; i++) {
+		for (int j=0; j<my; j++) {
+			for (int k=0; k<mz; k++) {
+				double rr1 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double rr2 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double rr3 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double ufluc  =  0.02*rr1;
+				double vfluc  =  0.02*rr2;
+				double wfluc  =  0.02*rr3;
+
+				double wmean  = 1.5*U0*R0*x[i]*(1.0-x[i]/Lx);
+
+				ufluc = ufluc + 0.05*sin(0.5*M_PI*x[i])*cos(2*M_PI*y[j+rk.jstart]);
+				vfluc = vfluc + 0.05*sin(0.5*M_PI*x[i])*sin(2*M_PI*y[j+rk.jstart]);
+				u[idx(i,j,k)] = ufluc;
+				v[idx(i,j,k)] = vfluc;
+				w[idx(i,j,k)] = wmean + wfluc;
+
+				r[idx(i,j,k)] = R0;
+				e[idx(i,j,k)] = P0/(gam-1.0) + 0.5 * r[idx(i,j,k)] * (pow(u[idx(i,j,k)],2) + pow(v[idx(i,j,k)],2) + pow(w[idx(i,j,k)],2));
+			} } }
+}
+
+void initCHIT(Communicator rk) {
+
+	double V0 = 1.0;
+	double T0 = 1.0;
+	double P0 = T0*Rgas;
+	double R0 = 1.0;
+
+	for (int i=0; i<mx; i++) {
+		double fx = 2*M_PI*x[i]/Lx;
+		for (int j=0; j<my; j++) {
+			double fy = 2*M_PI*y[j+rk.jstart]/Ly;
+			for (int k=0; k<mz; k++) {
+				double fz = 2*M_PI*z[k+rk.kstart]/Lz;
+				u[idx(i,j,k)] =  V0*sin(fx/1.0)*cos(fy/1.0)*cos(fz/1.0);
+				v[idx(i,j,k)] = -V0*cos(fx/1.0)*sin(fy/1.0)*cos(fz/1.0);
+				w[idx(i,j,k)] =  0.0;
+
+				double press = P0 + 1.0/16.0*R0*V0*V0 * (cos(2.0*fx/1.0) + cos(2.0*fy/1.0)) * (cos(2.0*fz/1.0) + 2.0);
+
+				r[idx(i,j,k)] = press/Rgas/T0;
+				e[idx(i,j,k)] = press/(gam-1.0) + 0.5 * r[idx(i,j,k)] * (pow(u[idx(i,j,k)],2) + pow(v[idx(i,j,k)],2) + pow(w[idx(i,j,k)],2));
+			} } }
+}
+
+void calcAvgChan(Communicator rk) {
+	double um[mx],vm[mx],wm[mx],rm[mx],em[mx];
+	double uf[mx],vf[mx],wf[mx],rf[mx],ef[mx];
+
+	for (int i=0; i<mx; i++) {
+		rm[i] = 0.0;
+		um[i] = 0.0;
+		vm[i] = 0.0;
+		wm[i] = 0.0;
+		em[i] = 0.0;
+		rf[i] = 0.0;
+		uf[i] = 0.0;
+		vf[i] = 0.0;
+		wf[i] = 0.0;
+		ef[i] = 0.0;
+		for (int k=0; k<mz; k++)
+			for (int j=0; j<my; j++) {
+				rm[i] += r[idx(i,j,k)]/my/mz;
+				um[i] += r[idx(i,j,k)]*u[idx(i,j,k)]/my/mz;
+				vm[i] += r[idx(i,j,k)]*v[idx(i,j,k)]/my/mz;
+				wm[i] += r[idx(i,j,k)]*w[idx(i,j,k)]/my/mz;
+				em[i] += e[idx(i,j,k)]/my/mz;
+			}
+	}
+
+	allReduceArrayDouble(rm,mx);
+	allReduceArrayDouble(um,mx);
+	allReduceArrayDouble(vm,mx);
+	allReduceArrayDouble(wm,mx);
+	allReduceArrayDouble(em,mx);
+
+	for (int i=0; i<mx; i++) {
+		um[i] /= rm[i];
+		vm[i] /= rm[i];
+		wm[i] /= rm[i];
+		for (int k=0; k<mz; k++)
+			for (int j=0; j<my; j++) {
+				rf[i] += (r[idx(i,j,k)]-rm[i])*(r[idx(i,j,k)]-rm[i])/my/mz;
+				uf[i] += (u[idx(i,j,k)]-um[i])*(u[idx(i,j,k)]-um[i])/my/mz;
+				vf[i] += (v[idx(i,j,k)]-vm[i])*(v[idx(i,j,k)]-vm[i])/my/mz;
+				wf[i] += (w[idx(i,j,k)]-wm[i])*(w[idx(i,j,k)]-wm[i])/my/mz;
+				ef[i] += (e[idx(i,j,k)]-em[i])*(e[idx(i,j,k)]-em[i])/my/mz;
+			}
+	}
+
+	allReduceArrayDouble(rf,mx);
+	allReduceArrayDouble(uf,mx);
+	allReduceArrayDouble(vf,mx);
+	allReduceArrayDouble(wf,mx);
+	allReduceArrayDouble(ef,mx);
+
+	if(rk.rank==0) {
+		FILE *fp = fopen("prof.txt","w+");
+		for (int i=0; i<mx; i++)
+			fprintf(fp,"%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",x[i],rm[i],um[i],vm[i],wm[i],em[i],rf[i],uf[i],vf[i],wf[i],ef[i]);
+		fclose(fp);
+	}
+	mpiBarrier();
+}
+
+void printRes(Communicator rk) {
+
+	double ut;
+
+	double Ret = 0.0;
+	double muw,temw;
+
+	for (int k=0; k<mz; k++)
+		for (int j=0; j<my; j++) {
+			    temw   = 1.0;
+			    double suth = pow(temw,viscexp);
+			    muw      = suth/Re;
+
+			    double ub[stencilSize*2+1];
+			    for (int i=stencilSize; i<stencilSize*2+1; i++)
+			    	ub[i] = w[i-stencilSize + j*mx + k*my*mx];
+			    for (int i=0; i<stencilSize; i++)
+			    	ub[i] = w[stencilSize-i-1 + j*mx + k*my*mx];
+
+			    double dudx = 0;
+			    for (int i=0; i<stencilSize; i++)
+			    	dudx += coeffF[i]*(ub[i]-ub[stencilSize*2-i])/dx;
+
+			    dudx *= xp[0];
+
+			    ut  = sqrt(muw*abs(dudx)/r[idx(0,j,k)]);
+			    Ret += ut*r[idx(0,j,k)]/muw;
+		}
+
+	Ret = Ret/my/mz;
+
+	allReduceArrayDouble(&Ret,1);
+
+	if(rk.rank==0) {
+		printf("\n");
+		printf("The average friction Reynolds number is: \t %lf\n",Ret);
+		printf("Resolutions are: \n");
+		printf("wall-normal wall: \t %lf\n",(x[0]+x[1])/2*Ret);
+		printf("wall-normal center: \t %lf\n",(x[mx/2+1]-x[mx/2-1])/2*Ret);
+		printf("span-wise: \t %lf\n",(y[1]-y[0])*Ret);
+		printf("stream-wise: \t %lf\n",(z[1]-z[0])*Ret);
+		printf("\n");
+
+		printf("the initial dt and dpdz are : %lf   and    %lf\n",dt,h_dpdz);
+		printf("\n");
+	}
+}
+
+void derivGrid(double *d2f, double *df, double *f, double dx) {
+
+	double fbound[mx_tot+stencilSize*2];
+	for(int i=stencilSize; i<mx_tot+stencilSize; i++)
+		fbound[i] = f[i-stencilSize];
+
+	for(int i=0; i<stencilSize; i++) {
+		fbound[i]   = - fbound[2*stencilSize-i-1];
+		fbound[mx_tot+stencilSize+i] = 2*Lx - fbound[mx_tot+stencilSize-i-1];
+	}
+
+	for (int i = 0; i < mx_tot; i++){
+		df[i]  = 0.0;
+		d2f[i] = coeffS[stencilSize]*fbound[i+stencilSize]/dx/dx;
+		for (int it = 0; it < stencilSize; it++){
+			df[i]  += coeffF[it]*(fbound[i+it]-fbound[i+stencilSize*2-it])/dx;
+			d2f[i] += coeffS[it]*(fbound[i+it]+fbound[i+stencilSize*2-it])/dx/dx;
+		}
+	}
+}
+
+void calcdt(Communicator rk) {
+	double dx;
+	double dy = y[1] - y[0];
+	double dz = z[1] - z[0];
+	double dx2;
+	double dy2 = dy*dy;
+	double dz2 = dz*dz;
+	double dtConvInv = 0.0;
+	double dtViscInv = 0.0;
+	for (int gt = 0; gt<mx*my*mz; gt++) {
+		double ien = e[gt]/r[gt] - 0.5*(u[gt]*u[gt] + v[gt]*v[gt] + w[gt]*w[gt]);
+		double sos = pow(gam*(gam-1)*ien,0.5);
+
+		int i = gt%my;
+
+	    if(i==0) {
+	    	dx = (x[i+1] + x[i])/2.0;
+	    } else if (i==mx-1) {
+	    	dx = Lx - (x[i] + x[i-1])/2.0;
+	    } else {
+	    	dx = (x[i+1] - x[i-1])/2.0;
+	    }
+
+	    dx2 = dx*dx;
+
+	    double dtc1, dtv1;
+
+	    dtc1      =  MAX( (abs(u[gt]) + sos)/dx, MAX( (abs(v[gt]) + sos)/dy, (abs(w[gt]) + sos)/dz) );
+	    dtv1      =  MAX( 1.0/Re/dx2, MAX( 1.0/Re/dy2, 1.0/Re/dz2) );
+		dtConvInv =  MAX( dtConvInv, dtc1 );
+		dtViscInv =  MAX( dtViscInv, dtv1 );
+	}
+	dt = CFL/MAX(dtConvInv, dtViscInv);
+
+	allReduceArrayDouble(&dt,1);
+	mpiBarrier();
+}
+
+void restartWrapper(Communicator rk) {
+    if(restartFile<0) {
+    	if(forcing) {
+    		initChannel(rk);
+    	} else {
+    		if(!boundaryLayer) initCHIT(rk);
+    	}
+    } else {
+    	initField(restartFile,rk); }
+}
diff --git a/src/boundary.h b/src/boundary.h
index 8c9e4dc..8555f17 100644
--- a/src/boundary.h
+++ b/src/boundary.h
@@ -8,7 +8,27 @@
 #ifndef BOUNDARY_H_
 #define BOUNDARY_H_
 
-extern __device__ __forceinline__ void perBCx(myprec *s_f, int si);
+extern __device__ __forceinline__ void TopperBCx(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void BotperBCx(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void TopwallBCxExt(myprec *s_f, int si, const myprec Bctop, const myprec Bcbot);
+extern __device__ __forceinline__ void BotwallBCxExt(myprec *s_f, int si, const myprec Bctop, const myprec Bcbot);
+extern __device__ __forceinline__ void TopwallBCxMir(myprec *s_f, int si);
+extern __device__ __forceinline__ void BotwallBCxMir(myprec *s_f, int si);
+extern __device__ __forceinline__ void TopwallBCxVel(myprec *s_f, int si);
+extern __device__ __forceinline__ void BotwallBCxVel(myprec *s_f, int si);
+extern __device__ __forceinline__ void TopwallBCxExt_cen(myprec *s_f, int si, const myprec Bctop, const myprec Bcbot);
+extern __device__ __forceinline__ void BotwallBCxExt_cen(myprec *s_f, int si, const myprec Bctop, const myprec Bcbot);
+extern __device__ __forceinline__ void TopwallBCxMir_cen(myprec *s_f, int si);
+extern __device__ __forceinline__ void BotwallBCxMir_cen(myprec *s_f, int si);
+extern __device__ __forceinline__ void TopwallBCxVel_cen(myprec *s_f, int si);
+extern __device__ __forceinline__ void BotwallBCxVel_cen(myprec *s_f, int si);
+extern __device__ __forceinline__ void ToprhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
+extern __device__ __forceinline__ void BotrhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
+extern __device__ __forceinline__ void TopmlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
+extern __device__ __forceinline__ void BotmlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
+extern __device__ __forceinline__ void BCxExtrapolate(myprec *s_f, int si);
+
+
 extern __device__ __forceinline__ void perBCy(myprec *s_f, int si);
 extern __device__ __forceinline__ void perBCz(myprec *s_f, int si);
 extern __device__ __forceinline__ void perBCzBot(myprec *s_f, myprec *f, int si, Indices id);
@@ -19,13 +39,8 @@ extern __device__ __forceinline__ void haloBCy(myprec *s_f, myprec *f, int si, I
 extern __device__ __forceinline__ void haloBCz(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void haloBCzBot(myprec *s_f, myprec *f, int si, Indices id);
 extern __device__ __forceinline__ void haloBCzTop(myprec *s_f, myprec *f, int si, Indices id);
-extern __device__ __forceinline__ void wallBCxExt(myprec *s_f, int si, const myprec Bctop, const myprec Bcbot);
-extern __device__ __forceinline__ void wallBCxMir(myprec *s_f, int si);
-extern __device__ __forceinline__ void wallBCxVel(myprec *s_f, int si);
 extern __device__ __forceinline__ void wallBCxDil(myprec *s_f, myprec *s_u, myprec *s_v, myprec *s_w, int si);
 extern __device__ __forceinline__ void stateBoundPT(myprec *r, myprec *t, myprec *u, myprec *v, myprec *w, myprec *h, myprec *p, myprec *m, myprec *l);
-extern __device__ __forceinline__ void rhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
-extern __device__ __forceinline__ void mlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si);
 extern __device__ __forceinline__ void botBCxMir(myprec *s_f, int si);
 extern __device__ __forceinline__ void topBCxExt(myprec *s_f, int si);
 extern __device__ __forceinline__ void topBCxVal(myprec *s_f, int si, myprec value);
@@ -34,49 +49,202 @@ extern __device__ __forceinline__ void botBCzVal(myprec *s_f, int si, myprec val
 extern __device__ __forceinline__ void botBCxExt(myprec *s_f, int si, myprec Bcbot);
 extern __device__ __forceinline__ void topBCzExt(myprec *s_f, int si);
 extern __device__ __forceinline__ void botBCzExt(myprec *s_f, int si);
-extern __device__ __forceinline__ void botBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
-extern __device__ __forceinline__ void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
-extern __device__ __forceinline__ void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
-extern __device__ __forceinline__ void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void BotBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void BotBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void TopBCzCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void TopBCyCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void TopBCxCpy(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void BotBCxCpy(myprec *s_f, myprec *f, int si, Indices id);
+
+extern __device__ __forceinline__ void InflowBCzBot(myprec *s_f, myprec *recy_f, int si, Indices id);
+extern __device__ __forceinline__ void BotBCzMir(myprec *s_f, int si);
+
+extern __device__ __forceinline__ void haloBCyBot(myprec *s_f, myprec *f, int si, Indices id);
+extern __device__ __forceinline__ void haloBCyTop(myprec *s_f, myprec *f, int si, Indices id);
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopperBCx(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+mx/nX]           = f[idx(si-stencilSize,id.j, id.k)];
+}
+__device__ __forceinline__ __attribute__((always_inline)) void BotperBCx(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize]     = f[idx(id.i+mx-stencilSize,id.j,id.k)];
+}
 
-__device__ __forceinline__ __attribute__((always_inline)) void perBCx(myprec *s_f, int si) {
-	s_f[si-stencilSize]  = s_f[si+mx-stencilSize];
-	s_f[si+mx]           = s_f[si];
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxVel(myprec *s_f, int si) {
+	s_f[si+mx/nX]           = - s_f[mx/nX+2*stencilSize-si-1];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void perBCy(myprec *s_f, int si) {
-	s_f[si-stencilSize]  = s_f[si+my-stencilSize];
-	s_f[si+my]           = s_f[si];
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxVel(myprec *s_f, int si) {
+	s_f[si-stencilSize]  = - s_f[3*stencilSize-si-1];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void perBCz(myprec *s_f, int si) {
-	s_f[si-stencilSize]  = s_f[si+mz-stencilSize];
-	s_f[si+mz]           = s_f[si];
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxMir(myprec *s_f, int si) {
+	s_f[si+mx/nX]           = s_f[mx/nX+2*stencilSize-si-1];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void perBCzBot(myprec *s_f, myprec *f, int si, Indices id) {
-	s_f[si-stencilSize]  = f[idx(id.i,id.j,id.k+mz-stencilSize)];
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxMir(myprec *s_f, int si) {
+	s_f[si-stencilSize]  = s_f[3*stencilSize-si-1];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxExt(myprec *s_f, int si, myprec Bctop, myprec Bcbot) {
+	s_f[si+mx/nX]           = 2.0*Bctop - s_f[mx/nX+2*stencilSize-si-1];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxExt(myprec *s_f, int si, myprec Bctop, myprec Bcbot) {
+	s_f[si-stencilSize]  = 2.0*Bcbot - s_f[3*stencilSize-si-1];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxVel_cen(myprec *s_f, int si) {
+	s_f[si+mx/nX]           = - s_f[mx/nX+2*stencilSize-si-2];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxVel_cen(myprec *s_f, int si) {
+	s_f[si-stencilSize]  = - s_f[3*stencilSize-si];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxMir_cen(myprec *s_f, int si) {
+	s_f[si+mx/nX]           = s_f[mx/nX+2*stencilSize-si-2];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxMir_cen(myprec *s_f, int si) {
+	s_f[si-stencilSize]  = s_f[3*stencilSize-si];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void TopwallBCxExt_cen(myprec *s_f, int si, myprec Bctop, myprec Bcbot) {
+	s_f[si+mx/nX]           = 2.0*Bctop - s_f[mx/nX+2*stencilSize-si-2];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotwallBCxExt_cen(myprec *s_f, int si, myprec Bctop, myprec Bcbot) {
+	s_f[si-stencilSize]  = 2.0*Bcbot - s_f[3*stencilSize-si];
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void ToprhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
+{
+
+
+    int idx = si+mx/nX;
+    h[idx]  = t[idx]*Rgas*gam/(gam - 1.0)
+        		    		  + 0.5*(u[idx]*u[idx]+v[idx]*v[idx]+w[idx]*w[idx]);
+    r[idx]  = p[idx]/(Rgas*t[idx]);
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotrhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
+{
+	int idx = si-stencilSize;
+
+    h[idx]  = t[idx]*Rgas*gam/(gam - 1.0)
+    		    		  + 0.5*(u[idx]*u[idx]+v[idx]*v[idx]+w[idx]*w[idx]);
+    r[idx]  = p[idx]/(Rgas*t[idx]);
+
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopmlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
+{
+    int idx = si+mx/nX;
+
+    myprec suth = pow(t[idx],viscexp);
+    m[idx]   = suth/Re;
+    l[idx]   = suth/Re/Pr/Ec;
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotmlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
+{
+	int idx = si-stencilSize;
+
+    myprec suth = pow(t[idx],viscexp);
+    m[idx]   = suth/Re;
+    l[idx]   = suth/Re/Pr/Ec;
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BCxExtrapolate(myprec *s_f, int si)
+{
+	s_f[si+mx/nX]  = s_f[mx/nX-1 + stencilSize];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCxCpy(myprec *s_f, myprec *f, int si, Indices id) {
+		s_f[si+mx/nX] = f[idx(id.i+mx/nX,id.j,id.k)];
+
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCxCpy(myprec *s_f, myprec *f, int si, Indices id) {
+		s_f[si-stencilSize] = f[idx(id.i-stencilSize,id.j,id.k)];
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__device__ __forceinline__ __attribute__((always_inline)) void perBCyTop(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+my/nDivY]     = f[idx(id.i,si-stencilSize, id.k)];
+}
 __device__ __forceinline__ __attribute__((always_inline)) void perBCyBot(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si-stencilSize]  = f[idx(id.i,id.j+my-stencilSize,id.k)];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+my/nDivY] = f[idx(id.i,id.j+my/nDivY,id.k)];
+}
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize] = f[idx(id.i,id.j-stencilSize,id.k)];
+}
+__device__ __forceinline__ __attribute__((always_inline)) void haloBCyTop(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+my/nDivY]     = f[mx*my*mz + stencilSize*mx*mz + (si-stencilSize) + id.i*stencilSize + id.k*mx*stencilSize];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void haloBCyBot(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize]  = f[mx*my*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize];
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 __device__ __forceinline__ __attribute__((always_inline)) void perBCzTop(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si+mz/nDivZ]     = f[idx(id.i,id.j,si-stencilSize)];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void perBCyTop(myprec *s_f, myprec *f, int si, Indices id) {
-	s_f[si+my/nDivY]     = f[idx(id.i,si-stencilSize, id.k)];
+__device__ __forceinline__ __attribute__((always_inline)) void perBCzBot(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize]  = f[idx(id.i,id.j,id.k+mz-stencilSize)];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzCpy(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si+mz/nDivZ] = f[idx(id.i,id.j,id.k+mz/nDivZ)];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzCpy(myprec *s_f, myprec *f, int si, Indices id) {
+	s_f[si-stencilSize] = f[idx(id.i,id.j,id.k-stencilSize)];
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BCzExtrapolate(myprec *s_f, int si){
+	s_f[si+mz/nDivZ]  = s_f[mz/nDivZ-1 + stencilSize];
 }
 
+__device__ __forceinline__ __attribute__((always_inline)) void InflowBCzBot(myprec *s_f, myprec *recy_f, int si, Indices id){
+
+	s_f[si-stencilSize] = recy_f[idx(id.i,id.j,si-stencilSize)];
+
+}
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzMir(myprec *s_f, int si) {
+	s_f[si-stencilSize]  = s_f[3*stencilSize-si];
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
 
 __device__ __forceinline__ __attribute__((always_inline)) void haloBCy(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si-stencilSize]  = f[mx*my*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize];
 	s_f[si+my]           = f[mx*my*mz + stencilSize*mx*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize];
 }
 
+
+
+
+
+
 __device__ __forceinline__ __attribute__((always_inline)) void haloBCz(myprec *s_f, myprec *f, int si, Indices id) {
 	s_f[si-stencilSize]  = f[mx*my*mz + 2*stencilSize*mx*mz + id.k + id.i*stencilSize + id.j*mx*stencilSize];
 	s_f[si+mz]           = f[mx*my*mz + 2*stencilSize*mx*mz + stencilSize*mx*my + id.k + id.i*stencilSize + id.j*mx*stencilSize];
@@ -90,25 +258,11 @@ __device__ __forceinline__ __attribute__((always_inline)) void haloBCzTop(myprec
 	s_f[si+mz/nDivZ]     = f[mx*my*mz + 2*stencilSize*mx*mz + stencilSize*mx*my + (si-stencilSize) + id.i*stencilSize + id.j*mx*stencilSize];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void wallBCxExt(myprec *s_f, int si, myprec Bctop, myprec Bcbot) {
-	s_f[si-stencilSize]  = 2.0*Bcbot - s_f[3*stencilSize-si-1];
-	s_f[si+mx]           = 2.0*Bctop - s_f[mx+2*stencilSize-si-1];
-}
-
 __device__ __forceinline__ __attribute__((always_inline)) void wallBCxDil(myprec *s_f, myprec *s_u, myprec *s_v, myprec *s_w, int si) {
 	s_f[si-stencilSize]  = s_u[si-stencilSize] +  s_v[si-stencilSize] +  s_w[si-stencilSize];
 	s_f[si+mx]           = s_u[si+mx]          +  s_v[si+mx]          +  s_w[si+mx];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void wallBCxVel(myprec *s_f, int si) {
-	s_f[si-stencilSize]  = - s_f[3*stencilSize-si-1];
-	s_f[si+mx]           = - s_f[mx+2*stencilSize-si-1];
-}
-
-__device__ __forceinline__ __attribute__((always_inline)) void wallBCxMir(myprec *s_f, int si) {
-	s_f[si-stencilSize]  = s_f[3*stencilSize-si-1];
-	s_f[si+mx]           = s_f[mx+2*stencilSize-si-1];
-}
 
 __device__ __forceinline__ __attribute__((always_inline)) void stateBoundPT(myprec *r, myprec *t, myprec *u, myprec *v, myprec *w, myprec *h, myprec *p, myprec *m, myprec *l, int si)
 {
@@ -132,34 +286,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void stateBoundPT(mypr
     l[idx]   = suth/Re/Pr/Ec;
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void rhBoundPT(myprec *r, myprec *h, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
-{
-	int idx = si-stencilSize;
-
-    h[idx]  = t[idx]*Rgas*gam/(gam - 1.0)
-    		    		  + 0.5*(u[idx]*u[idx]+v[idx]*v[idx]+w[idx]*w[idx]);
-    r[idx]  = p[idx]/(Rgas*t[idx]);
-
-    idx = si+mx;
-    h[idx]  = t[idx]*Rgas*gam/(gam - 1.0)
-        		    		  + 0.5*(u[idx]*u[idx]+v[idx]*v[idx]+w[idx]*w[idx]);
-    r[idx]  = p[idx]/(Rgas*t[idx]);
-}
-
-__device__ __forceinline__ __attribute__((always_inline)) void mlBoundPT(myprec *m, myprec *l, myprec *p, myprec *t, myprec *u, myprec *v, myprec *w, int si)
-{
-	int idx = si-stencilSize;
-
-    myprec suth = pow(t[idx],viscexp);
-    m[idx]   = suth/Re;
-    l[idx]   = suth/Re/Pr/Ec;
-
-    idx = si+mx;
-
-    suth = pow(t[idx],viscexp);
-    m[idx]   = suth/Re;
-    l[idx]   = suth/Re/Pr/Ec;
-}
 
 __device__ __forceinline__ __attribute__((always_inline)) void topBCxExt(myprec *s_f, int si) {
 	s_f[si+mx]           = 2.0*s_f[mx+stencilSize-1] - s_f[mx+2*stencilSize-si-2];
@@ -193,21 +319,12 @@ __device__ __forceinline__ __attribute__((always_inline)) void botBCxMir(myprec
 	s_f[si-stencilSize]  = s_f[3*stencilSize-si-1];
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void botBCzCpy(myprec *s_f, myprec *f, int si, Indices id) {
-s_f[si-stencilSize] = f[idx(id.i,id.j,id.k-stencilSize)];
-}
 
-__device__ __forceinline__ __attribute__((always_inline)) void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
-s_f[si-stencilSize] = f[idx(id.i,id.j-stencilSize,id.k)];
-}
 
-__device__ __forceinline__ __attribute__((always_inline)) void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id) {
-s_f[si+mz/nDivZ] = f[idx(id.i,id.j,id.k+mz/nDivZ)];
-}
 
-__device__ __forceinline__ __attribute__((always_inline)) void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id) {
-s_f[si+my/nDivY] = f[idx(id.i,id.j+my/nDivY,id.k)];
-}
+
+
+
 
 
 #endif /* BOUNDARY_H_ */
diff --git a/src/boundary_condition_x.h b/src/boundary_condition_x.h
index 941eb06..e0c7f41 100644
--- a/src/boundary_condition_x.h
+++ b/src/boundary_condition_x.h
@@ -5,113 +5,233 @@
 #include "sponge.h"
 #include "perturbation.h"
 
-extern __device__ __forceinline__ void BCxderVel(myprec *s_u, myprec *s_v, myprec *s_w,
-												 Indices id, int si, int m);
-
-extern __device__ __forceinline__ void BCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
-												  myprec *s_p, myprec *s_t,
-												  myprec *s_m, myprec *s_l,
-												  Indices id, int si, int m);
-
-extern __device__ __forceinline__ void BCxNumber2(myprec *s_dil, Indices id, int si, int m);
-
-extern __device__ __forceinline__ void BCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
-												  myprec *s_p, myprec *s_t,
-												  myprec *s_r, myprec *s_h,
-												  Indices id, int si, int m);
-
-__device__ __forceinline__ __attribute__((always_inline)) void BCxderVel(myprec *s_u, myprec *s_v, myprec *s_w,
-																		 Indices id, int si, int m) {
-	if(id.i<stencilSize) {
-		if(periodicX) {
-			perBCx(s_u,si);perBCx(s_v,si);perBCx(s_w,si);
-		} else {
-			if(boundaryLayer) {
-				topBCxExt(s_u,si);
-				topBCxExt(s_v,si);
-				topBCxExt(s_w,si);
-				botBCxExt(s_u,si,0.0);
-				botBCxExt(s_v,si,0.0);
-				botBCxExt(s_w,si,0.0);
-				if(perturbed) PerturbUvel(s_u,id,si);
-			} else {
-				wallBCxVel(s_u,si);
-				wallBCxVel(s_v,si);
-				wallBCxVel(s_w,si);
-			}
-		}
+extern __device__ __forceinline__ void TopBCxderVel(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype);
+extern __device__ __forceinline__ void BotBCxderVel(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype);
+
+extern __device__ __forceinline__ void TopBCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_m, myprec *s_l, myprec *u, myprec *v, myprec *w,
+		myprec *p, myprec *t, myprec *mu, myprec *lam,
+		Indices id, int si, int m, int bctype);
+extern __device__ __forceinline__ void BotBCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_m, myprec *s_l, myprec *u, myprec *v, myprec *w,
+		myprec *p, myprec *t, myprec *mu, myprec *lam,
+		Indices id, int si, int m, int bctype);
+
+extern __device__ __forceinline__ void TopBCxNumber2(myprec *s_dil, myprec *dil, Indices id, int si, int m, int bctype);
+extern __device__ __forceinline__ void BotBCxNumber2(myprec *s_dil, myprec *dil, Indices id, int si, int m, int bctype);
+
+
+extern __device__ __forceinline__ void TopBCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_r, myprec *s_h, myprec *r, myprec *h,
+		Indices id, int si, int m, int bctype);
+
+extern __device__ __forceinline__ void BotBCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_r, myprec *s_h, myprec *r, myprec *h,
+		Indices id, int si, int m, int bctype);
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCxderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype) {
+
+	if (bctype == 1) {
+		TopperBCx(s_u,u,si,id);
+		TopperBCx(s_v,v,si,id);
+		TopperBCx(s_w,w,si,id);
+
+	} else if (bctype == 2) {
+		TopwallBCxVel(s_u,si);
+		TopwallBCxVel(s_v,si);
+		TopwallBCxVel(s_w,si);
+	} else if (bctype == 3) {
+		TopwallBCxVel_cen(s_u,si);
+		TopwallBCxVel_cen(s_v,si);
+		TopwallBCxVel_cen(s_w,si);
+	} else if (bctype == 4) {
+		BCxExtrapolate(s_u,si);
+		BCxExtrapolate(s_v,si);
+		BCxExtrapolate(s_w,si);
+	}  // add more else ifs here
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCxderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype) {
+
+	if (bctype == 1) {
+		BotperBCx(s_u,u,si,id);
+		BotperBCx(s_v,v,si,id);
+		BotperBCx(s_w,w,si,id);
+
+	} else if (bctype == 2) {
+		BotwallBCxVel(s_u,si);
+		BotwallBCxVel(s_v,si);
+		BotwallBCxVel(s_w,si);
+
+	} else if (bctype == 3) {
+		BotwallBCxVel_cen(s_u,si);
+		BotwallBCxVel_cen(s_v,si);
+		BotwallBCxVel_cen(s_w,si);
+
+	} else if (bctype == 4) {
+       // do nothing for bottom
+	}
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_m, myprec *s_l, myprec *u, myprec *v, myprec *w,
+		myprec *p, myprec *t, myprec *mu, myprec *lam,
+		Indices id, int si, int m, int bctype) {
+	if (bctype == 1) {
+		TopperBCx(s_p,p,si,id);
+		TopperBCx(s_u,u,si,id);
+		TopperBCx(s_v,v,si,id);
+		TopperBCx(s_w,w,si,id);
+		TopperBCx(s_t,t,si,id);
+		TopperBCx(s_m,mu,si,id);
+		TopperBCx(s_l,lam,si,id);
+
+	} else if (bctype == 2) {
+		TopwallBCxMir(s_p,si);
+		TopwallBCxVel(s_u,si);
+		TopwallBCxVel(s_v,si);
+		TopwallBCxVel(s_w,si);
+		TopwallBCxExt(s_t,si,TwallTop,TwallBot);
+		TopmlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 3) {
+
+		TopwallBCxMir_cen(s_p,si);
+		TopwallBCxVel_cen(s_u,si);
+		TopwallBCxVel_cen(s_v,si);
+		TopwallBCxVel_cen(s_w,si);
+		TopwallBCxExt_cen(s_t,si,TwallTop,TwallBot);
+		TopmlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 4) {
+
+		BCxExtrapolate(s_p,si);
+		BCxExtrapolate(s_u,si);
+		BCxExtrapolate(s_v,si);
+		BCxExtrapolate(s_w,si);
+		BCxExtrapolate(s_t,si);
+		TopmlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
+	} // add more else ifs here
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,\
+		myprec *s_p, myprec *s_t,
+		myprec *s_m, myprec *s_l, myprec *u, myprec *v, myprec *w,
+		myprec *p, myprec *t, myprec *mu, myprec *lam,
+		Indices id, int si, int m, int bctype) {
+	if (bctype == 1) {
+		BotperBCx(s_p,p,si,id);
+		BotperBCx(s_u,u,si,id);
+		BotperBCx(s_v,v,si,id);
+		BotperBCx(s_w,w,si,id);
+		BotperBCx(s_t,t,si,id);
+		BotperBCx(s_m,mu,si,id);
+		BotperBCx(s_l,lam,si,id);
+
+	} else if (bctype == 2) {
+		BotwallBCxMir(s_p,si);
+		BotwallBCxVel(s_u,si);
+		BotwallBCxVel(s_v,si);
+		BotwallBCxVel(s_w,si);
+		BotwallBCxExt(s_t,si,TwallTop,TwallBot);
+		BotmlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 3) {
+
+		BotwallBCxMir_cen(s_p,si);
+		BotwallBCxVel_cen(s_u,si);
+		BotwallBCxVel_cen(s_v,si);
+		BotwallBCxVel_cen(s_w,si);
+		BotwallBCxExt_cen(s_t,si,TwallTop,TwallBot);
+		BotmlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 4) {
+      // Do nothing as extrapolation will not be used at the bottom boundary
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCxNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
-		  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  myprec *s_p, myprec *s_t,
-		  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  myprec *s_m, myprec *s_l,
-		  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  Indices id, int si, int m) {
-	if (id.i < stencilSize) {
-		if(periodicX) {
-			perBCx(s_u,si);
-			perBCx(s_v,si);
-			perBCx(s_w,si);
-			perBCx(s_t,si);
-			perBCx(s_p,si);
-			perBCx(s_m,si);
-			perBCx(s_l,si);
-		}else {
-			if(boundaryLayer) {
-				topBCxExt(s_u,si);
-				topBCxExt(s_v,si);
-				topBCxExt(s_w,si);
-				topBCxExt(s_p,si);
-				topBCxExt(s_t,si);
-				botBCxMir(s_p,si);
-				botBCxMir(s_t,si);
-				botBCxExt(s_u,si,0.0);
-				botBCxExt(s_v,si,0.0);
-				botBCxExt(s_w,si,0.0);
-				if(perturbed) PerturbUvel(s_u,id,si);
-				mlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
-			} else {
-				wallBCxMir(s_p,si);
-				wallBCxVel(s_u,si);
-				wallBCxVel(s_v,si);
-				wallBCxVel(s_w,si);
-				wallBCxExt(s_t,si,TwallTop,TwallBot);
-				mlBoundPT(s_m, s_l, s_p, s_t, s_u, s_v, s_w, si);
-			}
-		}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCxNumber2(myprec *s_dil, myprec *dil, Indices id, int si, int m, int bctype) {
+
+	if (bctype == 1) {
+		TopperBCx(s_dil,dil,si,id);
+
+	} else if (bctype == 2) {
+		TopwallBCxMir(s_dil,si);
+
+	} else if (bctype == 3) {
+		TopwallBCxMir_cen(s_dil,si);
+	} else if (bctype == 4) {
+		BCxExtrapolate(s_dil,si);
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCxNumber2(myprec *s_dil, Indices id, int si, int m) {
-	if (id.i < stencilSize) {
-		if(periodicX) {
-			perBCx(s_dil,si);
-		}else {
-			if(boundaryLayer) {
-				topBCxExt(s_dil,si);
-				botBCxMir(s_dil,si);
-			} else {
-				wallBCxMir(s_dil,si);
-			}
-		}
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCxNumber2(myprec *s_dil, myprec *dil, Indices id, int si, int m, int bctype) {
+
+	if (bctype == 1) {
+		BotperBCx(s_dil,dil,si,id);
+
+	} else if (bctype == 2) {
+		BotwallBCxMir(s_dil,si);
+
+	} else if (bctype == 3) {
+		BotwallBCxMir_cen(s_dil,si);
+	} else if (bctype == 4) {
+        // no extrapolation for bottom
+	}
+
+}
+
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_r, myprec *s_h, myprec *r, myprec *h,
+		Indices id, int si, int m, int bctype) {
+	if (bctype == 1) {
+		TopperBCx(s_r,r,si,id); TopperBCx(s_h,h,si,id);
+	} else if (bctype == 2) {
+		ToprhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 3) {
+		ToprhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 4) {
+		ToprhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
 	}
-	__syncthreads();
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
-																		  myprec *s_p, myprec *s_t,
-																		  myprec *s_r, myprec *s_h,
-																		  Indices id, int si, int m) {
-	if (id.i < stencilSize) {
-		if(periodicX) {
-			perBCx(s_r,si); perBCx(s_h,si);
-		} else {
-			rhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
-		}
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCxNumber3(myprec *s_u, myprec *s_v, myprec *s_w,
+		myprec *s_p, myprec *s_t,
+		myprec *s_r, myprec *s_h, myprec *r, myprec *h,
+		Indices id, int si, int m, int bctype) {
+	if (bctype == 1) {
+		BotperBCx(s_r,r,si,id); BotperBCx(s_h,h,si,id);
+	} else if (bctype == 2) {
+		BotrhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
+
+	} else if (bctype == 3) {
+		BotrhBoundPT(s_r, s_h, s_p, s_t, s_u, s_v, s_w, si);
+
+	}  else if (bctype == 4) {
+		//no extrapolation at bottom BC
 	}
-	__syncthreads();
 }
 
+
 #endif /* BOUNDARY_CONDITION_H_ */
diff --git a/src/boundary_condition_y.h b/src/boundary_condition_y.h
index cade0f3..98c0d1e 100644
--- a/src/boundary_condition_y.h
+++ b/src/boundary_condition_y.h
@@ -4,130 +4,85 @@
 #include "boundary.h"
 #include "sponge.h"
 
-extern __device__ __forceinline__ void BCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+extern __device__ __forceinline__ void TopBCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
 												  myprec *s_m, myprec *s_dil,
 												  myprec *u, myprec *v, myprec *w,
 												  myprec *m, myprec *dil,
-												  Indices id, int si, int n, int jNum);
+												  Indices id, int si, int n);
+extern __device__ __forceinline__ void BotBCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+												  myprec *s_m, myprec *s_dil,
+												  myprec *u, myprec *v, myprec *w,
+												  myprec *m, myprec *dil,
+												  Indices id, int si, int n);
 
-extern __device__ __forceinline__ void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m, int jNum);
+extern __device__ __forceinline__ void TopBCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m);
+extern __device__ __forceinline__ void BotBCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m);
 
-extern __device__ __forceinline__ void BCyNumber3(myprec *s_f, myprec *s_g,
+
+extern __device__ __forceinline__ void TopBCyNumber3(myprec *s_f, myprec *s_g,
+												  myprec *f,   myprec *g,
+												  Indices id, int si, int n);
+extern __device__ __forceinline__ void BotBCyNumber3(myprec *s_f, myprec *s_g,
 												  myprec *f,   myprec *g,
-												  Indices id, int si, int n, int jNum);
+												  Indices id, int si, int n);
+
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
 																		  myprec *s_m, myprec *s_dil,
 																		  myprec *u, myprec *v, myprec *w,
 																		  myprec *m, myprec *dil,
-																		  Indices id, int si, int n, int jNum) {
-	if (id.tiy < stencilSize) {
-	    if(nDivY==1) {
-		     if(multiGPU) {
-			haloBCy(s_u,u,si,id); haloBCy(s_v,v,si,id); haloBCy(s_w,w,si,id);
-			haloBCy(s_m,m,si,id); haloBCy(s_dil,dil,si,id);
-		     } else {
-			perBCy(s_u,si);	perBCy(s_v,si); perBCy(s_w,si);
-			perBCy(s_m,si); perBCy(s_dil,si);
-		}
-	   } 
-           else {
-			if(jNum==0) {
-				topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id);
-				topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id);
-				
-						perBCyBot(s_u,u,si,id);
-						perBCyBot(s_v,v,si,id);
-						perBCyBot(s_w,w,si,id);
-						perBCyBot(s_m,m,si,id);
-						perBCyBot(s_dil,dil,si,id);
-				
-				
-			} else if (jNum==nDivY-1) {
-				botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id);
-				botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id);
-				
-						perBCyTop(s_u,u,si,id);
-						perBCyTop(s_v,v,si,id);
-						perBCyTop(s_w,w,si,id);
-						perBCyTop(s_m,m,si,id);
-						perBCyTop(s_dil,dil,si,id);
-				
-			} else {
-				topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id);
-				topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id);
-				botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id);
-				botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id);
-			}
-		}
-	}
+																		  Indices id, int si, int n) {
+	perBCyTop(s_u,u,si,id);
+	perBCyTop(s_v,v,si,id);
+	perBCyTop(s_w,w,si,id);
+	perBCyTop(s_m,m,si,id);
+	perBCyTop(s_dil,dil,si,id);
+}
 
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+																		  myprec *s_m, myprec *s_dil,
+																		  myprec *u, myprec *v, myprec *w,
+																		  myprec *m, myprec *dil,
+																		  Indices id, int si, int n) {
+	perBCyBot(s_u,u,si,id);
+	perBCyBot(s_v,v,si,id);
+	perBCyBot(s_w,w,si,id);
+	perBCyBot(s_m,m,si,id);
+	perBCyBot(s_dil,dil,si,id);
+}
 
 
 
-	__syncthreads();
-}
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int jNum) {
-	if (id.tiy < stencilSize) {
-	    if(nDivY==1) {
-		     if(multiGPU) {
-			haloBCy(s_p,p,si,id);
-		     } else {
-			perBCy(s_p,si);
-		}
-	   } 
-           else {
-			if(jNum==0) {
-				topBCyCpy(s_p,p,si,id); 
-			        perBCyBot(s_p,p,si,id);
-				
-				
-			} else if (jNum==nDivY-1) {
-				botBCyCpy(s_p,p,si,id); 
-						perBCyTop(s_p,p,si,id);
-				
-			} else {
-				topBCyCpy(s_p,p,si,id); botBCyCpy(s_p,p,si,id);
-			}
-		}
-	}
 
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n) {
+
+	perBCyTop(s_p,p,si,id);
 
+}
 
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n) {
 
-	__syncthreads();
+	perBCyBot(s_p,p,si,id);
 
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber3(myprec *s_f, myprec *s_g,
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCyNumber3(myprec *s_f, myprec *s_g,
 																		  myprec *f,   myprec *g,
-																		  Indices id, int si, int n, int jNum) {
-	if (id.tiy < stencilSize) {
-	    if(nDivY==1) {
-		     if(multiGPU) {
-			haloBCy(s_f,f,si,id); haloBCy(s_g,g,si,id); 
-		     } else {
-			perBCy(s_f,si);	perBCy(s_g,si); 
-		}
-	   } 
-           else {
-			if(jNum==0) {
-				topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); 						perBCyBot(s_f,f,si,id);
-						perBCyBot(s_g,g,si,id);
-				
-				
-			} else if (jNum==nDivY-1) {
-				botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id); 						perBCyTop(s_f,f,si,id);
-						perBCyTop(s_g,g,si,id);
-				
-			} else {
-				topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); 				botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id);
-			}
-		}
+																		  Indices id, int si, int n) {
+	perBCyTop(s_f,f,si,id);
+	perBCyTop(s_g,g,si,id);
 	}
 
-	__syncthreads();
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCyNumber3(myprec *s_f, myprec *s_g,
+																		  myprec *f,   myprec *g,
+																		  Indices id, int si, int n) {
+	perBCyBot(s_f,f,si,id);
+	perBCyBot(s_g,g,si,id);
+
 }
 
+
 #endif /* BOUNDARY_CONDITION_H_ */
diff --git a/src/boundary_condition_z.h b/src/boundary_condition_z.h
index 11f0769..04c91d2 100644
--- a/src/boundary_condition_z.h
+++ b/src/boundary_condition_z.h
@@ -4,331 +4,223 @@
 #include "boundary.h"
 #include "sponge.h"
 
-extern __device__ __forceinline__ void BCzderVel(myprec s_f[mz+stencilSize*2][lPencils], myprec *f, myprec *fref, Indices id, int si, int i, int j, int kNum);
+extern __device__ __forceinline__ void TopBCzderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		                                            Indices id, int si, int m, int bctype);
+extern __device__ __forceinline__ void BotBCzderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		                                            Indices id, int si, int m, int bctype, recycle rec);
 
-extern __device__ __forceinline__ void BCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+
+extern __device__ __forceinline__ void TopBCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
 												  myprec *s_m, myprec *s_dil,
 												  myprec *u, myprec *v, myprec *w,
 												  myprec *m, myprec *dil,
-												  Indices id, int si, int n, int kNum);
+												  Indices id, int si, int n, int bctype);
+extern __device__ __forceinline__ void BotBCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+												  myprec *s_m, myprec *s_dil,
+												  myprec *u, myprec *v, myprec *w,
+												  myprec *m, myprec *dil,
+												  Indices id, int si, int n, int bctype, recycle rec);
+
 
-extern __device__ __forceinline__ void BCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int kNum);
+extern __device__ __forceinline__ void TopBCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int bctype);
+extern __device__ __forceinline__ void BotBCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int bctype, recycle rec);
 
-extern __device__ __forceinline__ void BCzNumber3(myprec *s_l, myprec *s_t,
+
+
+extern __device__ __forceinline__ void TopBCzNumber3(myprec *s_l, myprec *s_t,
+												  myprec *l,   myprec *t,
+												  Indices id, int si, int n, int bctype);
+extern __device__ __forceinline__ void BotBCzNumber3(myprec *s_l, myprec *s_t,
 												  myprec *l,   myprec *t,
-												  Indices id, int si, int n, int kNum);
+												  Indices id, int si, int n, int bctype, recycle rec);
 
-extern __device__ __forceinline__ void BCzNumber4(myprec *s_r, myprec *s_h,
+
+extern __device__ __forceinline__ void TopBCzNumber4(myprec *s_r, myprec *s_h,
+												  myprec *r,   myprec *h,
+												  Indices id, int si, int n, int bctype);
+extern __device__ __forceinline__ void BotBCzNumber4(myprec *s_r, myprec *s_h,
 												  myprec *r,   myprec *h,
-												  Indices id, int si, int n, int kNum);
-
-__device__ __forceinline__ __attribute__((always_inline)) void BCzderVel(myprec s_f[mz/nDivZ+stencilSize*2][lPencils], myprec *f, myprec *fref, Indices id, int si, int i, int j, int kNum) {
-	int sk = id.tiy + stencilVisc;
-	if (sk < stencilVisc*2) {
-	if(nDivZ == 1) {
-		if(multiGPU) {
-			int k = sk - stencilVisc;
-			s_f[sk-stencilVisc][si]  = f[mx*my*mz + 2*stencilSize*mx*mz + k + i*stencilSize + j*mx*stencilSize];
-			s_f[sk+mz][si]           = f[mx*my*mz + 2*stencilSize*mx*mz + stencilSize*mx*my + k + i*stencilSize + j*mx*stencilSize];
-		} else {
-			if(boundaryLayer) {
-				//extrapolation
-				s_f[sk+mz][si]           = 2.0*s_f[mz+stencilVisc-1][si] - s_f[mz+2*stencilVisc-sk-2][si];
-				//extrapolation on reference solution
-				//				s_f[sk-stencilVisc][si]  = 2.0*fref[i]              - s_f[3*stencilVisc-sk-1][si];
-				//extrapolation
-				s_f[sk-stencilVisc][si]  = 2.0*s_f[stencilVisc][si] - s_f[3*stencilVisc-sk][si];
-			} else {
-				//periodic boundary condition
-				s_f[sk-stencilVisc][si]  = s_f[sk+mz-stencilVisc][si];
-				s_f[sk+mz][si]           = s_f[sk][si];
-			}
-		}
-	} else {
-			int k = sk - stencilVisc + kNum*mz/nDivZ;
-			if(kNum==0) {
-				s_f[sk+mz/nDivZ][si] = f[idx(i,j,k+mz/nDivZ)];
-				if(multiGPU) {
-					s_f[sk-stencilVisc][si]  = f[mx*my*mz + 2*stencilSize*mx*mz + k + i*stencilSize + j*mx*stencilSize];
-				} else {
-					if(boundaryLayer) {
-						s_f[sk-stencilVisc][si]  = 2.0*s_f[stencilVisc][si] - s_f[3*stencilVisc-sk][si];
-					} else {
-						s_f[sk-stencilVisc][si]  = f[idx(i,j,k+mz-stencilVisc)];
-					}
-				}
-			} else if (kNum==nDivZ-1) {
-				s_f[sk-stencilVisc][si]  = f[idx(i,j,k-stencilVisc)];
-				if(multiGPU) {
-					s_f[sk+mz/nDivZ][si]       = f[mx*my*mz + 2*stencilSize*mx*mz + stencilSize*mx*my + k + i*stencilSize + j*mx*stencilSize];
-				} else {
-					if(boundaryLayer) {
-						s_f[sk+mz/nDivZ][si]   = 2.0*s_f[mz/nDivZ+stencilVisc-1][si] - s_f[mz/nDivZ+2*stencilVisc-sk-2][si];
-					} else {
-						s_f[sk+mz/nDivZ][si]   = f[idx(i,j,sk-stencilVisc)];
-					}
-				}
-			} else {
-				s_f[sk-stencilVisc][si]  = f[idx(i,j,k-stencilVisc)];
-				s_f[sk+mz/nDivZ][si]     = f[idx(i,j,k+mz/nDivZ)];
-			}
-		}
+												  Indices id, int si, int n, int bctype, recycle rec);
+
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype) {
+
+	if (bctype == 1) {
+		perBCzTop(s_u,u,si,id);
+		perBCzTop(s_v,v,si,id);
+		perBCzTop(s_w,w,si,id);
+
+	}else if (bctype == 4) {
+		BCzExtrapolate(s_u,si);
+		BCzExtrapolate(s_v,si);
+		BCzExtrapolate(s_w,si);
+	}  // add more else ifs here
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzderVel(myprec *s_u, myprec *s_v, myprec *s_w, myprec *u, myprec *v, myprec *w,
+		Indices id, int si, int m, int bctype, recycle rec) {
+
+	if (bctype == 1) {
+		perBCzBot(s_u,u,si,id);
+		perBCzBot(s_v,v,si,id);
+		perBCzBot(s_w,w,si,id);
+
+	} else if (bctype == 5) {
+	
+		InflowBCzBot(s_u,rec.RRu,si,id);
+		InflowBCzBot(s_v,rec.RRv,si,id);
+		InflowBCzBot(s_w,rec.RRw,si,id);
+	
+	}  // add more else ifs here
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+																		  myprec *s_m, myprec *s_dil,
+																		  myprec *u, myprec *v, myprec *w,
+																		  myprec *m, myprec *dil,
+																		  Indices id, int si, int n, int bctype) {
+	if (bctype == 1) {
+
+		perBCzTop(s_u,u,si,id);
+		perBCzTop(s_v,v,si,id);
+		perBCzTop(s_w,w,si,id);
+		perBCzTop(s_m,m,si,id);
+		perBCzTop(s_dil,dil,si,id);
+
+	} else if (bctype == 4) {
+
+		BCzExtrapolate(s_u,si);
+		BCzExtrapolate(s_v,si);
+		BCzExtrapolate(s_w,si);
+		BCzExtrapolate(s_m,si);
+		BCzExtrapolate(s_dil,si);
+
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzNumber1(myprec *s_u, myprec *s_v, myprec *s_w,
 																		  myprec *s_m, myprec *s_dil,
 																		  myprec *u, myprec *v, myprec *w,
 																		  myprec *m, myprec *dil,
-																		  Indices id, int si, int n, int kNum) {
-
-	if (id.tiy < stencilSize) {
-		if(nDivZ==1) {
-			if(multiGPU) {
-				haloBCz(s_u,u,si,id); haloBCz(s_v,v,si,id); haloBCz(s_w,w,si,id);
-				haloBCz(s_m,m,si,id); haloBCz(s_dil,dil,si,id);
-			} else {
-				if(boundaryLayer) {
-					topBCzExt(s_u,si);
-					topBCzExt(s_v,si);
-					topBCzExt(s_w,si);
-					topBCzExt(s_m,si);
-					topBCzExt(s_dil,si);
-					//				botBCzInit(s_u,uInit,si,id.i);
-					//				botBCzInit(s_v,vInit,si,id.i);
-					//				botBCzInit(s_w,wInit,si,id.i);
-					//				botBCzInit(s_m,mInit,si,id.i);
-					botBCzExt(s_u,si);
-					botBCzExt(s_v,si);
-					botBCzExt(s_w,si);
-					botBCzExt(s_m,si);
-					botBCzExt(s_dil,si);
-				} else {
-					perBCz(s_u,si);	perBCz(s_v,si); perBCz(s_w,si);
-					perBCz(s_m,si); perBCz(s_dil,si);
-				}
-			}
-		} else {
-			if(kNum==0) {
-				topBCzCpy(s_u,u,si,id); topBCzCpy(s_v,v,si,id); topBCzCpy(s_w,w,si,id);
-				topBCzCpy(s_m,m,si,id); topBCzCpy(s_dil,dil,si,id);
-				if(multiGPU) {
-					haloBCzBot(s_u,u,si,id); haloBCzBot(s_v,v,si,id); haloBCzBot(s_w,w,si,id);
-					haloBCzBot(s_m,m,si,id); haloBCzBot(s_dil,dil,si,id);
-				} else {
-					if(boundaryLayer) {
-						botBCzExt(s_u,si);
-						botBCzExt(s_v,si);
-						botBCzExt(s_w,si);
-						botBCzExt(s_m,si);
-						botBCzExt(s_dil,si);
-					} else {
-						perBCzBot(s_u,u,si,id);
-						perBCzBot(s_v,v,si,id);
-						perBCzBot(s_w,w,si,id);
-						perBCzBot(s_m,m,si,id);
-						perBCzBot(s_dil,dil,si,id);
-					}
-				}
-			} else if (kNum==nDivZ-1) {
-				botBCzCpy(s_u,u,si,id); botBCzCpy(s_v,v,si,id); botBCzCpy(s_w,w,si,id);
-				botBCzCpy(s_m,m,si,id); botBCzCpy(s_dil,dil,si,id);
-				if(multiGPU) {
-					haloBCzTop(s_u,u,si,id); haloBCzTop(s_v,v,si,id); haloBCzTop(s_w,w,si,id);
-					haloBCzTop(s_m,m,si,id); haloBCzTop(s_dil,dil,si,id);
-				} else {
-					if(boundaryLayer) {
-						topBCzExt(s_u,si);
-						topBCzExt(s_v,si);
-						topBCzExt(s_w,si);
-						topBCzExt(s_m,si);
-						topBCzExt(s_dil,si);
-					} else {
-						perBCzTop(s_u,u,si,id);
-						perBCzTop(s_v,v,si,id);
-						perBCzTop(s_w,w,si,id);
-						perBCzTop(s_m,m,si,id);
-						perBCzTop(s_dil,dil,si,id);
-					}
-				}
-			} else {
-				topBCzCpy(s_u,u,si,id); topBCzCpy(s_v,v,si,id); topBCzCpy(s_w,w,si,id);
-				topBCzCpy(s_m,m,si,id); topBCzCpy(s_dil,dil,si,id);
-				botBCzCpy(s_u,u,si,id); botBCzCpy(s_v,v,si,id); botBCzCpy(s_w,w,si,id);
-				botBCzCpy(s_m,m,si,id); botBCzCpy(s_dil,dil,si,id);
-			}
-		}
+																		  Indices id, int si, int n, int bctype, recycle rec) {
+	if (bctype == 1) {
+
+		perBCzBot(s_u,u,si,id);
+		perBCzBot(s_v,v,si,id);
+		perBCzBot(s_w,w,si,id);
+		perBCzBot(s_m,m,si,id);
+		perBCzBot(s_dil,dil,si,id);
+
+	} else if (bctype == 5) {
+
+		InflowBCzBot(s_u,rec.RRu,si,id);
+		InflowBCzBot(s_v,rec.RRv,si,id);
+		InflowBCzBot(s_w,rec.RRw,si,id);
+		InflowBCzBot(s_m,rec.RRm,si,id);
+		BotBCzMir(s_dil,si);
+	}
+
+}
+
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int bctype) {
+
+	if (bctype == 1) {
+
+		perBCzTop(s_p,p,si,id);
+
+	} else if (bctype == 4) {
+
+		BCzExtrapolate(s_p,si);
+
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int kNum) {
-	if (id.tiy < stencilSize) {
-		if(nDivZ==1) {
-			if(multiGPU) {
-				haloBCz(s_p,p,si,id);
-			} else {
-				if(boundaryLayer) {
-					topBCzExt(s_p,si);
-					botBCzExt(s_p,si);
-					//				botBCzInit(s_p,pInit,si,id.i);
-				} else {
-					perBCz(s_p,si);
-				}
-			}
-		} else {
-			if(kNum==0) {
-				topBCzCpy(s_p,p,si,id);
-				if(multiGPU) {
-					haloBCzBot(s_p,p,si,id);
-				} else {
-					if (boundaryLayer) {
-						botBCzExt(s_p,si);
-					} else {
-						perBCzBot(s_p,p,si,id);
-					}
-				}
-			} else if (kNum==nDivZ-1) {
-				botBCzCpy(s_p,p,si,id);
-				if(multiGPU) {
-					haloBCzTop(s_p,p,si,id);
-				} else {
-					if (boundaryLayer) {
-						topBCzExt(s_p,si);
-					} else {
-						perBCzTop(s_p,p,si,id);
-					}
-				}
-			} else {
-				topBCzCpy(s_p,p,si,id);
-				botBCzCpy(s_p,p,si,id);
-			}
-		}
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int bctype, recycle rec) {
+
+	if (bctype == 1) {
+
+		perBCzBot(s_p,p,si,id);
+
+	} else if (bctype == 5) {
+
+		InflowBCzBot(s_p,rec.RRp,si,id);
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCzNumber3(myprec *s_l, myprec *s_t,
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzNumber3(myprec *s_l, myprec *s_t,
 																		  myprec *l,   myprec *t,
-																		  Indices id, int si, int n, int kNum) {
-	if (id.tiy < stencilSize) {
-		if(nDivZ==1) {
-			if(multiGPU) {
-				haloBCz(s_l,l,si,id); haloBCz(s_t,t,si,id);
-			} else {
-				if(boundaryLayer) {
-					topBCzExt(s_l,si);
-					topBCzExt(s_t,si);
-					botBCzExt(s_l,si);
-					botBCzExt(s_t,si);
-					//				botBCzInit(s_l,lInit,si,id.i);
-					//				botBCzInit(s_t,tInit,si,id.i);
-				} else {
-					perBCz(s_l,si); perBCz(s_t,si);
-				}
-			}
-		} else {
-			if(kNum==0) {
-				topBCzCpy(s_l,l,si,id);
-				topBCzCpy(s_t,t,si,id);
-				if(multiGPU) {
-					haloBCzBot(s_l,l,si,id);
-					haloBCzBot(s_t,t,si,id);
-				} else {
-					if (boundaryLayer) {
-						botBCzExt(s_l,si);
-						botBCzExt(s_t,si);
-					} else {
-						perBCzBot(s_l,l,si,id);
-						perBCzBot(s_t,t,si,id);
-					}
-				}
-			} else if (kNum==nDivZ-1) {
-				botBCzCpy(s_l,l,si,id);
-				botBCzCpy(s_t,t,si,id);
-				if(multiGPU) {
-					haloBCzTop(s_l,l,si,id);
-					haloBCzTop(s_t,t,si,id);
-				} else {
-					if (boundaryLayer) {
-						topBCzExt(s_l,si);
-						topBCzExt(s_t,si);
-					} else {
-						perBCzTop(s_l,l,si,id);
-						perBCzTop(s_t,t,si,id);
-					}
-				}
-			} else {
-				topBCzCpy(s_l,l,si,id);
-				topBCzCpy(s_t,t,si,id);
-				botBCzCpy(s_l,l,si,id);
-				botBCzCpy(s_t,t,si,id);
-			}
-		}
+																		  Indices id, int si, int n, int bctype) {
+	if (bctype == 1) {
+
+		perBCzTop(s_l,l,si,id);
+		perBCzTop(s_t,t,si,id);
+
+	} else if (bctype == 4) {
+
+		BCzExtrapolate(s_l,si);
+		BCzExtrapolate(s_t,si);
+
 	}
-	__syncthreads();
+
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void BCzNumber4(myprec *s_r, myprec *s_h,
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzNumber3(myprec *s_l, myprec *s_t,
+																		  myprec *l,   myprec *t,
+																		  Indices id, int si, int n, int bctype, recycle rec) {
+	if (bctype == 1) {
+
+		perBCzBot(s_l,l,si,id);
+		perBCzBot(s_t,t,si,id);
+
+	} else if (bctype == 5) {
+
+		InflowBCzBot(s_l,rec.RRl,si,id);
+		InflowBCzBot(s_t,rec.RRt,si,id);
+
+	}
+
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void TopBCzNumber4(myprec *s_r, myprec *s_h,
 																		  myprec *r,   myprec *h,
-																		  Indices id, int si, int n, int kNum) {
-	if (id.tiy < stencilSize) {
-		if(nDivZ==1) {
-			if(multiGPU) {
-				haloBCz(s_h,h,si,id); haloBCz(s_r,r,si,id);
-			} else {
-				if(boundaryLayer) {
-					topBCzExt(s_r,si);
-					topBCzExt(s_h,si);
-					//				botBCzInit(s_r,rInit,si,id.i);
-					//				botBCzInit(s_h,hInit,si,id.i);
-					botBCzExt(s_r,si);
-					botBCzExt(s_h,si);
-				} else {
-					perBCz(s_r,si); perBCz(s_h,si);
-				}
-			}
-		} else {
-			if(kNum==0) {
-				topBCzCpy(s_r,r,si,id);
-				topBCzCpy(s_h,h,si,id);
-				if(multiGPU) {
-					haloBCzBot(s_r,r,si,id);
-					haloBCzBot(s_h,h,si,id);
-				} else {
-					if (boundaryLayer) {
-						botBCzExt(s_r,si);
-						botBCzExt(s_h,si);
-					} else {
-						perBCzBot(s_r,r,si,id);
-						perBCzBot(s_h,h,si,id);
-					}
-				}
-			} else if (kNum==nDivZ-1) {
-				botBCzCpy(s_r,r,si,id);
-				botBCzCpy(s_h,h,si,id);
-				if(multiGPU) {
-					haloBCzTop(s_r,r,si,id);
-					haloBCzTop(s_h,h,si,id);
-				} else {
-					if (boundaryLayer) {
-						topBCzExt(s_r,si);
-						topBCzExt(s_h,si);
-					} else {
-						perBCzTop(s_r,r,si,id);
-						perBCzTop(s_h,h,si,id);
-					}
-				}
-			} else {
-				topBCzCpy(s_r,r,si,id);
-				topBCzCpy(s_h,h,si,id);
-				botBCzCpy(s_r,r,si,id);
-				botBCzCpy(s_h,h,si,id);
-			}
-		}
+																		  Indices id, int si, int n, int bctype) {
+	if (bctype == 1) {
+
+		perBCzTop(s_r,r,si,id);
+		perBCzTop(s_h,h,si,id);
+
+	} else if (bctype == 4) {
+
+		BCzExtrapolate(s_r,si);
+		BCzExtrapolate(s_h,si);
+
 	}
-	__syncthreads();
+
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void BotBCzNumber4(myprec *s_r, myprec *s_h,
+																		  myprec *r,   myprec *h,
+																		  Indices id, int si, int n, int bctype, recycle rec) {
+	if (bctype == 1) {
+
+		perBCzBot(s_r,r,si,id);
+		perBCzBot(s_h,h,si,id);
+
+	} else if (bctype == 5) {
+
+		InflowBCzBot(s_r,rec.RRr,si,id);
+		InflowBCzBot(s_h,rec.RRh,si,id);
+
+	}
+
 }
 
 
diff --git a/src/calc_stress.cu b/src/calc_stress.cu
index 5fcff95..9409ad4 100644
--- a/src/calc_stress.cu
+++ b/src/calc_stress.cu
@@ -19,32 +19,6 @@ __global__ void deviceCalcPress(myprec *a, myprec *b, myprec *c) {
 	*a = 0.99*(*b) - 0.5*( *a/(*c) - 1 ); //*a/(*c)-1 //a here is rw_bulk // changed from rw_bulk/r_bulk to rw_bulk
 }
 
-__global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *dvdx, myprec *dwdx) {
-
-	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-	id.mkidX();
-
-	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
-	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
-
-	int si = id.i + stencilSize;
-	int sj = id.tiy;
-
-	s_u[sj][si] = u[id.g];
-	s_v[sj][si] = v[id.g];
-	s_w[sj][si] = w[id.g];
-	__syncthreads();
-
-	BCxderVel(s_u[sj],s_v[sj],s_w[sj],id,si,mx);
-	__syncthreads();
-
-	myprec wrk1;
-	derDevSharedV1x(&wrk1,s_u[sj],si); dudx[id.g] = wrk1;
-	derDevSharedV1x(&wrk1,s_v[sj],si); dvdx[id.g] = wrk1;
-	derDevSharedV1x(&wrk1,s_w[sj],si); dwdx[id.g] = wrk1;
-
-}
 
 __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy) {
 
@@ -65,18 +39,49 @@ __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *d
 
        myprec wrk1=0;
        myprec wrk2=0;
-       //myprec wrk3=0;
 
       __shared__ myprec s_u[mx/nDivX][my/nDivY+stencilSize*2];
       __shared__ myprec s_v[mx/nDivX][my/nDivY+stencilSize*2];
-      //__shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2];
       s_u[sj][si] = u[id.g];
       s_v[sj][si] = v[id.g];
-      //s_w[sj][si] = w[id.g];
       __syncthreads();
-      BCyNumber2(s_u[sj],u,id,si,my,jNum);
-      BCyNumber2(s_v[sj],v,id,si,my,jNum);
-      //BCyNumber2(s_w[sj],w,id,si,my,jNum);
+
+
+	if (id.tiy < stencilSize){
+		if (nDivY ==1){
+			if(pRow>1) {
+				haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id);
+
+				haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id);
+			} else{
+  			TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); 
+  			BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); 
+			}
+		} else {
+			if (jNum ==0){
+				TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); 
+
+				if(pRow>1) {
+
+					haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id);
+				} else{
+  			BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); 
+				}
+			} else if (jNum == nDivY-1){
+				BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); 
+				if(pRow>1) {
+					haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); 
+
+				} else {
+  			TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); 
+				}
+			} else {
+				TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); 
+
+				BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); 
+			}
+		}
+	}
       __syncthreads();
 // COMP Tile sj1 and si1
       derDevSharedV1y(&wrk1,s_u[sj1],si1);
@@ -98,7 +103,7 @@ __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *d
 	//derDevV1yL(dwdy,w,id);*/
 }
 
-__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz) {
+__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, recycle rec, Communicator rk) {
 
        Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
        int kNum = blockIdx.z;
@@ -126,10 +131,78 @@ __global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *d
       s_v[sj][si] = v[id.g];
       s_w[sj][si] = w[id.g];
       __syncthreads();
-      BCzNumber2(s_u[sj],u,id,si,mz,kNum);
-      BCzNumber2(s_v[sj],v,id,si,mz,kNum);
-      BCzNumber2(s_w[sj],w,id,si,mz,kNum);
+
+
+
+	if (id.tiy < stencilSize){
+		if (nDivZ ==1){
+			if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot..
+				if (inletbc == 1 || outletbc ==1 ){ // if periodic
+					haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+
+					haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+				} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+					if (rk.km == pCol-2){ // last block
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+
+    		  TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc);
+					} else if (rk.kp == 1) {// first block
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+
+    		  BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec);
+					} else { // all internal blocks
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+					}
+				}
+
+			} else{
+    		  TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc);
+    		  BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec);
+			}
+		} else {
+			if (kNum ==0){
+				TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.kp == 1){ // first block
+    		  BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec);
+						} else { // all other blocks
+							haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+						}
+					}
+
+				} else{
+    		  BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec);
+				}
+			} else if (kNum == nDivZ-1){
+				BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.km == pCol -2 ){ // last block
+    		  TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc);
+						} else { // all other blocks
+							haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+						}
+					}
+				} else{
+    		  TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc);
+				}
+			} else {
+				TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id);
+
+				BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id);
+			}
+		}
+	}
+
       __syncthreads();
+
 // COMP Tile sj1 and si1
       derDevSharedV1z(&wrk1,s_u[sj1],si1);
       derDevSharedV1z(&wrk2,s_v[sj1],si1);
@@ -232,14 +305,16 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v,
     myprec dtViscInv = 0.0;
 
     myprec ien = e[id.g]/r[id.g] - 0.5*(u[id.g]*u[id.g] + v[id.g]*v[id.g] + w[id.g]*w[id.g]);
-    myprec sos = pow(gam*(gam-1)*ien,0.5);
+    myprec sos = pow(abs(gam*(gam-1)*ien),0.5);
 
     myprec dx,d2x;
     dx = d_dxv[id.i];
     d2x = dx*dx;
 
+    myprec nu = mu[id.g]/r[id.g];
+
     dtConvInv =  MAX( (abs(u[id.g]) + sos)/dx, MAX( (abs(v[id.g]) + sos)*d_dy, (abs(w[id.g]) + sos)*d_dz) );
-    dtViscInv =  MAX( mu[id.g]/d2x, MAX( mu[id.g]*d_d2y, mu[id.g]*d_d2z) );
+    dtViscInv =  MAX( nu/d2x, MAX( nu*d_d2y, nu*d_d2z) );
 
     wrkArray[id.g] = CFL/MAX(dtConvInv, dtViscInv);
     __syncthreads();
@@ -265,8 +340,6 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre
 		checkCuda( cudaMemcpy(hostWork, rbulk, sizeof(myprec), cudaMemcpyDeviceToHost) );
 		allReduceSum(hostWork,1);
 		checkCuda( cudaMemcpy(rbulk, hostWork, sizeof(myprec), cudaMemcpyHostToDevice) );
-		if(rk.rank==0) printf("step number %d with %le %le %3.10le\n",nsteps*(file-1) + istep ,*dtC,*dpdz, *hostWork);
-
 
 		deviceMul<<<gr0,bl0>>>(workA,r,w);
 		hostVolumeIntegral(par1,workA,rk);
@@ -290,6 +363,7 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre
 	checkCuda( cudaFree(workA) );
 	checkCuda( cudaFree(rbulk) );
 
+	if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file-1) + istep ,*dtC,*dpdz);
 
 
 }
diff --git a/src/comm.cpp b/src/comm.cpp
index 0d29ee8..a53037b 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -19,6 +19,8 @@ myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm;
 myprec *senYp5,*senYm5,*senZp5,*senZm5;
 myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5;
 
+MPI_Comm comm_col;
+
 void updateHaloTest(myprec *var, Communicator rk) {
 
 	int ierr;
@@ -133,6 +135,30 @@ void updateHaloFive(myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Commu
 	fillBoundariesFive(rcvYm5,rcvYp5,rcvZm5,rcvZp5,r,u,v,w,e,1,rk);
 }
 
+void TransferToInlet(myprec *recy_r, myprec *recy_u,myprec *recy_v, myprec *recy_w, myprec *recy_t, Communicator rk) {
+
+    int ierr;
+    MPI_Status status;
+
+    if ( (rk.kstart <= recstn) && ( rk.kend >= recstn) ) { //The processor row consisting of recycling station co-od = [y=0:pRow-1, z=Recyc]
+        ierr = MPI_Send(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 0, MPI_COMM_WORLD); // A processor with [5, Recyc] will send data to [5, 0]
+        ierr = MPI_Send(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 1, MPI_COMM_WORLD);
+        ierr = MPI_Send(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 2, MPI_COMM_WORLD);
+        ierr = MPI_Send(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 3, MPI_COMM_WORLD);
+        ierr = MPI_Send(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 4, MPI_COMM_WORLD);
+    }
+
+    if (rk.kstart == 0) { //co-od = [y=0:pRow-1, z=0]
+        ierr = MPI_Recv(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 0, MPI_COMM_WORLD, &status);// A processor with [8, 0] will recieve data from [8, Recyc]
+        ierr = MPI_Recv(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 1, MPI_COMM_WORLD, &status);
+        ierr = MPI_Recv(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 2, MPI_COMM_WORLD, &status);
+        ierr = MPI_Recv(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 3, MPI_COMM_WORLD, &status);
+        ierr = MPI_Recv(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 4, MPI_COMM_WORLD, &status);
+    }
+
+    ierr = MPI_Barrier(MPI_COMM_WORLD);
+}
+
 long int nameToHash(char *name, int length) {
 	long int hash = 0;
 	for (int i=0; i<length; i++) {
@@ -143,63 +169,95 @@ long int nameToHash(char *name, int length) {
 
 void splitComm(Communicator *rk, int myRank) {
 
-	rk->myRank(myRank);
-
-	const int dimens[]  = {pRow,pCol};
-	const int periods[] = {1,1};
-	int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2];
-	int ierr;
-
-	MPI_Comm comm_cart;
-
-	ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart);
-	ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord);
-
-	cZPos[0] = coord[0];
-	cZNeg[0] = coord[0];
-	cYPos[1] = coord[1];
-	cYNeg[1] = coord[1];
-
-	cZPos[1] = coord[1]+1;
-	if(cZPos[1]>pCol-1)
-		cZPos[1] = 0;
-
-	cZNeg[1] = coord[1]-1;
-	if(cZNeg[1]<0)
-		cZNeg[1] = pCol-1;
-
-	cYPos[0] = coord[0]+1;
-	if(cYPos[0]>pRow-1)
-		cYPos[0] = 0;
-
-	cYNeg[0] = coord[0]-1;
-	if(cYNeg[0]<0)
-		cYNeg[0] = pRow-1;
+    rk->myRank(myRank);
+
+    const int dimens[]  = {pRow,pCol};
+    const int periods[] = {1,1};
+    int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2], cZSenRR[2], cZRecRR[2], color;
+    int ierr;
+
+    MPI_Comm comm_cart;
+
+    ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart);
+    ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord);
+
+    cZPos[0] = coord[0];
+    cZNeg[0] = coord[0];
+    cYPos[1] = coord[1];
+    cYNeg[1] = coord[1];
+
+    cZPos[1] = coord[1]+1;
+    if(cZPos[1]>pCol-1)
+        cZPos[1] = 0;
+
+    cZNeg[1] = coord[1]-1;
+    if(cZNeg[1]<0)
+        cZNeg[1] = pCol-1;
+
+    cYPos[0] = coord[0]+1;
+    if(cYPos[0]>pRow-1)
+        cYPos[0] = 0;
+
+    cYNeg[0] = coord[0]-1;
+    if(cYNeg[0]<0)
+        cYNeg[0] = pRow-1;
+
+    ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp);
+    ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km);
+    ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp);
+    ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm);
+
+    rk->jstart = coord[0]*my;
+    rk->jend   = rk->jstart + my;
+    rk->kstart = coord[1]*mz;
+    rk->kend   = rk->kstart + mz;
+
+    int kr1,kr2;
+    int RecyZRank = 0;
+    for(int i = 0; i<pCol; i++) {
+        kr1 = i*mz;
+        kr2 = (i+1)*mz;
+        if ((kr1<=recstn) && (kr2 >= recstn)) {
+            RecyZRank = i;
+        }
+
+    }
+
+    cZSenRR[0] = coord[0];
+    cZSenRR[1] = RecyZRank;
+    cZRecRR[0] = coord[0];
+    cZRecRR[1] = 0;
+
+    ierr = MPI_Cart_rank(comm_cart, cZSenRR, &rk->coodsend);
+    ierr = MPI_Cart_rank(comm_cart, cZRecRR, &rk->coodrecv);
+    
+
+    long int hash;
+    MPI_Comm comm_node;
+    int procnamesize;
+    char procname[MPI_MAX_PROCESSOR_NAME];
+
+    ierr = MPI_Get_processor_name(procname, &procnamesize);
+
+    hash = (rk->rank)/GPUperNode ; // nameToHash(procname, procnamesize);
+
+    ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node);
+    ierr = MPI_Comm_rank(comm_node, &rk->nodeRank);
+    
+   // MPI_Comm comm_col; // All rows in a particular column. Used to communicate across Y direction. [0:pRow-1, z]
+    color = coord[1]; // All processors in a particular column will be bundled in the communicator called comm_col; 
+    ierr = MPI_Comm_split(MPI_COMM_WORLD, color, rk->rank, &comm_col); // refer https://mpitutorial.com/tutorials/introduction-to-groups-and-communicators/
+    int rankcol;
+    ierr = MPI_Comm_rank(comm_col, &rankcol);
+    
+    ierr = MPI_Barrier(MPI_COMM_WORLD);
+    printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank);
+    ierr = MPI_Barrier(MPI_COMM_WORLD);
+    printf("SendingRank %d, Recieving Rank %d\n",rk->coodsend,rk->coodrecv);
+    ierr = MPI_Barrier(MPI_COMM_WORLD);
+    printf("Neighbours %d\n",rk -> km);
+    ierr = MPI_Barrier(MPI_COMM_WORLD);
 
-	ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp);
-	ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km);
-	ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp);
-	ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm);
-
-	rk->jstart = coord[0]*my;
-	rk->jend   = rk->jstart + my;
-	rk->kstart = coord[1]*mz;
-	rk->kend   = rk->kstart + mz;
-
-	long int hash;
-	MPI_Comm comm_node;
-	int procnamesize;
-	char procname[MPI_MAX_PROCESSOR_NAME];
-
-	ierr = MPI_Get_processor_name(procname, &procnamesize);
-
-	hash = 1; // nameToHash(procname, procnamesize);
-
-	ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node);
-	ierr = MPI_Comm_rank(comm_node , &rk->nodeRank);
-	ierr = MPI_Barrier(MPI_COMM_WORLD);
-	printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank);
-	ierr = MPI_Barrier(MPI_COMM_WORLD);
 }
 
 void saveFileMPI(char filename, int timestep,  double *var, Communicator rk) {
@@ -277,6 +335,144 @@ void readFileMPI(char filename, int timestep,  double *var, Communicator rk) {
 				var[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot];
 	delete [] var_tot;
 }
+void readFileMPIInit(char filename, int timestep,  double *r,  double *u,  double *v,  double *w,  double *ret, Communicator rk){
+	double *var_tot = new double[mx_tot*my_tot*mz_tot*5];
+	int ierr;
+	int lSize = mx_tot*my_tot*mz_tot*5;
+
+	if(rk.rank==0) {
+		char str[80];
+		size_t result;
+		sprintf(str, "fields/%c.%07d.bin",filename,timestep);
+		FILE *fb = fopen(str,"rb");
+		result = fread(var_tot , sizeof(myprec) , lSize , fb );
+		fclose(fb);
+	}
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	for (int m=0; m<5; m++){
+	for (int k=rk.kstart; k<rk.kend; k++)
+		for (int j=rk.jstart; j<rk.jend; j++)
+			for (int i=0; i<mx; i++)
+				if (m==0){
+				r[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot + m*mx_tot*my_tot*mz_tot];
+	                        } else if(m==1) {
+
+				w[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot + m*mx_tot*my_tot*mz_tot];
+	                        } else if(m==2) {
+
+				u[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot + m*mx_tot*my_tot*mz_tot];
+	                        } else if(m==3) {
+
+				v[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot + m*mx_tot*my_tot*mz_tot];
+	                        } else if(m==4) {
+
+				ret[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot + m*mx_tot*my_tot*mz_tot];
+				}	
+	}
+	delete [] var_tot;
+}
+
+
+void readFileInitBL(char filename,  double *var, Communicator rk) {
+	double *var_tot = new double[mx_tot*(mz_tot+stencilSize)];
+	int ierr;
+	int lSize = mx_tot*(mz_tot+stencilSize);
+
+	if(rk.rank==0) {
+		char str[80];
+		size_t result;
+		sprintf(str, "InitBL/%c.bin",filename);
+		FILE *fb = fopen(str,"rb");
+		result = fread(var_tot , sizeof(myprec) , lSize , fb);
+		fclose(fb);
+	}
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+	for (int k=0; k<mz; k++)
+		for (int j=0; j<my; j++)
+			for (int i=0; i<mx; i++)
+				var[i + (j)*mx + (k)*mx*my] = var_tot[i + (k+rk.kstart+stencilSize)*mx]; // stencil size is added because the first 3 entries (for stencil=3) belong to (z_inlet -1)
+// (z_inlet -2) and (z_inlet -3)
+	delete [] var_tot;
+}
+void readFileInitBL1D(char filename,  double *var, Communicator rk) {
+	double *var_tot = new double[mz_tot+stencilSize];
+	int ierr;
+	int lSize = mz_tot+stencilSize;
+
+	if(rk.rank==0) {
+		char str[80];
+		size_t result;
+		sprintf(str, "InitBL/%c.bin",filename);
+		FILE *fb = fopen(str,"rb");
+		result = fread(var_tot , sizeof(myprec) , lSize , fb);
+		fclose(fb);
+	}
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+	for (int k=0; k<mz; k++)
+		var[k] = var_tot[(k+rk.kstart+stencilSize)];
+	delete [] var_tot;
+}
+void readFileInitBL_inRR(char filename,  double *var, Communicator rk) {
+	double *var_tot = new double[mx_tot*(mz_tot+stencilSize)];
+	int ierr;
+	int lSize = mx_tot*(mz_tot+stencilSize);
+
+	if(rk.rank==0) {
+		char str[80];
+		size_t result;
+		sprintf(str, "InitBL/%c.bin",filename);
+		FILE *fb = fopen(str,"rb");
+		result = fread(var_tot , sizeof(myprec) , lSize , fb);
+		fclose(fb);
+	}
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+	for (int k=0; k<stencilSize; k++) // only 0-stencilsize-1 is important for the ghost cells update. REMEMBER The ghost cells mean the cells next to the physical inflow and not the inlet of the MPI processors. However, this operation is performed by all the processes which is not required. Ideally this operation should only be perfomred by only those processes that uinclude the recycling station as they are the ones who will actually use the ghost cells data to compute data that needs to be transferred to processes at inlet (u = um_inRR + u').  
+			for (int i=0; i<mx; i++)
+				var[i + (k)*mx] = var_tot[i + (k)*mx];
+	delete [] var_tot;
+}
+void readFileInitBL1D_inRR(char filename,  double *var, Communicator rk) {
+	double *var_tot = new double[mz_tot+stencilSize];
+	int ierr;
+	int lSize = mz_tot+stencilSize;
+
+	if(rk.rank==0) {
+		char str[80];
+		size_t result;
+		sprintf(str, "InitBL/%c.bin",filename);
+		FILE *fb = fopen(str,"rb");
+		result = fread(var_tot , sizeof(myprec) , lSize , fb);
+		fclose(fb);
+	}
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+	ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+	ierr = MPI_Barrier(MPI_COMM_WORLD);
+	for (int k=0; k<stencilSize; k++)
+		var[k] = var_tot[(k)];
+	delete [] var_tot;
+}
+
+
+
 
 void reduceArray(int rcvCore, myprec *sendArr, int sizeArr, Communicator rk) {
 	int ierr;
@@ -334,6 +530,19 @@ void allReduceSum(myprec *sendArr, int sizeArr) {
 	ierr = MPI_Barrier(MPI_COMM_WORLD);
 }
 
+void allReduceSumYavg(myprec *sendArr, int sizeArr) {
+	int ierr;
+	myprec tmpArr[sizeArr];
+
+	ierr = MPI_Allreduce(sendArr, tmpArr, sizeArr, MPI_myprec, MPI_SUM, comm_col);
+
+	for(int i=0; i<sizeArr; i++)
+		sendArr[i] = tmpArr[i];
+	ierr = MPI_Barrier(comm_col);
+}
+
+
+
 void broadCastArray(int bcstCore, double *sendArr, int sizeArr, Communicator rk) {
 	int ierr;
 
diff --git a/src/comm.h b/src/comm.h
index 716b4ed..fd9956c 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -21,7 +21,14 @@ void allReduceArray(myprec *sendArr, int sizeArr);
 void allReduceArrayDouble(double *sendArr, int sizeArr);
 void allReduceToMin(myprec *sendArr, int sizeArr);
 void allReduceSum(myprec *sendArr, int sizeArr);
+void allReduceSumYavg(myprec *sendArr, int sizeArr);
 void broadCastArray(int bcstCore, double *sendArr, int sizeArr, Communicator rk);
 void mpiBarrier();
+void TransferToInlet(myprec *recy_r, myprec *recy_u,myprec *recy_v, myprec *recy_w, myprec *recy_t, Communicator rk) ;
+void readFileInitBL(char filename, double *var, Communicator rk);
+void readFileInitBL1D(char filename, double *var, Communicator rk);
+void readFileInitBL_inRR(char filename, double *var, Communicator rk);
+void readFileInitBL1D_inRR(char filename, double *var, Communicator rk);
+void readFileMPIInit(char filename, int timestep,  double *r,  double *u,  double *v,  double *w,  double *ret, Communicator rk);
 
 #endif /* COMM_H_ */
diff --git a/src/cuda_derivs.cu b/src/cuda_derivs.cu
index 3cfcc37..e3e8560 100644
--- a/src/cuda_derivs.cu
+++ b/src/cuda_derivs.cu
@@ -75,7 +75,6 @@ __device__ void derDev1y(myprec *df, myprec *f, Indices id)
 	__syncthreads();
 
 }
-
 __device__ void derDev1z(myprec *df, myprec *f, Indices id)
 {
 
diff --git a/src/cuda_derivs.h b/src/cuda_derivs.h
index ad8040f..63284ea 100644
--- a/src/cuda_derivs.h
+++ b/src/cuda_derivs.h
@@ -26,6 +26,53 @@ extern __device__ __forceinline__ void fluxQuadSharedy(myprec *df, myprec *s_f,
 extern __device__ __forceinline__ void fluxCubeSharedy(myprec *df, myprec *s_f, myprec *s_g, myprec *s_h, int si);
 extern __device__ __forceinline__ void fluxQuadSharedz(myprec *df, myprec *s_f, myprec *s_g, int si);
 extern __device__ __forceinline__ void fluxCubeSharedz(myprec *df, myprec *s_f, myprec *s_g, myprec *s_h, int si);
+extern __device__ __forceinline__ void derShared1x_BD(myprec *df , myprec *s_f, int si);
+extern __device__ __forceinline__ void derShared1x_FD(myprec *df , myprec *s_f, int si);
+extern __device__ __forceinline__ void derShared1z_BD(myprec *df , myprec *s_f, int si);
+extern __device__ __forceinline__ void derShared1z_FD(myprec *df , myprec *s_f, int si);
+
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void derShared1x_BD(myprec *df, myprec *s_f, int si)
+{
+	*df = 1.5*s_f[si]-2*s_f[si-1] + 0.5*s_f[si-2];
+
+	*df = *df*d_dx;
+
+#if nonUniformX
+	*df = *df*d_xp[si-stencilSize];
+#endif
+
+}
+__device__ __forceinline__ __attribute__((always_inline)) void derShared1x_FD(myprec *df, myprec *s_f, int si)
+{
+
+	*df = -1.5*s_f[si] + 2*s_f[si+1] - 0.5*s_f[si+2];
+
+	*df = *df*d_dx;
+
+#if nonUniformX
+	*df = *df*d_xp[si-stencilSize];
+#endif
+
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void derShared1z_BD(myprec *df, myprec *s_f, int si)
+{
+	*df = 1.5*s_f[si]-2*s_f[si-1] + 0.5*s_f[si-2];
+
+	*df = *df*d_dz;
+}
+__device__ __forceinline__ __attribute__((always_inline)) void derShared1z_FD(myprec *df, myprec *s_f, int si)
+{
+
+	*df = -1.5*s_f[si] + 2*s_f[si+1] - 0.5*s_f[si+2];
+
+	*df = *df*d_dz;
+
+}
+
+
 
 __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedx(myprec *df, myprec *s_f, myprec *s_g, int si)
 {
@@ -33,19 +80,17 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedx(m
 
 	flxp = 0.0;
 	flxm = 0.0;
-	__syncthreads();
-
 	for (int lt=1; lt<stencilSize+1; lt++) {
-                        flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
-        }
+		flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
+		flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
+	}
+
 	*df = 0.5*d_dx*(flxm - flxp);
 
 #if nonUniformX
-	*df = (*df)*d_xp[si-stencilSize];
+*df = (*df)*d_xp[si-stencilSize];
 #endif
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedx(myprec *df, myprec *s_f, myprec *s_g, myprec *s_h, int si)
@@ -55,7 +100,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedx(m
 
 	flxp = 0.0;
 	flxm = 0.0;
-	__syncthreads();
 
 	for (int lt=1; lt<stencilSize+1; lt++) {
 
@@ -68,7 +112,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedx(m
 	*df = (*df)*d_xp[si-stencilSize];
 #endif
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedy(myprec *df, myprec *s_f, myprec *s_g, int si)
@@ -115,20 +158,18 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedz(m
 
 	flxp = 0.0;
 	flxm = 0.0;
-	__syncthreads();
 
 	for (int lt=1; lt<stencilSize+1; lt++) {
 		//for (int mt=0; mt<lt; mt++) {
 			//flxp -= dcoeffF[stencilSize-lt]*(s_f[si-mt]+s_f[si-mt+lt])*(s_g[si-mt]+s_g[si-mt+lt]);
 			//flxm -= dcoeffF[stencilSize-lt]*(s_f[si-mt-1]+s_f[si-mt+lt-1])*(s_g[si-mt-1]+s_g[si-mt+lt-1]);
-                        flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
-			flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
+		flxp -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si+lt])*(s_g[si]+s_g[si+lt]);
+		flxm -= dcoeffF[stencilSize-lt]*(s_f[si]+s_f[si-lt])*(s_g[si]+s_g[si-lt]);
 
 		//}
                  }
 	*df = 0.5*d_dz*(flxm - flxp);
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedz(myprec *df, myprec *s_f, myprec *s_g, myprec *s_h, int si)
@@ -138,7 +179,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedz(m
 
 	flxp = 0.0;
 	flxm = 0.0;
-	__syncthreads();
 
 	for (int lt=1; lt<stencilSize+1; lt++) {
 //		for (int mt=0; mt<lt; mt++) {
@@ -153,7 +193,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxCubeSharedz(m
                 }  
 	*df = 0.25*d_dz*(flxm - flxp);
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevShared1x(myprec *df, myprec *s_f, int si)
@@ -169,7 +208,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevShared1x(my
 	*df = *df*d_xp[si-stencilSize];
 #endif
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevShared2x(myprec *d2f, myprec *s_f, int si)
@@ -203,7 +241,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV1x(m
 	*df = *df*d_xp[si-stencilSize];
 #endif
 
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV2x(myprec *d2f, myprec *s_f, int si)
@@ -221,7 +258,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV2x(m
 	}
 #endif
 
-	__syncthreads();
 
 }
 
@@ -275,8 +311,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevShared1z(my
 	for (int it=0; it<stencilSize; it++)  {
 		*df += dcoeffF[it]*(s_f[si+it-stencilSize]-s_f[si+stencilSize-it])*d_dz;
 	}
-
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevShared2z(myprec *d2f, myprec *s_f, int si)
@@ -297,8 +331,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV1z(m
 	for (int it=0; it<stencilVisc; it++)  {
 		*df += dcoeffVF[it]*(s_f[si+it-stencilVisc]-s_f[si+stencilVisc-it])*d_dz;
 	}
-
-	__syncthreads();
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV2z(myprec *d2f, myprec *s_f, int si)
@@ -309,8 +341,6 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevSharedV2z(m
 		*d2f += dcoeffVS[it]*(s_f[si+it-stencilVisc]+s_f[si+stencilVisc-it])*d_d2z;
 	}
 
-	__syncthreads();
-
 }
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDevV1yL(myprec *df, myprec *f, Indices id)
@@ -355,7 +385,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevV1yL(myprec
 
 }
 
-__device__ __forceinline__ __attribute__((always_inline)) void derDevV1zL(myprec *df, myprec *f, myprec *fref, Indices id, int kNum)
+/*__device__ __forceinline__ __attribute__((always_inline)) void derDevV1zL(myprec *df, myprec *f, myprec *fref, Indices id, int kNum)
 {
   __shared__ myprec s_f[mz/nDivZ+stencilVisc*2][lPencils];
 
@@ -385,7 +415,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void derDevV1zL(myprec
   }
   __syncthreads();
 
-}
+}*/
 
 __device__ __forceinline__ __attribute__((always_inline)) void derDev1yBC(myprec *df, myprec *f, Indices id, int direction)
 {
diff --git a/src/cuda_functions.h b/src/cuda_functions.h
index a961cae..4a91296 100644
--- a/src/cuda_functions.h
+++ b/src/cuda_functions.h
@@ -39,6 +39,13 @@ class Indices {
        g = i + j*mx + k*mx*my;
     }
 
+    __device__ __host__ void mkidXFlx(int iNum) {
+       i  = tix + iNum*mx/nX;
+       j  = bix*bdy + tiy;
+       k  = biy;
+       g = i + j*mx + k*mx*my;
+    }
+
     __device__ __host__ void mkidYFlx(int jNum) {
         //CHANGED
         i  = bix*bdx + tix;
@@ -74,9 +81,58 @@ class Indices {
         } else {
         	k = tix; }
         g = i + j*mx + k*mx*my;
-     }
+    }
+    __device__ __host__ void mkidBCwallTop(){
+    	i  = mx_tot-1;
+    	j  = tix;
+    	k  = bix;
+    	g  = i + j*mx + k*mx*my;
+    }
+    __device__ __host__ void mkidBCwallBot(){
+    	i  = 0;
+    	j  = tix;
+    	k  = bix;
+    	g  = i + j*mx + k*mx*my;
+    }
+    __device__ __host__ void mkidBCRecycAvg(int krec){
+    	i  = bix;
+    	j  = tix;
+    	k  = krec;
+    	g  = i + j*mx + k*mx*my;
+    }
+    __device__ __host__ void mkidBCRR(int krec){
+    	i  = tix;
+    	j  = bix;
+    	k  = krec;
+    	g  = i + j*mx + k*mx*my;
+    }
+    __device__ __host__ void mkidSpanShift(int krec, int shift){
+    	i  = bix;
+    	j  = (tix + shift)%my;
+    	k  = krec;
+    	g  = i + j*mx + k*mx*my;
+    }
+
+
 };
 
+class recycle{
+  public:
+    myprec *RRr,*RRu,*RRv,*RRw,*RRe,*RRh,*RRp,*RRt,*RRm,*RRl;
+
+    __device__ __host__ recycle(myprec *r,myprec *u,myprec *v,myprec *w,myprec *e,myprec *h,myprec *t,myprec *p,myprec *m,myprec *l ) {   
+       RRr = r;
+       RRu = u;
+       RRv = v;
+       RRw = w;
+       RRe = e;
+       RRh = h;
+       RRp = p;
+       RRt = t;
+       RRm = m;
+       RRl = l;
+    }
+};
 void calcTimeStep(myprec *dt, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *mu, Communicator rk);
 void calcPressureGrad(myprec *dpdx, myprec *r, myprec *w, Communicator rk);
 void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *dtC , myprec *dpdz ,int file, int istep , Communicator rk);
@@ -89,7 +145,7 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
-		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz, int iNum);
+		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz);
 __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY,
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
@@ -99,11 +155,11 @@ __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, mypre
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
-		myprec *dil, myprec *dpdz);
+		myprec *dil, myprec *dpdz, Communicator rk, recycle rec);
 
 __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *dvdx, myprec *dwdx);
 __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy);
-__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz);
+__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, recycle rec, Communicator rk);
 __global__ void derVelYBC(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy, int direction);
 __global__ void derVelZBC(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int direction);
 __global__ void calcDil(myprec *dil, myprec *dudx, myprec *dvdy, myprec *dwdz);
@@ -111,6 +167,12 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v,
 __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc);
 __global__ void deviceAdvanceTime(myprec *dt);
 
+__global__ void BCwallCenteredTop(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret);
+__global__ void BCwallCenteredBot(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret);
+
+__global__ void calcStateRR(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam);
+
+
 //derivatives
 __device__ void derDev1x(myprec *df , myprec *f, Indices id);
 __device__ void derDev1y(myprec *df , myprec *f, Indices id);
diff --git a/src/cuda_globals.h b/src/cuda_globals.h
index 3fb6fe8..084c000 100644
--- a/src/cuda_globals.h
+++ b/src/cuda_globals.h
@@ -41,7 +41,7 @@ __device__ cudaError_t checkCudaDev(cudaError_t result)
 //   				    calculate the derivative at multiple points
 // lPencils is used only for derVelY and derVelZ
 
-#if mx==1 || my==1 
+#if mx==1 || my==1
 const int sPencils = 1;
 #else
 #if mx+2*stencilSize > 279 ||  my+2*stencilSize>600 || mz/nDivZ+2*stencilSize>600
@@ -64,11 +64,13 @@ const int lPencils = 32;
 #endif
 #endif
 
+
 extern __constant__ myprec dcoeffF[stencilSize];
 extern __constant__ myprec dcoeffS[stencilSize+1];
 extern __constant__ myprec dcoeffVF[stencilVisc];
 extern __constant__ myprec dcoeffVS[stencilVisc+1];
 
+
 #if mx<=546 //limit on the GPU constant memory usage (655356 bytes)
 extern __constant__ myprec dcoeffSx[mx*(2*stencilSize+1)];
 extern __constant__ myprec dcoeffVSx[mx*(2*stencilVisc+1)];
@@ -82,8 +84,8 @@ extern __device__ myprec time_on_GPU;
 extern __device__ Communicator rkGPU;
 
 
-extern dim3 d_block[5], grid0,  gridBC,  gridHalo,  gridHaloY,  gridHaloZ;
-extern dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ;
+extern dim3 d_grid[5], grid0,  gridBC,  gridHalo,  gridHaloY,  gridHaloZ, d_gridx, gridBCw;
+extern dim3 d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw;
 
 extern cudaStream_t s[8+nDivZ];
 
@@ -138,4 +140,12 @@ extern myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm;
 extern myprec *senYp5,*senYm5,*senZp5,*senZm5;
 extern myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5;
 
+extern myprec *rm, *um, *wm, *tm;
+extern myprec *rm_in, *um_in, *wm_in, *tm_in;
+extern myprec *a_inpl, *b_inpl;
+extern int    *idxm, *idxp;
+extern myprec *delta_rec, *delta_in ;
+extern myprec *recy_r, *recy_u, *recy_v, *recy_w, *recy_e, *recy_h, *recy_p, *recy_t, *recy_m, *recy_l;
+
+
 #endif
diff --git a/src/cuda_main.cu b/src/cuda_main.cu
index 4f04436..f031113 100644
--- a/src/cuda_main.cu
+++ b/src/cuda_main.cu
@@ -6,56 +6,182 @@
 #include "comm.h"
 #include "sponge.h"
 
-
-
-__device__ __constant__ myprec alpha[] = {0.    , -17./60., -5./12.};
+__device__ __constant__ myprec alpha[] = {0., -17./60., -5./12.};
 __device__ __constant__ myprec beta[]  = {8./15.,   5./12.,  3./4. };
 
 cudaStream_t s[8+nDivZ];
 
-inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk) {
+inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk, recycle rec) {
+	if (topbc == 3) {
+		BCwallCenteredTop<<<gridBCw,blockBCw,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho).
+		// So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at
+		// boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC).
+	}
+	if (bottombc == 3) {
+		BCwallCenteredBot<<<gridBCw,blockBCw,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e);
+
+	}
+	if(multiGPU) deviceBlocker<<<grid0,block0,0,s[0]>>>();   //in order to hide the halo update with deviceRHSX (on stream s[0])
 	calcState<<<grid0,block0,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,0); //here 0 means interior points
-	//derVelX<<<d_grid[0],d_block[0],0,s[1]>>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]);
-	derVelY<<<d_grid[3],d_block[3],0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]);
-	//for (int kNum=0; kNum<1; kNum++) //CHANGED!!!!!
-	derVelZ<<<d_grid[4],d_block[4],0,s[8]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]);
+	//    if(multiGPU) deviceBlocker<<<grid0,block0,0,s[0]>>>();   //in order to hide the halo update with deviceRHSX (on stream s[0])
+	deviceRHSX<<<d_gridx,d_blockx,0,s[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz);
+
 	if(multiGPU) {
-		updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); cudaDeviceSynchronize();
+		updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk);
+		cudaDeviceSynchronize();
 		calcState<<<gridHalo,blockHalo,0,s[4]>>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,1); //here 1 means halo points
-		derVelYBC<<<gridHaloY,blockHaloY,0,s[0]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0);  //here 0 means lower boundary (0-index)
-		derVelZBC<<<gridHaloZ,blockHaloZ,0,s[1]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0);	//here 0 means lower boundary (0-index)
-		derVelYBC<<<gridHaloY,blockHaloY,0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1);	//here 1 means upper boundary (my-index)
-		derVelZBC<<<gridHaloZ,blockHaloZ,0,s[3]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1);	//here 1 means upper boundary (mz-index)
+		//        derVelYBC<<<gridHaloY,blockHaloY,0,s[0]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0);  //here 0 means lower boundary (0-index)
+		//        derVelZBC<<<gridHaloZ,blockHaloZ,0,s[1]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0);	//here 0 means lower boundary (0-index)
+		//        derVelYBC<<<gridHaloY,blockHaloY,0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1);	//here 1 means upper boundary (my-index)
+		//        derVelZBC<<<gridHaloZ,blockHaloZ,0,s[3]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1);	//here 1 means upper boundary (mz-index)
 	}
-	cudaDeviceSynchronize();
-	//calcDil<<<grid0,block0>>>(d_dil,gij[0],gij[4],gij[8]);
-	cudaDeviceSynchronize();
-	if(multiGPU) deviceBlocker<<<grid0,block0,0,s[0]>>>();   //in order to hide the halo update with deviceRHSX (on stream s[0])
-	deviceRHSX<<<d_grid[0],d_block[0],0,s[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz,0);
+
 	if(multiGPU) updateHalo(d_dil,rk);
 	cudaDeviceSynchronize();
 	deviceRHSY<<<d_grid[1],d_block[1]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz);
-        cudaDeviceSynchronize();	
-        deviceRHSZ<<<d_grid[2],d_block[2],0,s[8]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz);
 	cudaDeviceSynchronize();
-	if(boundaryLayer) addSponge<<<d_grid[0],d_block[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e);
+	deviceRHSZ<<<d_grid[2],d_block[2],0,s[8]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz, rk,rec);
+	cudaDeviceSynchronize();
+	//if(boundaryLayer) addSponge<<<d_grid[0],d_block[0]>>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e);
 	cudaDeviceSynchronize();
 }
 
-void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communicator rk) {
-
-          cudaSetDevice(rk.nodeRank);
-          myprec h_dt,h_dpdz;
-  
-          for (int istep = 0; istep < nsteps; istep++) {
-  
-          	if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
-          	if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
-          	if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
-          	deviceAdvanceTime<<<1,1>>>(dtC);
-          	if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk);
-		
-		
+void runSimulation(int file, myprec *par1, myprec *par2, myprec *time, Communicator rk) {
+
+	cudaSetDevice(rk.nodeRank);
+	myprec h_dt,h_dpdz;
+
+	for (int istep = 0; istep < nsteps; istep++) {
+		if (topbc == 3) {
+			BCwallCenteredTop<<<gridBCw,blockBCw,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho).
+			// So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at
+			// boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC).
+		}
+		if (bottombc == 3) {
+			BCwallCenteredBot<<<gridBCw,blockBCw,0,s[0]>>>(d_r,d_u,d_v,d_w,d_e);
+
+		}
+		cudaDeviceSynchronize();
+
+		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
+		if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1],dtC);
+		if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
+		deviceAdvanceTime<<<1,1>>>(dtC);
+		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk);
+
+		int notanum=0;
+                if (h_dt != h_dt) notanum = 1;
+                if (notanum>0){
+                    if(rk.rank==0) printf("Encountered a NAN in dt\n");
+                    exit(1);
+                }
+
+		//Recycling-Rescaling
+		if (inletbc == 5) {
+			if (( rk.kstart <= recstn) && ( rk.kend >= recstn) ) { // this section will be executed only by the processors that includes recycling stn.
+				if (abs(recstn - rk.kstart) < stencilSize-1) {
+					printf("The Recycling station is too close to the processor left boundary.");
+					exit(1);
+				}
+				if(istep%checkMeanRec==0) {
+					calcMeanRec(d_r, d_u, d_w, d_t, rm, um, wm, tm, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk);
+				}
+				cudaDeviceSynchronize();
+				Recycle_Rescale(d_r, d_u, d_v, d_w, d_t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r,
+						recy_u, recy_v, recy_w, recy_t, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk);
+				cudaDeviceSynchronize();
+			}
+			if (pCol > 1) {
+				TransferToInlet(recy_r, recy_u, recy_v, recy_w, recy_t, rk);
+				cudaDeviceSynchronize();
+			}
+
+			if (pRow==1 && spanshift) {
+				Spanwiseshift(recy_r, recy_u, recy_v, recy_w, recy_t, rk);
+				cudaDeviceSynchronize();
+			}
+
+
+
+
+			dim3 gridcalcstateRR;
+			gridcalcstateRR = dim3(my, stencilSize, 1);
+			calcStateRR<<<gridcalcstateRR, mx_tot>>>(recy_r, recy_u, recy_v, recy_w, recy_e, recy_h, recy_t, recy_p, recy_m, recy_l);
+		}
+		cudaDeviceSynchronize();
+
+			if ((istep%100==0)){
+				/*myprec *h_wminl = (myprec*)malloc(mx*sizeof(myprec));
+				myprec *wminl;
+				checkCuda( cudaMalloc( (void**)&wminl, mx_tot*sizeof(myprec) ) );
+				int gridYAvg = mx_tot;
+				int blockYAvg = my;
+				YAvg<<<gridYAvg,blockYAvg,blockYAvg*2*sizeof(myprec),s[0]>>>(recy_w, wminl, stencilSize-1, 0);
+			        cudaDeviceSynchronize();	
+				checkCuda( cudaMemcpy(h_wminl, wminl, mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+				allReduceSumYavg(h_wminl,mx_tot);*/
+
+				myprec *h_delre = (myprec*)malloc(stencilSize*sizeof(myprec));
+				checkCuda( cudaMemcpy(h_delre, delta_rec, stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			
+				myprec *h_wm = (myprec*)malloc(stencilSize*mx*sizeof(myprec));
+				checkCuda( cudaMemcpy(h_wm, wm, mx*stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) );
+        
+				myprec deltastar=0;
+				int kn = stencilSize-1;
+                                myprec Uinv = 1/h_wm[mx-1 + kn*mx];
+                                for (int i=0; i<mx; i++){
+                                        myprec dx = Lx/(mx-1)/xp[i]/h_delre[kn] ;
+                                        if (i==0 || i== mx-1){
+                                               dx = 0.5*dx;
+                                        }
+                                        deltastar += (1-h_wm[i+ kn*mx]*Uinv)*dx;
+                                }
+                                if(rk.rank==5){
+                                        FILE *fdel = fopen("delstar.txt","a+");
+                                        fprintf(fdel,"%lf\n",deltastar);
+                                        fclose(fdel);
+                                }
+
+                                free(h_wm); 
+			        free(h_delre);
+			}
+                mpiBarrier();
+
+		if(rk.rank==5 && istep==nsteps-1){
+			myprec *recyt = (myprec*)malloc(stencilSize*my*mx_tot*sizeof(myprec));
+			myprec *tt = (myprec*)malloc(mz*my*mx_tot*sizeof(myprec));
+			myprec *din = (myprec*)malloc(stencilSize*sizeof(myprec));
+			myprec *dre = (myprec*)malloc(stencilSize*sizeof(myprec));
+			myprec *tmin = (myprec*)malloc(stencilSize*mx*sizeof(myprec));
+			myprec *tmre = (myprec*)malloc(stencilSize*mx*sizeof(myprec));
+
+
+
+			checkCuda( cudaMemcpy(recyt, recy_w, stencilSize*my*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			checkCuda( cudaMemcpy(tt, d_w, mz*my*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			checkCuda( cudaMemcpy(din, delta_in, stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			checkCuda( cudaMemcpy(dre, delta_rec, stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			checkCuda( cudaMemcpy(tmre, wm, stencilSize*mx*sizeof(myprec), cudaMemcpyDeviceToHost) );
+			checkCuda( cudaMemcpy(tmin, wm_in, stencilSize*mx*sizeof(myprec), cudaMemcpyDeviceToHost) );
+
+
+			FILE *fp = fopen("Rec1.txt","w+");
+			int j = my/2;
+			int kt = recstn-rk.kstart;
+			for (int i=0; i<mx; i++)
+				fprintf(fp,"%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",x[i],x[i]/din[2],x[i]/dre[2],recyt[i+j*mx+2*mx*my],tmin[i+2*mx], tt[i+j*mx+kt*mx*my], tmre[i+2*mx]);
+			fclose(fp);
+
+			free(recyt); free(tt); free(din); free(dre); free(tmin); free(tmre);
+
+		}
+
+
+
+		recycle rec(recy_r, recy_u, recy_v, recy_w, recy_e, recy_h, recy_t, recy_p, recy_m, recy_l);
+		cudaDeviceSynchronize();
+
 		deviceMul<<<grid0,block0,0,s[0]>>>(d_uO,d_r,d_u);
 		deviceMul<<<grid0,block0,0,s[1]>>>(d_vO,d_r,d_v);
 		deviceMul<<<grid0,block0,0,s[2]>>>(d_wO,d_r,d_w);
@@ -65,181 +191,186 @@ void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communic
 		//Starting the Runge-Kutta Steps
 
 		//runge kutta step 1
-		calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
+		calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk,rec);
+
 		//eulerSum<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,dtC);
 		//eulerSum<<<grid0,block0>>>(d_e,d_eO,d_rhse1,dtC);
 
-                eulerSumAll<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
-                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC);
+		eulerSumAll<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO,
+				d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC);
 		//cudaDeviceSynchronize();
 		//eulerSumR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_r,dtC);
 		//eulerSumR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_r,dtC);
 		//eulerSumR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_r,dtC);
 		cudaDeviceSynchronize();
 
-		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
-			deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
-			deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
-			deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
-			deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w); }
+		/*      if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
+            deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
+            deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
+            deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
+            deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w);
+        }*/
 
 		//runge kutta step 2
-		calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk);
+		calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk,rec);
 		//eulerSum3<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC);
 		//eulerSum3<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,dtC);
-		eulerSumAll2<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
-                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC);
-                //cudaDeviceSynchronize();
+		eulerSumAll2<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO,
+				d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC);
+		//cudaDeviceSynchronize();
 		//eulerSum3R<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC);
 		//eulerSum3R<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC);
 		//eulerSum3R<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC);
 		cudaDeviceSynchronize();
 
-		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
-			deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
-			deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
-			deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
-			deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w); }
+		/* if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
+            deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
+            deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
+            deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
+            deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w);
+        }*/
 
 		//runge kutta step 3
-		calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk);
-//		rk3final<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC);
-//		rk3final<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC);
-		eulerSumAll3<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, 
-                            d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2,
-                            d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC);
-
-//		cudaDeviceSynchronize();
-//		rk3finalR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC);
-//		rk3finalR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC);
-//		rk3finalR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC);
+		calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk,rec);
+		//		rk3final<<<grid0,block0>>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC);
+		//		rk3final<<<grid0,block0>>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC);
+		eulerSumAll3<<<grid0,block0>>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO,
+				d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2,
+				d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC);
+
+		//		cudaDeviceSynchronize();
+		//		rk3finalR<<<grid0,block0,0,s[0]>>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC);
+		//		rk3finalR<<<grid0,block0,0,s[1]>>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC);
+		//		rk3finalR<<<grid0,block0,0,s[2]>>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC);
 		cudaDeviceSynchronize();
 	}
 }
 
 void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communicator rk) {
 
-	cudaSetDevice(rk.nodeRank);
-
-	myprec h_dt,h_dpdz;
-
-	for (int istep = 0; istep < nsteps; istep++) {
-/*		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
-		if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
-		if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
-		deviceAdvanceTime<<<1,1>>>(dtC);
-		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);*/
-
-		//Starting the Runge-Kutta Steps
-
-		//runge kutta step 1
-		calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
-		deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr1, d_rhsr1, dtC, 0);
-		sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu1, d_rhsu1, dtC, 0);
-		sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv1, d_rhsv1, dtC, 0);
-		sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw1, d_rhsw1, dtC, 0);
-		sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse1, d_rhse1, dtC, 0);
-		cudaStreamSynchronize(s[0]);
-		deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		cudaDeviceSynchronize();
-
-		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
-			deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
-			deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
-			deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
-			deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w); }
-
-
-		//runge kutta step 2
-		calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk);
-		deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr1, d_rhsr2, dtC, 1);
-		sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu1, d_rhsu2, dtC, 1);
-		sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv1, d_rhsv2, dtC, 1);
-		sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw1, d_rhsw2, dtC, 1);
-		sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse1, d_rhse2, dtC, 1);
-		cudaStreamSynchronize(s[0]);
-		deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		cudaDeviceSynchronize();
-
-		if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
-			deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
-			deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
-			deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
-			deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w); }
-
-		//runge kutta step 3
-		calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
-		deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr2, d_rhsr1, dtC, 2);
-		sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu2, d_rhsu1, dtC, 2);
-		sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv2, d_rhsv1, dtC, 2);
-		sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw2, d_rhsw1, dtC, 2);
-		sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse2, d_rhse1, dtC, 2);
-		cudaStreamSynchronize(s[0]);
-		deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
-		deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
-		deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
-		cudaDeviceSynchronize();
-	}
+	/*    cudaSetDevice(rk.nodeRank);
+
+    myprec h_dt,h_dpdz;
+
+    for (int istep = 0; istep < nsteps; istep++) {
+        		if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk);
+        		if(istep>0)  deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC);
+        		if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC);
+        		deviceAdvanceTime<<<1,1>>>(dtC);
+        		if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);
+
+        //Starting the Runge-Kutta Steps
+
+        //runge kutta step 1
+        calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
+        deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr1, d_rhsr1, dtC, 0);
+        sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu1, d_rhsu1, dtC, 0);
+        sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv1, d_rhsv1, dtC, 0);
+        sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw1, d_rhsw1, dtC, 0);
+        sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse1, d_rhse1, dtC, 0);
+        cudaStreamSynchronize(s[0]);
+        deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        cudaDeviceSynchronize();
+
+        if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
+            deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
+            deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
+            deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
+            deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w);
+        }
+
+
+        //runge kutta step 2
+        calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk);
+        deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr1, d_rhsr2, dtC, 1);
+        sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu1, d_rhsu2, dtC, 1);
+        sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv1, d_rhsv2, dtC, 1);
+        sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw1, d_rhsw2, dtC, 1);
+        sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse1, d_rhse2, dtC, 1);
+        cudaStreamSynchronize(s[0]);
+        deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        cudaDeviceSynchronize();
+
+        if(multiGPU) {  //To initiate slowly the routines so that we have time to initiate the memory transfer
+            deviceCpy<<<grid0,block0,0,s[0]>>>(d_r,d_r);
+            deviceCpy<<<grid0,block0,0,s[1]>>>(d_u,d_u);
+            deviceCpy<<<grid0,block0,0,s[2]>>>(d_v,d_v);
+            deviceCpy<<<grid0,block0,0,s[3]>>>(d_w,d_w);
+        }
+
+        //runge kutta step 3
+        calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk);
+        deviceMul<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceMul<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceMul<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        sumLowStorageRK3<<<grid0,block0,0,s[0]>>>(d_r, d_rhsr2, d_rhsr1, dtC, 2);
+        sumLowStorageRK3<<<grid0,block0,0,s[1]>>>(d_u, d_rhsu2, d_rhsu1, dtC, 2);
+        sumLowStorageRK3<<<grid0,block0,0,s[2]>>>(d_v, d_rhsv2, d_rhsv1, dtC, 2);
+        sumLowStorageRK3<<<grid0,block0,0,s[3]>>>(d_w, d_rhsw2, d_rhsw1, dtC, 2);
+        sumLowStorageRK3<<<grid0,block0,0,s[4]>>>(d_e, d_rhse2, d_rhse1, dtC, 2);
+        cudaStreamSynchronize(s[0]);
+        deviceDiv<<<grid0,block0,0,s[1]>>>(d_u,d_u,d_r);
+        deviceDiv<<<grid0,block0,0,s[2]>>>(d_v,d_v,d_r);
+        deviceDiv<<<grid0,block0,0,s[3]>>>(d_w,d_w,d_r);
+        cudaDeviceSynchronize();
+    }*/
 }
 
-__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
-                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1
-                            ,myprec *dt) {
+__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w,
+		myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1
+		,myprec *dt) {
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-        id.mkidX();
-        myprec tmp1 = 0;
+	id.mkidX();
+	myprec tmp1 = 0;
 	tmp1 = (r0[id.g] + rhsr1[id.g]*(*dt)/2.0);
-        r[id.g] = tmp1;
+	r[id.g] = tmp1;
 	u[id.g] = (u0[id.g] + rhsu1[id.g]*(*dt)/2.0)/tmp1;
 	v[id.g] = (v0[id.g] + rhsv1[id.g]*(*dt)/2.0)/tmp1;
 	w[id.g] = (w0[id.g] + rhsw1[id.g]*(*dt)/2.0)/tmp1;
-        e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0);
+	e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0);
 
 }
 
 
-__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
-                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
-                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) {
+__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w,
+		myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+		myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) {
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-        id.mkidX();
-        myprec tmp1 = 0;
+	id.mkidX();
+	myprec tmp1 = 0;
 	tmp1 =    ( r0[id.g] + (2*rhsr2[id.g] - rhsr1[id.g])*(*dt) );
-        r[id.g] = tmp1;
+	r[id.g] = tmp1;
 	u[id.g] = ( u0[id.g] + (2*rhsu2[id.g] - rhsu1[id.g])*(*dt) )/tmp1;
 	v[id.g] = ( v0[id.g] + (2*rhsv2[id.g] - rhsv1[id.g])*(*dt) )/tmp1;
 	w[id.g] = ( w0[id.g] + (2*rhsw2[id.g] - rhsw1[id.g])*(*dt) )/tmp1;
-        e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) );
+	e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) );
 
 }
 
 
-__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, 
-                            myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
-                            myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2,
-                            myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) {
+__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w,
+		myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1,
+		myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2,
+		myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) {
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-        id.mkidX();
-        myprec tmp1 = 0;
+	id.mkidX();
+	myprec tmp1 = 0;
 	tmp1    =    r0[id.g] + (*dt)*( rhsr1[id.g] + 4*rhsr2[id.g] + rhsr3[id.g])/6.;
-        r[id.g] = tmp1;
+	r[id.g] = tmp1;
 	u[id.g] = (  u0[id.g] + (*dt)*( rhsu1[id.g] + 4*rhsu2[id.g] + rhsu3[id.g])/6.)/tmp1;
 	v[id.g] = (  v0[id.g] + (*dt)*( rhsv1[id.g] + 4*rhsv2[id.g] + rhsv3[id.g])/6.)/tmp1;
 	w[id.g] = (  w0[id.g] + (*dt)*( rhsw1[id.g] + 4*rhsw2[id.g] + rhsw3[id.g])/6.)/tmp1;
-        e[id.g] = (  e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.);
+	e[id.g] = (  e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.);
 
 }
 
@@ -276,58 +407,131 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec *
 
 __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc) {
 
-/*	int threadsPerBlock  = blockDim.x * blockDim.y;
-	int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
-	int blockNumInGrid   = blockIdx.x  + gridDim.x  * blockIdx.y;
 
-	int gl = blockNumInGrid * threadsPerBlock + threadNumInBlock;
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidX();
+	int gl = id.g;
+	if(bc==1) {
+		int threadsPerBlock  = blockDim.x * blockDim.y;
+		int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
+		int blockNumInGrid   = blockIdx.x  + gridDim.x  * blockIdx.y;
+		gl = blockNumInGrid * threadsPerBlock + threadNumInBlock;
+		gl += mx*my*mz;
+	}
+	myprec cvInv = (gam - 1.0)/Rgas;
+	myprec r = rho[gl];
+	myprec u = uvel[gl];
+	myprec v = vvel[gl];
+	myprec w = wvel[gl];
+	myprec etot = ret[gl];
 
-	if(bc==1) gl += mx*my*mz;
+	myprec invrho = 1.0/r;
+	myprec en = etot*invrho - 0.5*(u*u + v*v + w*w);
 
-    myprec cvInv = (gam - 1.0)/Rgas;
+	myprec  t   = cvInv*en;
+	myprec  p   = r*Rgas*t;
+	tem[gl]     = t;
+	pre[gl]     = p;
 
-    myprec invrho = 1.0/rho[gl];
+	ht[gl]      = (etot + p)*invrho;
 
-    myprec en = ret[gl]*invrho - 0.5*(uvel[gl]*uvel[gl] + vvel[gl]*vvel[gl] + wvel[gl]*wvel[gl]);
-    tem[gl]   = cvInv*en;
-    pre[gl]   = rho[gl]*Rgas*tem[gl];
-    ht[gl]    = (ret[gl] + pre[gl])*invrho;
+	myprec suth = pow(t,viscexp);
+	mu[gl]      = suth/Re;
+	lam[gl]     = suth/Re/Pr/Ec;
+
+	__syncthreads();
+
+}
+
+__global__ void calcStateRR(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam) {
 
-    myprec suth = pow(tem[gl],viscexp);
-    mu[gl]      = suth/Re;
-    lam[gl]     = suth/Re/Pr/Ec;
-    __syncthreads();*/
 
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-	id.mkidX();
-    int gl = id.g;
-    if(bc==1) gl += mx*my*mz;
+	id.mkidBCRR(blockIdx.y);
+	int gl = id.g;
 
-    myprec cvInv = (gam - 1.0)/Rgas;
-    myprec r = rho[gl];
-    myprec u = uvel[gl];
-    myprec v = vvel[gl];
-    myprec w = wvel[gl];
-    myprec etot = ret[gl];
+	myprec cv = Rgas/(gam - 1.0) ;
+	myprec r  = rho[gl];
+	myprec u  = uvel[gl];
+	myprec v  = vvel[gl];
+	myprec w  = wvel[gl];
+	myprec t  = tem[gl];
 
-    myprec invrho = 1.0/r;
-    myprec en = etot*invrho - 0.5*(u*u + v*v + w*w);
+	myprec en = cv*t ;
 
-    myprec  t   = cvInv*en;
-    myprec  p   = r*Rgas*t; //tmp1 is temperature // tmp2 is pressure    
-    tem[gl]     = t;
-    pre[gl]     = p;
+	myprec  etot   = r*( en + 0.5*(u*u + v*v + w*w) ) ;
+	myprec  p      = r*Rgas*t;
+	ret[gl]        = etot;
+	pre[gl]        = p;
 
-    ht[gl]      = (etot + p)*invrho;
+	ht[gl]         = (etot + p)/r;
 
-    myprec suth = pow(t,viscexp);
-    mu[gl]      = suth/Re;
-    lam[gl]     = suth/Re/Pr/Ec;
+	myprec suth = pow(t,viscexp);
+	mu[gl]      = suth/Re;
+	lam[gl]     = suth/Re/Pr/Ec;
 
-    __syncthreads();
+	__syncthreads();
 
 }
 
+__global__ void BCwallCenteredTop(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) {
+
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidBCwallTop();
+
+
+	myprec u = 0.0;
+	myprec v = 0.0;
+	myprec w = 0.0;
+	myprec t = TwallTop;
+
+	uvel[id.g] = u;
+	vvel[id.g] = v;
+	wvel[id.g] = w;
+
+	myprec cv = Rgas/(gam - 1.0);
+	myprec r = rho[id.g];
+
+	myprec en = cv*t;
+
+	myprec etot = r*( en + 0.5*(u*u + v*v + w*w) );
+
+	ret[id.g] = etot;
+
+	__syncthreads();
+
+}
+__global__ void BCwallCenteredBot(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) {
+
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidBCwallBot();
+
+
+	myprec u = 0.0;
+	myprec v = 0.0;
+	myprec w = 0.0;
+	myprec t = TwallBot;
+
+	uvel[id.g] = u;
+	vvel[id.g] = v;
+	wvel[id.g] = w;
+
+	myprec cv = Rgas/(gam - 1.0);
+	myprec r = rho[id.g];
+
+	myprec en = cv*t;
+
+	myprec etot = r*( en + 0.5*(u*u + v*v + w*w) );
+
+	ret[id.g] = etot;
+
+	__syncthreads();
+
+}
+
+
 __global__ void sumLowStorageRK3(myprec *var, myprec *rhs1, myprec *rhs2, myprec *dt, int step) {
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
 	var[id.g] = var[id.g] + (*dt)*(alpha[step]*rhs1[id.g] + beta[step]*rhs2[id.g]);
@@ -337,10 +541,10 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m
 
 	cudaSetDevice(rk.nodeRank);
 	calcTimeStep(dtC,d_r,d_u,d_v,d_w,d_e,d_m,rk);
-	cudaMemcpy(h_dt  , dtC , sizeof(myprec), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_dt, dtC, sizeof(myprec), cudaMemcpyDeviceToHost);
 	allReduceToMin(h_dt,1);
 	mpiBarrier();
-	cudaMemcpy(dtC , h_dt  , sizeof(myprec), cudaMemcpyHostToDevice);
+	cudaMemcpy(dtC, h_dt, sizeof(myprec), cudaMemcpyHostToDevice);
 	if(forcing) {
 		calcPressureGrad(dpdz,d_r,d_w,rk); //Changed here dpdz_new = dpdz_old - 0.5*(rw_bulk - 1 )
 		cudaMemcpy(h_dpdz, dpdz, sizeof(myprec), cudaMemcpyDeviceToHost);
@@ -351,79 +555,357 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m
 	//if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file) + istep ,*h_dt,*h_dpdz);
 }
 
+void calcMeanRec(myprec *r, myprec *u,myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in,Communicator rk) {
+
+	myprec *hostYmean = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec));
+
+	int gridYAvg = mx_tot;
+	int blockYAvg = my;
+
+	for (int k = 0; k < stencilSize; k++){
+	int krec = recstn - rk.kstart - (stencilSize-1) + k; // stencilSize-1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn
+	YAvg<<<gridYAvg,blockYAvg,blockYAvg*2*sizeof(myprec),s[0]>>>(r, rm, krec, k);// *2 because the next power of 2 shall be <= 2*my. Eg: 139.. next power = 256 < 278
+	YAvg<<<gridYAvg,blockYAvg,blockYAvg*2*sizeof(myprec),s[0]>>>(u, um, krec, k); //wall normal velocity
+	YAvg<<<gridYAvg,blockYAvg,blockYAvg*2*sizeof(myprec),s[0]>>>(w, wm, krec, k); //streamwise velocity
+	YAvg<<<gridYAvg,blockYAvg,blockYAvg*2*sizeof(myprec),s[0]>>>(t, tm, krec, k);
+	cudaDeviceSynchronize();
+	}
+
+	checkCuda( cudaMemcpy(hostYmean, rm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+	allReduceSumYavg(hostYmean,stencilSize*mx_tot); // check if mpi_barrier is needed with comm_col communicator?
+	checkCuda( cudaMemcpy(rm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+
+	checkCuda( cudaMemcpy(hostYmean, um, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+	allReduceSumYavg(hostYmean,stencilSize*mx_tot);
+	checkCuda( cudaMemcpy(um, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+
+	checkCuda( cudaMemcpy(hostYmean, wm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+	allReduceSumYavg(hostYmean,stencilSize*mx_tot);
+	checkCuda( cudaMemcpy(wm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+
+	checkCuda( cudaMemcpy(hostYmean, tm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) );
+	allReduceSumYavg(hostYmean,stencilSize*mx_tot);
+	checkCuda( cudaMemcpy(tm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+
+	deltaRR<<<1, stencilSize,0,s[0]>>>(wm, delta_rec);
+	interpRR<<<stencilSize, mx_tot,0,s[0]>>>(a, b, idxm, idxp, delta_rec, delta_in) ;
+
+	free(hostYmean);
+}
+
+void Recycle_Rescale(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm,
+		myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u,
+		myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) {
+
+	dim3 gridRR;
+	gridRR = dim3(my,stencilSize,1);
+	int blockRR = mx_tot;
+
+	Recy_Resc<<<gridRR,blockRR,0,s[0]>>>(r, u, v, w, t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r, recy_u, recy_v, recy_w, recy_t,
+			a, b, idxm, idxp, delta_rec, delta_in, rk);
+
+
+}
+
+
+void InletMeanUpdate(myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *delta_in, Communicator rk) {
+
+	myprec *um_inRR      = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec));
+	myprec *wm_inRR      = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec));
+	myprec *tm_inRR      = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec));
+	myprec *rm_inRR      = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec));
+	myprec *delta99_inRR = (myprec*)malloc(stencilSize*sizeof(myprec));
+
+	readFileInitBL_inRR('r',rm_inRR,rk);	
+	readFileInitBL_inRR('u',um_inRR,rk);
+	readFileInitBL_inRR('w',wm_inRR,rk);
+	readFileInitBL_inRR('t',tm_inRR,rk);
+	readFileInitBL1D_inRR('d',delta99_inRR,rk); 
+
+	checkCuda( cudaMemcpy(rm_in, rm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+	checkCuda( cudaMemcpy(um_in, um_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+	checkCuda( cudaMemcpy(wm_in, wm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+	checkCuda( cudaMemcpy(tm_in, tm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) );
+	checkCuda( cudaMemcpy(delta_in, delta99_inRR, stencilSize*sizeof(myprec), cudaMemcpyHostToDevice) );
+
+	free(um_inRR);
+	free(wm_inRR);
+	free(tm_inRR);
+	free(rm_inRR);
+	free(delta99_inRR);
+}
+
+void Spanwiseshift(myprec *recy_r,myprec *recy_u,myprec *recy_v,myprec *recy_w,myprec *recy_t, Communicator rk){
+
+	dim3 grid;
+	grid = dim3(mx_tot, stencilSize, 1);
+      
+	spanshifting<<<grid, my,0,s[0]>>>(recy_r);
+	spanshifting<<<grid, my,0,s[0]>>>(recy_u);
+	spanshifting<<<grid, my,0,s[0]>>>(recy_v);
+	spanshifting<<<grid, my,0,s[0]>>>(recy_w);
+	spanshifting<<<grid, my,0,s[0]>>>(recy_t);
+
+}
+
+__global__ void spanshifting(myprec *var){
+
+	int k = blockIdx.y;
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidSpanShift(k,0);
+	__shared__ myprec s_var[my];
+        
+        s_var[id.tix] = var[id.g];
+        __syncthreads();
+	int shift = my/2;
+	Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id1.mkidSpanShift(k,shift);
+        var[id1.g] = s_var[id.tix] ;
+        __syncthreads();
+
+}
+
+
+__global__ void Recy_Resc(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm,
+		myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u,
+		myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) {
+
+	int k = blockIdx.y;
+	int krec = (recstn-stencilSize+1) - rk.kstart + k; // stencilSize+1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidBCRR(krec);
+
+	Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id1.mkidBCRR(k);
+
+	int id_mean = id.i + mx_tot*k ; //
+
+	int idm = idxm[id_mean];
+	int idp = idxp[id_mean];
+
+	myprec delratio = delta_rec[k]/delta_in[k] ;
+	myprec beta_rec = pow(abs(delratio),0.1); // (delta_rec / delta_in)^(1/10) -- delta_in = 1;
+
+	__shared__ myprec s_fluc[mx_tot];
+	__shared__ myprec s_mean[mx_tot];
+
+/*	s_mean[id.i] = um[id_mean] ;
+        __syncthreads();
+        um_in[id_mean] = 1*( a[id_mean] * s_mean[idm] + (1.0-a[id_mean])* s_mean[idp] ) ;// not using b as I dont want um_in=0 which will happen when a=0&b=0.
+*/
+
+	__syncthreads();
+	s_fluc[id.i] = r[id.g] - rm[id_mean] ;
+	__syncthreads();
+	myprec flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ;
+	recy_r[id1.g] = rm_in[id_mean] + flucinlet;
+
+	__syncthreads();
+	s_fluc[id.i] = u[id.g] - um[id_mean] ;
+	__syncthreads();
+	flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ;
+	recy_u[id1.g] = um_in[id_mean] + flucinlet;
+
+	__syncthreads();
+	s_fluc[id.i] = v[id.g] - 0 ;
+	__syncthreads();
+	flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ;
+	recy_v[id1.g] = 0 + flucinlet;
+
+	__syncthreads();
+	s_fluc[id.i] = w[id.g] - wm[id_mean] ;
+	__syncthreads();
+	flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ;
+	recy_w[id1.g] = wm_in[id_mean] + flucinlet;
+
+	__syncthreads();
+	s_fluc[id.i] = t[id.g] - tm[id_mean] ;
+	__syncthreads();
+	flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ;
+	recy_t[id1.g] = tm_in[id_mean] + flucinlet;
+
+}
+
+
+__global__ void YAvg(myprec *f, myprec *fm, int krec, int kRR) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidBCRecycAvg(krec);
+
+	unsigned int n = id.bdx;
+	n = findNextPowerOf2(n); // to perform the reduction fast we need an array that is of the power of 2
+
+	extern __shared__ myprec s_avg[] ;
+
+	for(int i=id.tix; i<n; i+=id.bdx) {
+		if (i < id.bdx) {
+			s_avg[i] = f[id.g]; //actual domain values
+		} else {
+			s_avg[i] = 0; // fillers to fill the array as n>id.bdx to be a power of 2.
+		}
+	}
+	__syncthreads();
+
+	for (int size = n/2; size>0; size/=2) {
+		if (id.tix<size)
+			s_avg[id.tix] += s_avg[id.tix+size];
+		__syncthreads();
+	}
+	if (id.tix == 0) {
+		fm[id.bix + kRR*mx_tot] = s_avg[0]/my_tot; // ultimately I will add results of all the processors in Y and hence my_tot
+	}
+	__syncthreads();
+
+}
+
+__global__ void deltaRR(myprec *wm, myprec *delta) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	int khi = mx_tot-1;
+	int klo = 0;
+	int k;
+	myprec winf = 0.99*wm[mx_tot-1 + id.tix*mx_tot]; 
+	while (khi-klo > 1) {
+		k = (khi+klo)/2;
+		if(wm[k + id.tix*mx_tot] > winf) {
+			khi=k;
+		} else {
+			klo=k;
+		}
+	}
+
+
+	myprec a = (wm[khi+id.tix*mx_tot] - winf)/(wm[khi+id.tix*mx_tot] - wm[klo+id.tix*mx_tot]);
+
+	delta[id.tix] = d_x[khi] - a*(d_x[khi] - d_x[klo]) ;
+}
+
+__global__ void interpRR(myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	int idii = id.tix + mx_tot*id.bix ;
+
+	__shared__ myprec s_x[mx_tot];
+	s_x[id.tix] = d_x[id.tix]/delta_rec[id.bix];
+	__syncthreads();
+
+	int khi = mx_tot-1;
+	int klo = 0;
+	int k;
+
+	myprec xi = d_x[id.tix]/delta_in[id.bix]; //inlet x/delta
+
+	// In the following lines of code we will find that what indices on the recycling station corresponds to the x/delta of the inlet.
+	// This will help in interpolating the data at the recycling station so as to be recycled to the inlet.
+	// Suppose we have x/delta_in = 0.35. But at the recycling end we have x/delta_re = 0.32 and 0.38. Thus, first using the while loop
+	// we compute the khi and klo correcponding to x=0.32 and x=0.38. Then we compute the weights as b=(0.38-0.35)/(0.38-0.32) & a=1-b;
+	// a and b are the weights which will be used for interpolation.
+	while (khi-klo > 1) {
+		k = (khi+klo)/2;
+		if( s_x[k] > xi) { // compare with the recycling station x/delta;
+			khi=k;
+		} else {
+			klo=k;
+		}
+	}
+	if (xi <= s_x[id.bdx-1]) { //id.bdx implies last value
+		idxp[idii] = khi ;
+		idxm[idii] = klo ;
+
+		a[idii]    = (s_x[khi] - xi)/(s_x[khi] - s_x[klo]) ;
+		b[idii]    = 1.0 - a[idii];
+	} else {
+		idxp[idii] = khi ;
+		idxm[idii] = klo ;
+
+		a[idii]    = 0.0 ;
+		b[idii]    = 0.0; // This implies that for all the cells at the inlet that are beyond the x/delta_re|max,
+		//we do the interpolation such that u[x/deltain> x/delta_remax] = beta_rec * (b * u[idp]) where idp is khi.
+		//The while loop should ensure that the khi is maintained to its set value of mx_tot-1.
+	}
+}
+
+
+
 void solverWrapper(Communicator rk) {
 
 	cudaSetDevice(rk.nodeRank);
 
 	int start;
-    myprec *dpar1, *dpar2, *dtime;
-    myprec *hpar1 = new myprec[nsteps];
-    myprec *hpar2 = new myprec[nsteps];
-    myprec *htime = new myprec[nsteps];
-
-    checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) );
-    checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) );
-    checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) );
+	myprec *dpar1, *dpar2, *dtime;
+	myprec *hpar1 = new myprec[nsteps];
+	myprec *hpar2 = new myprec[nsteps];
+	myprec *htime = new myprec[nsteps];
 
-    FILE *fp;
+	checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) );
+	checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) );
+	checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) );
 
-    //check the memory usage of the GPU
-    checkGpuMem(rk);
+	FILE *fp;
 
+	//check the memory usage of the GPU
+	checkGpuMem(rk);
+	if (inletbc == 5) {
+		InletMeanUpdate(rm_in, um_in, wm_in, tm_in, delta_in, rk); // The means at the inflow bc (ghost cells of inflow) will remain constant for the entire simulation and hence it is set once outside the loop
+	}
 
-	/*derVelX<<<d_grid[0],d_block[0],0,s[1]>>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]);
-	derVelY<<<d_grid[3],d_block[3],0,s[2]>>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]);
-	derVelZ<<<d_grid[4],d_block[4],0,s[8]>>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]);
-	calcDil<<<grid0,block0>>>(d_dil,gij[0],gij[4],gij[8]);*/ // In Y and Z derivatives are turned off. so need to fix that or create a new kernel.
+        if(rk.rank==5){
+	FILE *fdel = fopen("delstar.txt","w+");
+	fclose(fdel);}
 
+	if(restartFile<0) {
+		start=0;
+	} else {
+		start=restartFile;
+	}
+	for(int file = start+1; file<nfiles+start+1; file++) {
 
+		if(rk.rank==0) fp = fopen("solution.txt","w+");
 
-    if(restartFile<0) {
-    	start=0;
-    } else {
-    	start=restartFile;
-    }
-    for(int file = start+1; file<nfiles+start+1; file++) {  
-        
-        if(rk.rank==0) fp = fopen("solution.txt","w+");
+		/////Time----------------------------------------------------------------------------------
+		float tm;
+		cudaEvent_t start1, stop;
+		checkCuda( cudaEventCreate(&start1) );
+		checkCuda( cudaEventCreate(&stop) );
+		checkCuda( cudaEventRecord(start1, 0) );
 
-        /////Time----------------------------------------------------------------------------------
-	float tm;
-	cudaEvent_t start1, stop; checkCuda( cudaEventCreate(&start1) ); checkCuda( cudaEventCreate(&stop) ); checkCuda( cudaEventRecord(start1, 0) );
+		if(lowStorage) {
+			runSimulationLowStorage(dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
+		} else {
+			runSimulation(file, dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
+		}
 
-	if(lowStorage) {
-        	runSimulationLowStorage(dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
-    	} else {
-    		runSimulation(file, dpar1,dpar2,dtime,rk);  //running the simulation on the GPU
-          	}
-       
-	checkCuda( cudaEventRecord(stop, 0) );	checkCuda( cudaEventSynchronize(stop) );	checkCuda( cudaEventElapsedTime(&tm, start1, stop) );
-	printf("Time for runSimulation  %3.4f s \n", tm/nsteps/1000);
-        // EndTime---------------------------------------------------------------------------------
+		checkCuda( cudaEventRecord(stop, 0) );
+		checkCuda( cudaEventSynchronize(stop) );
+		checkCuda( cudaEventElapsedTime(&tm, start1, stop) );
+		printf("Time for runSimulation  %3.4f s \n", tm/nsteps/1000);
+		// EndTime---------------------------------------------------------------------------------
 
-    	copyField(1,rk);					  //copying back partial results to CPU
+		copyField(1,rk);					  //copying back partial results to CPU
 
-    	writeField(file,rk);
+		writeField(file,rk);
 
-    	cudaDeviceSynchronize();
+		cudaDeviceSynchronize();
 
-    	checkCuda( cudaMemcpy(htime, dtime, nsteps*sizeof(myprec) , cudaMemcpyDeviceToHost) );
-    	checkCuda( cudaMemcpy(hpar1, dpar1, nsteps*sizeof(myprec) , cudaMemcpyDeviceToHost) );
-    	checkCuda( cudaMemcpy(hpar2, dpar2, nsteps*sizeof(myprec) , cudaMemcpyDeviceToHost) );
+		checkCuda( cudaMemcpy(htime, dtime, nsteps*sizeof(myprec), cudaMemcpyDeviceToHost) );
+		checkCuda( cudaMemcpy(hpar1, dpar1, nsteps*sizeof(myprec), cudaMemcpyDeviceToHost) );
+		checkCuda( cudaMemcpy(hpar2, dpar2, nsteps*sizeof(myprec), cudaMemcpyDeviceToHost) );
 
-    	calcAvgChan(rk);
+		calcAvgChan(rk);
 
-    	if(rk.rank==0) {
-    		printf("file number: %d  \t step: %d  \t time: %lf  \t kin: %le  \t energy: %le\n",file,file*nsteps,htime[nsteps-1],hpar1[nsteps-1],hpar2[nsteps-1]);
-    		for(int t=0; t<nsteps-1; t+=checkCFLcondition)
-    			fprintf(fp,"%d %lf %lf %lf %lf\n",file*(t+1),htime[t],hpar1[t],hpar2[t],htime[t+1]-htime[t]);
-    	}
-    	mpiBarrier();
-    }
-    if(rk.rank==0) fclose(fp);
+		if(rk.rank==0) {
+			printf("file number: %d  \t step: %d  \t time: %lf  \t kin: %le  \t energy: %le\n",file,file*nsteps,htime[nsteps-1],hpar1[nsteps-1],hpar2[nsteps-1]);
+			for(int t=0; t<nsteps-1; t+=checkCFLcondition)
+				fprintf(fp,"%d %lf %lf %lf %lf\n",file*(t+1),htime[t],hpar1[t],hpar2[t],htime[t+1]-htime[t]);
+		}
+		mpiBarrier();
+	}
+	
+	if(rk.rank==0) fclose(fp);
 
-    clearSolver(rk);
-    cudaDeviceReset();
+	clearSolver(rk);
+	cudaDeviceReset();
 }
 
 
diff --git a/src/cuda_main.h b/src/cuda_main.h
index 5091630..796647d 100644
--- a/src/cuda_main.h
+++ b/src/cuda_main.h
@@ -49,6 +49,13 @@ myprec *d_rhsv3;
 myprec *d_rhsw3;
 myprec *d_rhse3;
 
+
+myprec *rm, *um, *wm, *tm;
+myprec *rm_in, *um_in, *wm_in, *tm_in;
+myprec *a_inpl, *b_inpl;
+int    *idxm, *idxp;
+myprec *delta_rec, *delta_in ;
+myprec *recy_r, *recy_u, *recy_v, *recy_w, *recy_e, *recy_h, *recy_p, *recy_t, *recy_m, *recy_l;
 myprec *gij[9];
 
 ///////////////////////////////////////////////////////////////////////
@@ -87,6 +94,28 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec *
 
 __global__ void sumLowStorageRK3(myprec *var, myprec *rhs1, myprec *rhs2, myprec *dt, int step);
 
+
+void calcMeanRec(myprec *r, myprec *u,myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk);
+void Recycle_Rescale(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm,
+                     myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u,
+                     myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) ;
+
+void InletMeanUpdate(myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *delta_in, Communicator rk);
+
+void Spanwiseshift(myprec *recy_r,myprec *recy_u,myprec *recy_v,myprec *recy_w,myprec *recy_t, Communicator rk);
+
+__global__ void Recy_Resc(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm,
+                          myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u,
+                          myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) ;
+                          
+__global__ void YAvg(myprec *f, myprec *fm, int krec, int kRR);
+
+__global__ void deltaRR(myprec *wm, myprec *delta) ;
+
+__global__ void interpRR(myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in);
+
+__global__ void spanshifting(myprec *var);
+                        
 #endif /* RHS_H_ */
 
 
diff --git a/src/cuda_math.cu b/src/cuda_math.cu
index 5ad4820..c22948b 100644
--- a/src/cuda_math.cu
+++ b/src/cuda_math.cu
@@ -152,7 +152,7 @@ __global__ void integrateThreads(myprec *gOut, myprec *gArr, int arraySize) {
 	myprec sum = 0;
 	for (int it = glb; it < arraySize; it += gdim) {
 		int i = it % mx;
-		sum += gArr[it]*d_dxv[i]/d_dy/d_dz;
+		sum += gArr[it]*d_dxv[i]/d_dy/d_dz /(Lx*Ly*Lz);
 	}
 	extern __shared__ myprec shArr[];
 	shArr[tix] = sum;
@@ -296,6 +296,19 @@ __device__ unsigned int findPreviousPowerOf2(unsigned int n)
     return n;
 }
 
+__device__ unsigned int findNextPowerOf2(unsigned int n)
+{
+    unsigned count = 0;
+    while(n != 0)
+    {
+        n >>= 1;
+        count += 1;
+    }
+    
+    n = 1 << count;
+    return n;
+}
+
 unsigned int hostFindPreviousPowerOf2(unsigned int n)
 {
     while (n & n - 1) {
@@ -305,6 +318,20 @@ unsigned int hostFindPreviousPowerOf2(unsigned int n)
     return n;
 }
 
+unsigned int hostfindNextPowerOf2(unsigned int n)
+{
+    unsigned count = 0;
+    while(n != 0)
+    {
+        n >>= 1;
+        count += 1;
+    }
+
+    n = 1 << count;
+    return n;
+}
+
+
 void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk) {
 
 	cudaSetDevice(rk.nodeRank);
@@ -341,13 +368,12 @@ void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk) {
 	int tot = mx*my*mz;
 
 	int gr  = my *  mz;
-	int bl = mx ;
-
-	checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) );
-	cudaDeviceSynchronize();
+	int bl  = mx ;
 
 	bl = hostFindPreviousPowerOf2(bl);
 
+	checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) );
+	cudaDeviceSynchronize();
 	integrateThreads<<<gr, bl, bl*sizeof(myprec)>>>(dwrkM, var , tot);
 	cudaDeviceSynchronize();
 	reduceThreads<<<   1 , bl, bl*sizeof(myprec)>>>(dwrkM, dwrkM, gr);
diff --git a/src/cuda_math.h b/src/cuda_math.h
index e1aab99..c962bae 100644
--- a/src/cuda_math.h
+++ b/src/cuda_math.h
@@ -29,8 +29,11 @@ __global__ void reduceThreads (myprec *gOut, myprec *gArr, int arraySize);
 __global__ void minOfThreads (myprec *gOut, myprec *gArr, int arraySize);
 __global__ void maxOfThreads (myprec *gOut, myprec *gArr, int arraySize);
 __device__ unsigned int findPreviousPowerOf2(unsigned int n);
+__device__ unsigned int findNextPowerOf2(unsigned int n);
 void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk);
 void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk);
 void hostVolumeAverage(myprec *gOut, myprec *var, Communicator rk);
 
+unsigned int hostfindNextPowerOf2(unsigned int n);
+unsigned int hostFindPreviousPowerOf2(unsigned int n);
 #endif /* CUDA_MATH_H_ */
diff --git a/src/cuda_rhs.cu b/src/cuda_rhs.cu
index 225f99b..c24122d 100644
--- a/src/cuda_rhs.cu
+++ b/src/cuda_rhs.cu
@@ -5,19 +5,30 @@
 #include "boundary_condition_x.h"
 #include "boundary_condition_y.h"
 #include "boundary_condition_z.h"
+#include "boundary.h"
+#include "nrbcX.h"
+#include "nrbcZ.h"
+
 
 __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
 		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
 		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
-		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz, int iNum) {
+		myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz) {
 
 	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-	id.mkidX();
+	int iNum = blockIdx.z ;
+	id.mkidXFlx(iNum);
 
 	int si = id.tix + stencilSize;       // local i for shared memory access + halo offset
 	int sj = id.tiy;                   // local j for shared memory access
 
+	bool boolIsothnrbc_top = ((topbc==3) && (id.i == mx_tot-1));
+	bool boolIsothnrbc_bot = ((bottombc==3) && (id.i == 0));
+	bool boolFreestreamnrbc_top = ((topbc==4) && (id.i == mx_tot-1));
+	bool boolAdiabnrbc_top = ((topbc==6) && (id.i == mx_tot-1));
+	bool boolAdiabnrbc_bot = ((bottombc==6) && (id.i == 0));
+
 	myprec rXtmp=0;
 	myprec uXtmp=0;
 	myprec vXtmp=0;
@@ -28,14 +39,13 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 	myprec wrk2=0;
 	myprec wrk3=0;
 
-        
-	__shared__ myprec s_u[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_v[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_w[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_t[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_p[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_prop1[sPencils/2][mx+stencilSize*2];
-	__shared__ myprec s_prop2[sPencils/2][mx+stencilSize*2];
+	__shared__ myprec s_u[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_v[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_w[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_t[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_p[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_prop1[sPencils/2][mx/nX+stencilSize*2];
+	__shared__ myprec s_prop2[sPencils/2][mx/nX+stencilSize*2];
 
 	s_u[sj][si] = u[id.g];
 	s_v[sj][si] = v[id.g];
@@ -48,19 +58,43 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 
 	// Boundary conditions in the x-direction are in boundary_condition_x.h
 	// these are the BCs for u,v,w,p,t,mu,lambda
-	BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
+	if (id.tix < stencilSize){
+		if (nX ==1){
+			TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc);
+			BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc);
+		} else {
+			if (iNum ==0){
+				TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id);
+				TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id);
+
+				BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc);
+			} else if (iNum == nX-1){
+				BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id);
+				BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id);
+
+				TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc);
+			} else {
+				TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id);
+				TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id);
+
+				BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id);
+				BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id);
+			}
+		}
+	}
+
 	__syncthreads();
 
 	derDevSharedV1x(&wrk1,s_u[sj],si);
 	derDevSharedV1x(&wrk2,s_v[sj],si);
 	derDevSharedV1x(&wrk3,s_w[sj],si);
 
-        dudx[id.g] = wrk1;
-        dvdx[id.g] = wrk2;
-        dwdx[id.g] = wrk3;
+	dudx[id.g] = wrk1;
+	dvdx[id.g] = wrk2;
+	dwdx[id.g] = wrk3;
 
-        rXtmp  = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable.
-        dil[id.g] = rXtmp; 
+	rXtmp  = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable.
+	dil[id.g] = rXtmp;
 	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
 	uXtmp = ( 2 * wrk1 - 2./3.*rXtmp ); //rXtmp stores dilatation
 	vXtmp = (     wrk2 + dudy[id.g]  );
@@ -71,7 +105,7 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 
 	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
 	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
-        uXtmp *= wrk2;
+	uXtmp *= wrk2;
 	vXtmp *= wrk2;
 	wXtmp *= wrk2;
 
@@ -83,29 +117,60 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 	derDevSharedV2x(&wrk3,s_w[sj],si);
 	wXtmp = wXtmp + wrk3*s_prop1[sj][si];
 
+	if ( boolFreestreamnrbc_top ) {// This is what the NSCBC says (to put derivative of tangential stress ==0. BUT NOTE THAT WE ARE not setting THE CROSS DERIVATIVE TERMS COMING OUT OF TAU12 AND TAU 13 I.E. mu d2u/dxdy and  mu d2u/dxdz equal to zero. This is because to do it we need to edit the Y and Z kernel which will make the code complicated.;
+		vXtmp = 0;
+		wXtmp = 0;
+	}
 	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
 	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
 
 	//adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx)
-	derDevSharedV2x(&wrk1,s_t[sj],si);
-	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
-	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
-	derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx
-	eXtmp = eXtmp + wrk2*wrk3;
 
+	if ( boolFreestreamnrbc_top ) { // dqdx = 0;
+		eXtmp = eXtmp + 0;
+	} else {
+		derDevSharedV2x(&wrk1,s_t[sj],si);
+		eXtmp = eXtmp + wrk1*s_prop2[sj][si];
+		derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
+		derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx
+		eXtmp = eXtmp + wrk2*wrk3;
+	}
 	// pressure and dilation derivatives
+	__syncthreads();
 	s_prop2[sj][si] = rXtmp; // rXtmp stores dilatation temporarily
 	__syncthreads();
 
 	// these are the BCs for the dilatation
-	BCxNumber2(s_prop2[sj],id,si,mx);
+
+	if (id.tix < stencilSize){
+		if (nX ==1){
+			TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc);
+			BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc);
+		} else {
+			if (iNum ==0){
+				TopBCxCpy(s_prop2[sj],dil,si,id);
+				BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc);
+			} else if (iNum == nX-1){
+				BotBCxCpy(s_prop2[sj],dil,si,id);
+				TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc);
+			} else {
+				TopBCxCpy(s_prop2[sj],dil,si,id);
+				BotBCxCpy(s_prop2[sj],dil,si,id);
+			}
+		}
+	}
 	__syncthreads();
 
 	derDevSharedV1x(&wrk2,s_prop2[sj],si);
 	derDevShared1x(&wrk1 ,s_p[sj],si);
+
+	if (boolFreestreamnrbc_top){ // pressure is taken care in the nrbc
+		wrk1 = 0;
+	}
 	uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0     - wrk1 ;
 	eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si];
 
+
 	//Adding here the terms - d (ru phi) dx;
 	s_prop1[sj][si] = r[id.g];
 	s_prop2[sj][si] = h[id.g];
@@ -113,30 +178,62 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre
 
 	// fill in periodic images in shared memory array
 	// these are the BCs for rho (prop1) and enthalpy (prop2)
-	BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
-	__syncthreads();
 
-	fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si);
-	rXtmp = wrk3;
-	__syncthreads();
-	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
-	uXtmp = uXtmp + wrk1;
-	__syncthreads();
-	fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si);
-	vXtmp = vXtmp + wrk2;
-	__syncthreads();
-	fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si);
-	wXtmp = wXtmp + wrk3;
-	__syncthreads();
-	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
-	eXtmp = eXtmp + wrk1;
+
+	if (id.tix < stencilSize){
+		if (nX ==1){
+			TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc);
+			BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc);
+		} else {
+			if (iNum ==0){
+				TopBCxCpy(s_prop1[sj],r,si,id);TopBCxCpy(s_prop2[sj],h,si,id);
+				BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc);
+			} else if (iNum == nX-1){
+				BotBCxCpy(s_prop1[sj],r,si,id);BotBCxCpy(s_prop2[sj],h,si,id);
+				TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc);
+			} else {
+				TopBCxCpy(s_prop1[sj],r,si,id); TopBCxCpy(s_prop2[sj],h,si,id);
+				BotBCxCpy(s_prop1[sj],r,si,id); BotBCxCpy(s_prop2[sj],h,si,id);
+			}
+		}
+	}
 	__syncthreads();
 
+
+	if ( boolIsothnrbc_top){
+		IsothnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id);
+	} else if (boolIsothnrbc_bot){
+		IsothnrbcX_bot(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id);
+	} else if (boolFreestreamnrbc_top){
+		FreestreamnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id);
+	} else if (boolAdiabnrbc_top){
+
+	} else if (boolAdiabnrbc_bot){
+
+	} else {
+
+		fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); // DONT FORGET TO REMOVE SYNCTHREADS FROM THESE FUNCTIONS AS IT IS PLACED IN A DIVERGENT PATH.
+		rXtmp = wrk3;
+		fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
+		uXtmp = uXtmp + wrk1;
+		fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si);
+		vXtmp = vXtmp + wrk2;
+		fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si);
+		wXtmp = wXtmp + wrk3;
+		fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
+		eXtmp = eXtmp + wrk1;
+	}	
+
+
+
 	rX[id.g] = rXtmp;
 	uX[id.g] = uXtmp;
 	vX[id.g] = vXtmp;
 	wX[id.g] = wXtmp;
 	eX[id.g] = eXtmp;
+
+	__syncthreads();
+
 }
 
 __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY,
@@ -149,15 +246,17 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	int jNum = blockIdx.z;
 	id.mkidYFlx(jNum);
 
-        int tid = id.tix + id.bdx*id.tiy;
+	int tid = id.tix + id.bdx*id.tiy;
 
 	int si  = id.tiy + stencilSize;       // local i for shared memory access + halo offset
 	int sj  = id.tix;                   // local j for shared memory access
-        int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
-        int sj1 = tid/id.bdy;
+	int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+	int sj1 = tid/id.bdy;
+
+
 
-       // Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
-       // id1.mkidYFlx(jNum);
+	// Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+	// id1.mkidYFlx(jNum);
 
 	myprec rYtmp=0;
 	myprec uYtmp=0;
@@ -174,9 +273,9 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	__shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2];
 	__shared__ myprec s_prop[mx/nDivX][my/nDivY+stencilSize*2];
 	__shared__ myprec s_dil[mx/nDivX][my/nDivY+stencilSize*2];
-        __shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2];
+	__shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2];
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 	s_u[sj][si] = u[id.g];
 	s_v[sj][si] = v[id.g];
 	s_w[sj][si] = w[id.g];
@@ -187,45 +286,93 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	// fill in periodic images in shared memory array
 	// these are boundary conditions for u,v,w,mu and dilatation
 
-	BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum);
+	if (id.tiy < stencilSize){
+		if (nDivY ==1){
+			if(pRow>1) {
+				haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id);
+				haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id);
+
+				haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id);
+				haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id);
+			} else{
+				TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+				BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+			}
+		} else {
+			if (jNum ==0){
+				TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id);
+				TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id);
+
+				if(pRow>1) {
+
+					haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id);
+					haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id);
+				} else{
+					BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+				}
+			} else if (jNum == nDivY-1){
+				BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id);
+				BotBCyCpy(s_prop[sj],mu,si,id); BotBCyCpy(s_dil[sj],dil,si,id);
+				if(pRow>1) {
+					haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id);
+					haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id);
+
+				} else {
+					TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+				}
+			} else {
+				TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id);
+				TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id);
+
+				BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id);
+				BotBCyCpy(s_prop[sj],mu,si,id);BotBCyCpy(s_dil[sj],dil,si,id);
+			}
+		}
+	}
+
 	__syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 	derDevSharedV1y(&wrk1,s_u[sj1],si1); 
 	derDevSharedV1y(&wrk2,s_v[sj1],si1); 
 	derDevSharedV1y(&wrk3,s_w[sj1],si1); 
 
 	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
 
-
-
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-        s_tmp[sj][si] = dvdx[id.g];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	s_tmp[sj][si] = dvdx[id.g];
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	uYtmp = (    wrk1 + s_tmp[sj1][si1]        );
+	__syncthreads();
+        s_tmp[sj1][si1] = wrk1;
         __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-        uYtmp = (    wrk1 + s_tmp[sj1][si1]        );
-	vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] );
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        dudy[id.g] = s_tmp[sj][si];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
         __syncthreads();
-        s_tmp[sj][si] = dvdz[id.g];
-        __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
+	vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] );
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	__syncthreads();
+	s_tmp[sj][si] = dvdz[id.g];
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 	wYtmp = (    wrk3 + s_tmp[sj1][si1]        );
 
-        /*__syncthreads();
+	__syncthreads();
         s_tmp[sj1][si1] = wrk2;
         __syncthreads();
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-        dvdy[id.g] = s_tmp[sj][si];*/
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
-        __syncthreads();
-        s_tmp[sj1][si1] = wrk3;
-        __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
-        dwdy[id.g] = s_tmp[sj][si];
+        dvdy[id.g] = s_tmp[sj][si];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	__syncthreads();
+	s_tmp[sj1][si1] = wrk3;
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	dwdy[id.g] = s_tmp[sj][si];
+	__syncthreads();
+
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP 
-      
 
 
 	//adding the viscous dissipation part duidy*mu*siy
@@ -233,7 +380,7 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 
 	//Adding here the terms d (mu) dy * syj;
 	derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dy
-        uYtmp *= wrk2;
+	uYtmp *= wrk2;
 	vYtmp *= wrk2;
 	wYtmp *= wrk2;
 
@@ -253,33 +400,94 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	vYtmp = vYtmp + s_prop[sj1][si1]*wrk2/3.0;
 	eYtmp = eYtmp + s_prop[sj1][si1]*wrk2/3.0*s_v[sj1][si1];
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 
 	// pressure derivatives
-        __syncthreads();
+	__syncthreads();
 	s_dil[sj][si] = p[id.g];
 	__syncthreads();
 
 	// Boundary condition for pressure
-	BCyNumber2(s_dil[sj],p,id,si,my, jNum);
+	if (id.tiy < stencilSize){
+		if (nDivY ==1){
+			if(pRow>1) {
+				haloBCyTop(s_dil[sj],p,si,id);
+				haloBCyBot(s_dil[sj],p,si,id);
+			} else {
+				TopBCyNumber2(s_dil[sj],p,id,si,my);
+				BotBCyNumber2(s_dil[sj],p,id,si,my);
+			}
+		} else {
+			if (jNum ==0){
+				TopBCyCpy(s_dil[sj],p,si,id);
+				if(pRow>1) {
+					haloBCyBot(s_dil[sj],p,si,id);
+				} else {
+					BotBCyNumber2(s_dil[sj],p,id,si,my);
+				}
+			} else if (jNum == nDivY-1){
+				BotBCyCpy(s_dil[sj],p,si,id);
+				if(pRow>1) {
+					haloBCyTop(s_dil[sj],p,si,id);
+				} else {
+					TopBCyNumber2(s_dil[sj],p,id,si,my);
+				}
+			} else {
+				TopBCyCpy(s_dil[sj],p,si,id);
+				BotBCyCpy(s_dil[sj],p,si,id);
+			}
+		}
+	}
 	__syncthreads();
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 	derDevShared1y(&wrk1,s_dil[sj1],si1);
 	vYtmp = vYtmp - wrk1;
 
 
 	// fourier terms
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-        __syncthreads();   
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	__syncthreads();
 	s_prop[sj][si] = lam[id.g];
 	s_dil[sj][si]  = t[id.g];
 	__syncthreads();
 
 	// Boundary condition for temperature and thermal conductivity
-	BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum);
+
+	if (id.tiy < stencilSize){
+		if (nDivY ==1){
+			if(pRow>1) {
+				haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id);
+
+				haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id);
+			} else{
+				TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ;
+				BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ;
+			}
+		} else {
+			if (jNum ==0){
+				TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id);
+				if(pRow>1) {
+					haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id);
+				} else {
+					BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ;
+				}
+			} else if (jNum == nDivY-1){
+				BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id);
+				if(pRow>1) {
+					haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id);
+				} else {
+					TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ;
+				}
+			} else {
+				TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id);
+				BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id);
+			}
+		}
+	}
 	__syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 
 	derDevSharedV2y(&wrk1,s_dil[sj1],si1);
 	eYtmp = eYtmp + wrk1*s_prop[sj1][si1];
@@ -287,17 +495,46 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	derDevSharedV1y(&wrk1,s_dil[sj1] ,si1); //wrk1 = d (t) dy
 	eYtmp = eYtmp + wrk1*wrk2;
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 	//Adding here the terms - d (ru phi) dy;
 	__syncthreads();
 	s_prop[sj][si] = r[id.g];
 	s_dil[sj][si]  = h[id.g];
 	__syncthreads();
 	// Boundary condition for density and enthalpy
-	BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum);
+	if (id.tiy < stencilSize){
+		if (nDivY ==1){
+			if(pRow>1) {
+				haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id);
+				haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id);
+			} else{
+				TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ;
+				BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ;
+			}
+		} else {
+			if (jNum ==0){
+				TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id);
+				if(pRow>1) {
+					haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id);
+				} else{
+					BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ;
+				}
+			} else if (jNum == nDivY-1){
+				BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id);
+				if(pRow>1) {
+					haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id);
+				} else{
+					TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ;
+				}
+			} else {
+				TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id);
+				BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id);
+			}
+		}
+	}
 	__syncthreads();
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 	fluxQuadSharedy(&wrk1,s_prop[sj1],s_v[sj1],si1);
 	rYtmp = wrk1;
 	__syncthreads();
@@ -313,10 +550,10 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 	fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_dil[sj1],si1);
 	eYtmp = eYtmp + wrk1;
 	__syncthreads();
-//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.      
-        s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ;
-        __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.
+	s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ;
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
 
 #if useStreams
 	rY[id.g] = rYtmp;
@@ -335,202 +572,503 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre
 }
 
 __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ,
-        myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
-        myprec *t,  myprec *p,  myprec *mu, myprec *lam,
-        myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
-        myprec *dil, myprec *dpdz) {
-
-    Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
-    int kNum = blockIdx.z;
-    id.mkidZFlx(kNum);
-       
-    int tid = id.tix + id.bdx*id.tiy;
-
-    int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
-    int sj = id.tix;                   // local j for shared memory access
-    int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
-    int sj1 = tid/id.bdy;
-
-    //Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
-    //id1.mkidZFlx(kNum);
-
-    myprec rZtmp=0;
-    myprec uZtmp=0;
-    myprec vZtmp=0;
-    myprec wZtmp=0;
-    myprec eZtmp=0;
-
-    myprec wrk1=0;
-    myprec wrk2=0;
-    myprec wrk3=0;
-
-        //myprec tmpreg1[stencilSize*2+1];
-        //myprec tmpreg2[stencilSize*2+1];
-        //myprec tmpreg3[stencilSize*2+1];
-
-    __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2];
-    __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2];
-    __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2];
-    __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2];
-    __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2];
-    __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2];
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
+		myprec *dil, myprec *dpdz , Communicator rk, recycle rec) {
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-    s_u[sj][si] = u[id.g];
-    s_v[sj][si] = v[id.g];
-    s_w[sj][si] = w[id.g];
-    s_prop[sj][si] = mu[id.g];
-    s_dil[sj][si] = dil[id.g];
-    __syncthreads();
-
-    // fill in periodic images in shared memory array
-    // these are boundary conditions for u,v,w,mu and dilatation
-
-   
-        BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum);
-    __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-////BY COMPUTATION THREAD ARRANGEMENT
-    derDevSharedV1z(&wrk1,s_u[sj1],si1);
-    derDevSharedV1z(&wrk2,s_v[sj1],si1);
-    derDevSharedV1z(&wrk3,s_w[sj1],si1);
-    //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
-
-// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT.
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	int kNum = blockIdx.z;
+	id.mkidZFlx(kNum);
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-        s_tmp[sj][si] = dwdx[id.g];
-        __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-    uZtmp = (    wrk1 + s_tmp[sj1][si1]        );
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	int tid = id.tix + id.bdx*id.tiy;
+
+	int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tix;                   // local j for shared memory access
+	int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+	int sj1 = tid/id.bdy;
+
+
+	Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+	id1.mkidZFlx(kNum);
+
+	bool CellAtOutlet = ( (rk.kp == mz_tot - 1) && (id1.k == mz-1) );
+	bool CellAtInlet  = ( (rk.km == 0) && (id1.k == 0) );
+	bool boolOutflownrbc_top = ( (outletbc==4) && CellAtOutlet );
+	bool boolInflownrbc_bot  = ( (inletbc ==5) && CellAtInlet  );
+
+	myprec rZtmp=0;
+	myprec uZtmp=0;
+	myprec vZtmp=0;
+	myprec wZtmp=0;
+	myprec eZtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+	myprec wrk3=0;
+
+	//myprec tmpreg1[stencilSize*2+1];
+	//myprec tmpreg2[stencilSize*2+1];
+	//myprec tmpreg3[stencilSize*2+1];
+
+	__shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2];
+
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	s_u[sj][si] = u[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+	s_prop[sj][si] = mu[id.g];
+	s_dil[sj][si] = dil[id.g];
+	__syncthreads();
+
+	// fill in periodic images in shared memory array
+	// these are boundary conditions for u,v,w,mu and dilatation
+	if (id.tiy < stencilSize){
+		if (nDivZ ==1){
+			if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot..
+				if (inletbc == 1 || outletbc ==1 ){ // if periodic
+					haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+					haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id);
+
+					haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+					haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id);
+				} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+					if (rk.km == pCol-2){ // last block
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+						haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id);
+
+						TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc);
+					} else if (rk.kp == 1) {// first block
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+						haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id);
+
+						BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec);
+					} else { // all internal blocks
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+						haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id);
+
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+						haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id);
+					}
+				}
+
+			} else{
+				TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc);
+				BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec);
+			}
+		} else {
+			if (kNum ==0){
+				TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id);
+				TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+						haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.kp == 1){ // first block
+							BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec);
+						} else { // all other blocks
+							haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id);
+							haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id);
+						}
+					}
+
+				} else{
+					BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec);
+				}
+			} else if (kNum == nDivZ-1){
+				BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id);
+				BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+						haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.km == pCol -2 ){ // last block
+							TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc);
+						} else { // all other blocks
+							haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id);
+							haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id);
+						}
+					}
+				} else{
+					TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc);
+				}
+			} else {
+				TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id);
+				TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id);
+
+				BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id);
+				BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id);
+			}
+		}
+	}
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	////BY COMPUTATION THREAD ARRANGEMENT
+	derDevSharedV1z(&wrk1,s_u[sj1],si1);
+	derDevSharedV1z(&wrk2,s_v[sj1],si1);
+	derDevSharedV1z(&wrk3,s_w[sj1],si1);
+	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+
+	// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT.
+
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	s_tmp[sj][si] = dwdx[id.g];
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	uZtmp = (    wrk1 + s_tmp[sj1][si1]        );
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	__syncthreads();
+	s_tmp[sj][si] = dwdy[id.g];
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	vZtmp = (    wrk2 + s_tmp[sj1][si1]        );
+	wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] );
+
+	__syncthreads();
+        s_tmp[sj1][si1] = wrk1;
         __syncthreads();
-        s_tmp[sj][si] = dwdy[id.g];
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
+        dudz[id.g] = s_tmp[sj][si];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	__syncthreads();
+        s_tmp[sj1][si1] = wrk2;
         __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-    vZtmp = (    wrk2 + s_tmp[sj1][si1]        );
-    wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] );
-
-        /*__syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
+        dvdz[id.g] = s_tmp[sj][si];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	__syncthreads();
         s_tmp[sj1][si1] = wrk3;
         __syncthreads();
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM 
-        dwdz[id.g] = s_tmp[sj][si];*/
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+        dwdz[id.g] = s_tmp[sj][si];
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 
+        __syncthreads();
 
 
-    //adding the viscous dissipation part duidz*mu*siz
-    eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3);
+	//adding the viscous dissipation part duidz*mu*siz
+	eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3);
 
-    //Adding here the terms d (mu) dz * szj;
-    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz
-    uZtmp *= wrk2;
-    vZtmp *= wrk2;
-    wZtmp *= wrk2;
+	//Adding here the terms d (mu) dz * szj;
+	derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz
+	uZtmp *= wrk2;
+	vZtmp *= wrk2;
+	wZtmp *= wrk2;
 
-    //viscous fluxes derivative
-    derDevSharedV2z(&wrk1,s_u[sj1],si1);
-    uZtmp = uZtmp + wrk1*s_prop[sj1][si1];
-    derDevSharedV2z(&wrk1,s_v[sj1],si1);
-    vZtmp = vZtmp + wrk1*s_prop[sj1][si1];
-    derDevSharedV2z(&wrk1,s_w[sj1],si1);
-    wZtmp = wZtmp + wrk1*s_prop[sj1][si1];
+	//viscous fluxes derivative
+	derDevSharedV2z(&wrk1,s_u[sj1],si1);
+	uZtmp = uZtmp + wrk1*s_prop[sj1][si1];
+	derDevSharedV2z(&wrk1,s_v[sj1],si1);
+	vZtmp = vZtmp + wrk1*s_prop[sj1][si1];
+	derDevSharedV2z(&wrk1,s_w[sj1],si1);
+	wZtmp = wZtmp + wrk1*s_prop[sj1][si1];
 
-    //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz)
-    eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp;
+	if ( boolOutflownrbc_top ) {
+		uZtmp = 0; // derivative of tangential stress = 0 as per NSCBC
+		vZtmp = 0;
+	}
+	//adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz)
+	eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp;
 
-    //dilation derivatives
-    derDevSharedV1z(&wrk2,s_dil[sj1],si1);
-    wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0;
-    eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1];
+	//dilation derivatives
+	derDevSharedV1z(&wrk2,s_dil[sj1],si1);
+	wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0;
+	eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1];
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-//SHARED MEM LOADING OPERATION. - ID
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//SHARED MEM LOADING OPERATION. - ID
 
-    // pressure derivatives
-    __syncthreads();
-    s_dil[sj][si] = p[id.g];
-    __syncthreads();
-    BCzNumber2(s_dil[sj],p,id,si,mz,kNum);
-    __syncthreads();
+	// pressure derivatives
+	__syncthreads();
+	s_tmp[sj][si] = p[id.g];
+	__syncthreads();
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-    derDevShared1z(&wrk1,s_dil[sj1],si1);
-    wZtmp = wZtmp - wrk1;
+	if (id.tiy < stencilSize){
+		if (nDivZ ==1){
+			if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot..
+				if (inletbc == 1 || outletbc ==1 ){ // if periodic
+					haloBCzTop(s_tmp[sj],p,si,id);
+					haloBCzBot(s_tmp[sj],p,si,id);
+				} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+					if (rk.km == pCol-2){ // last block
+						haloBCzBot(s_tmp[sj],p,si,id);
+						TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc);
+					} else if (rk.kp == 1) {// first block
+						haloBCzTop(s_tmp[sj],p,si,id);
+						BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec);
+					} else { // all internal blocks
+						haloBCzTop(s_tmp[sj],p,si,id);
+						haloBCzBot(s_tmp[sj],p,si,id);
+					}
+				}
+			} else{
+				TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc);
+				BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec);
+			}
+		} else {
+			if (kNum ==0){
+				TopBCzCpy(s_tmp[sj],p,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzBot(s_tmp[sj],p,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.kp == 1){ // first block
+							BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec);
+						} else { // all other blocks
+							haloBCzBot(s_tmp[sj],p,si,id);
+						}
+					}
+				} else{
+					BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec);
+				}
+			} else if (kNum == nDivZ-1){
+				BotBCzCpy(s_tmp[sj],p,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzTop(s_tmp[sj],p,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.km == pCol -2 ){ // last block
+							TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc);
+						} else { // all other blocks
+							haloBCzTop(s_tmp[sj],p,si,id);
+						}
+					}
+				} else{
+					TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc);
+				}
+			} else {
+				TopBCzCpy(s_tmp[sj],p,si,id);
+				BotBCzCpy(s_tmp[sj],p,si,id);
+
+			}
+		}
+	}
+	__syncthreads();
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-    // fourier terms
-    __syncthreads();   
-        s_prop[sj][si] = lam[id.g];
-    s_dil[sj][si]  = t[id.g];
-    __syncthreads();
-    // Boundary conditions for thermal conductivity and temperature
-    BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum);
-    __syncthreads();
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-
-    derDevSharedV2z(&wrk1,s_dil[sj1],si1);
-    eZtmp = eZtmp + wrk1*s_prop[sj1][si1];
-    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz
-    derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz
-    eZtmp = eZtmp + wrk1*wrk2;
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	derDevShared1z(&wrk1,s_tmp[sj1],si1);
+	if ( boolOutflownrbc_top ||  boolInflownrbc_bot) {
+		wrk1 = 0; // pressure derivative taken care in the nrbc computation.
+	}
+	wZtmp = wZtmp - wrk1;
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-    //Adding here the terms - d (ru phi) dz;
-    __syncthreads();
-    s_prop[sj][si] = r[id.g];
-    s_dil[sj][si]  = h[id.g];
-    __syncthreads();
-    // Boundary conditions for denisty and enthalpy
-    BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum);
-    __syncthreads();
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
-    fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1);
-    rZtmp = wrk1;
-    __syncthreads();
-    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1);
-    uZtmp = uZtmp + wrk1;
-    __syncthreads();
-    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1);
-    vZtmp = vZtmp + wrk1;
-    __syncthreads();
-    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1);
-    wZtmp = wZtmp + wrk1;
-    __syncthreads();
-    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1);
-    eZtmp = eZtmp + wrk1;
-    __syncthreads();
-//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.      
-    s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ;
-    __syncthreads();
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
-        
-    rZ[id.g] += s_prop[sj][si];
-    uZ[id.g] += s_u[sj][si];
-    vZ[id.g] += s_v[sj][si];
-    wZ[id.g] += s_tmp[sj][si] + *dpdz;
-    eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si];
-    //dil[id.g] = dudx[id.g] + dvdy[id.g] + dwdz[id.g];
-
-//#if useStreams
-//	rZ[id.g] = rZtmp;
-//	uZ[id.g] = uZtmp;
-//	vZ[id.g] = vZtmp;
-//	wZ[id.g] = wZtmp + *dpdz;
-//	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
-//#else
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	// fourier terms
+	__syncthreads();
+	s_prop[sj][si] = lam[id.g];
+	s_dil[sj][si]  = t[id.g];
+	__syncthreads();
+	// Boundary conditions for thermal conductivity and temperature
+
+	if (id.tiy < stencilSize){
+		if (nDivZ ==1){
+			if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot..
+				if (inletbc == 1 || outletbc ==1 ){ // if periodic
+					haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id);
+					haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id);
+				} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+					if (rk.km == pCol-2){ // last block
+						haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id);
+						TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc);
+					} else if (rk.kp == 1) {// first block
+						haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id);
+						BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec);
+					} else { // all internal blocks
+						haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id);
+						haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id);
+					}
+				}
+			} else{
+				TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc);
+				BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec);
+			}
+		} else {
+			if (kNum ==0){
+				TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.kp == 1){ // first block
+							BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec);
+						} else { // all other blocks
+							haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id);
+						}
+					}
+				} else{
+					BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec);
+				}
+			} else if (kNum == nDivZ-1){
+				BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.km == pCol -2 ){ // last block
+							TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc);
+						} else { // all other blocks
+							haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id);
+						}
+					}
+				} else{
+					TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc);
+				}
+			} else {
+				TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id);
+
+				BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id);
+			}
+		}
+	}
 
+	__syncthreads();
 
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
 
 
+	if ( boolOutflownrbc_top ) {
+		eZtmp = eZtmp + 0.0; // dqdz = 0 at the outflow as per NSCBC
+	} else {
+		derDevSharedV2z(&wrk1,s_dil[sj1],si1);
+		eZtmp = eZtmp + wrk1*s_prop[sj1][si1];
+		derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz
+		derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz
+		eZtmp = eZtmp + wrk1*wrk2;
+	}
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+	//Adding here the terms - d (ru phi) dz;
+	__syncthreads();
+	s_prop[sj][si] = r[id.g];
+	s_dil[sj][si]  = h[id.g];
+	__syncthreads();
+	// Boundary conditions for denisty and enthalpy
+
+	if (id.tiy < stencilSize){
+		if (nDivZ ==1){
+			if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot..
+				if (inletbc == 1 || outletbc ==1 ){ // if periodic
+					haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id);
+					haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id);
+				} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+					if (rk.km == pCol-2){ // last block
+						haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id);
+						TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc);
+					} else if (rk.kp == 1) {// first block
+						haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id);
+						BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec);
+					} else { // all internal blocks
+						haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id);
+						haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id);
+					}
+				}
+			} else{
+				TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc);
+				BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec);
+			}
+		} else {
+			if (kNum ==0){
+				TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.kp == 1){ // first block
+							BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec);
+						} else { // all other blocks
+							haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id);
+						}
+					}
+				} else{
+					BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec);
+				}
+			} else if (kNum == nDivZ-1){
+				BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id);
+				if(pCol > 1) {
+					if (inletbc == 1 || outletbc ==1 ){ // if periodic
+						haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id);
+					} else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first)
+						if (rk.km == pCol -2 ){ // last block
+							TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc);
+						} else { // all other blocks
+							haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id);
+						}
+					}
+				} else{
+					TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc);
+				}
+			} else {
+				TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id);
+				BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id);
+			}
+		}
+	}
 
-//#endif
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+	if ( boolOutflownrbc_top){
+		OutflownrbcZ_top(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1);
+	} else if (boolInflownrbc_bot){
+		InflownrbcZ_bot(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1);
+	} else {
+		fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1);
+		rZtmp = wrk1;
+		fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1);
+		uZtmp = uZtmp + wrk1;
+		fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1);
+		vZtmp = vZtmp + wrk1;
+		fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1);
+		wZtmp = wZtmp + wrk1;
+		fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1);
+		eZtmp = eZtmp + wrk1;
+	}
+
+	//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.
+	__syncthreads();
+	s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ;
+	__syncthreads();
+	//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+
+	if (forcing) {
+		rZ[id.g] += s_prop[sj][si];
+		uZ[id.g] += s_u[sj][si];
+		vZ[id.g] += s_v[sj][si];
+		wZ[id.g] += s_tmp[sj][si] + *dpdz;
+		eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si];
+	} else {
+		rZ[id.g] += s_prop[sj][si];
+		uZ[id.g] += s_u[sj][si];
+		vZ[id.g] += s_v[sj][si];
+		wZ[id.g] += s_tmp[sj][si] ;
+		eZ[id.g] += s_dil[sj][si] ;
+	}
+	//#if useStreams
+	//	rZ[id.g] = rZtmp;
+	//	uZ[id.g] = uZtmp;
+	//	vZ[id.g] = vZtmp;
+	//	wZ[id.g] = wZtmp + *dpdz;
+	//	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
+	//#else
+
+
+
+
+
+	//#endif
 	__syncthreads();
 }
 
+
+
+
+
+
 //__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
 //		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
 //		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
diff --git a/src/cuda_rhs_backup_0511.cu b/src/cuda_rhs_backup_0511.cu
new file mode 100644
index 0000000..7d3d2e7
--- /dev/null
+++ b/src/cuda_rhs_backup_0511.cu
@@ -0,0 +1,554 @@
+#include "globals.h"
+#include "cuda_functions.h"
+#include "cuda_math.h"
+#include "sponge.h"
+#include "boundary_condition_x.h"
+#include "boundary_condition_y.h"
+#include "boundary_condition_z.h"
+
+__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
+		myprec *dil, myprec *dpdz, int iNum) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidX();
+
+	int si = id.i + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tiy;                   // local j for shared memory access
+
+	myprec rXtmp=0;
+	myprec uXtmp=0;
+	myprec vXtmp=0;
+	myprec wXtmp=0;
+	myprec eXtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+
+	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_t[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_p[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_prop1[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_prop2[sPencils][mx+stencilSize*2];
+
+	s_u[sj][si] = u[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+	s_t[sj][si] = t[id.g];
+	s_p[sj][si] = p[id.g];
+	s_prop1[sj][si] = mu[id.g];
+	s_prop2[sj][si] = lam[id.g];
+	__syncthreads();
+
+	// Boundary conditions in the x-direction are in boundary_condition_x.h
+	// these are the BCs for u,v,w,p,t,mu,lambda
+	BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+	uXtmp = ( 2 * dudx[id.g] - 2./3.*dil[id.g] );
+	vXtmp = (     dvdx[id.g] + dudy[id.g]  );
+	wXtmp = (     dwdx[id.g] + dudz[id.g]  );
+
+	//adding the viscous dissipation part duidx*mu*six
+	eXtmp = s_prop1[sj][si]*(uXtmp*dudx[id.g] + vXtmp*dvdx[id.g] + wXtmp*dwdx[id.g]);
+
+	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
+	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
+    uXtmp *= wrk2;
+	vXtmp *= wrk2;
+	wXtmp *= wrk2;
+
+	// viscous fluxes derivative mu*d^2ui dx^2
+	derDevSharedV2x(&wrk1,s_u[sj],si);
+	uXtmp = uXtmp + wrk1*s_prop1[sj][si];
+	derDevSharedV2x(&wrk1,s_v[sj],si);
+	vXtmp = vXtmp + wrk1*s_prop1[sj][si];
+	derDevSharedV2x(&wrk1,s_w[sj],si);
+	wXtmp = wXtmp + wrk1*s_prop1[sj][si];
+
+	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
+	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
+
+	//adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx)
+	derDevSharedV2x(&wrk1,s_t[sj],si);
+	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
+	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
+	derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx
+	eXtmp = eXtmp + wrk1*wrk2;
+
+	// pressure and dilation derivatives
+	s_prop2[sj][si] = dil[id.g];
+	__syncthreads();
+
+	// these are the BCs for the dilatation
+	BCxNumber2(s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	derDevSharedV1x(&wrk2,s_prop2[sj],si);
+	derDevShared1x(&wrk1 ,s_p[sj],si);
+	uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0     - wrk1 ;
+	eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si];
+
+	//Adding here the terms - d (ru phi) dx;
+	s_prop1[sj][si] = r[id.g];
+	s_prop2[sj][si] = h[id.g];
+	__syncthreads();
+
+	// fill in periodic images in shared memory array
+	// these are the BCs for rho (prop1) and enthalpy (prop2)
+	BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si);
+	rXtmp = wrk1;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
+	uXtmp = uXtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si);
+	vXtmp = vXtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si);
+	wXtmp = wXtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
+	eXtmp = eXtmp + wrk1;
+	__syncthreads();
+
+	rX[id.g] = rXtmp;
+	uX[id.g] = uXtmp;
+	vX[id.g] = vXtmp;
+	wX[id.g] = wXtmp;
+	eX[id.g] = eXtmp;
+}
+
+__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY,
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz,
+		myprec *dil, myprec *dpdz, int jNum) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidYFlx();
+
+	int si = id.j + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tiy;                   // local j for shared memory access
+
+	myprec rYtmp=0;
+	myprec uYtmp=0;
+	myprec vYtmp=0;
+	myprec wYtmp=0;
+	myprec eYtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+
+	__shared__ myprec s_u[sPencils][my+stencilSize*2];
+	__shared__ myprec s_v[sPencils][my+stencilSize*2];
+	__shared__ myprec s_w[sPencils][my+stencilSize*2];
+	__shared__ myprec s_prop[sPencils][my+stencilSize*2];
+	__shared__ myprec s_dil[sPencils][my+stencilSize*2];
+
+	s_u[sj][si] = u[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+	s_prop[sj][si] = mu[id.g];
+	s_dil[sj][si] = dil[id.g];
+	__syncthreads();
+
+	// fill in periodic images in shared memory array
+	// these are boundary conditions for u,v,w,mu and dilatation
+	BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my);
+	__syncthreads();
+
+	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+	uYtmp = (    dudy[id.g] + dvdx[id.g]        );
+	vYtmp = (2 * dvdy[id.g] - 2./3.*s_dil[sj][si] );
+	wYtmp = (    dwdy[id.g] + dvdz[id.g]        );
+
+	//adding the viscous dissipation part duidy*mu*siy
+	eYtmp = s_prop[sj][si]*(uYtmp*dudy[id.g] + vYtmp*dvdy[id.g] + wYtmp*dvdz[id.g]);
+
+	//Adding here the terms d (mu) dy * syj;
+	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy
+    uYtmp *= wrk2;
+	vYtmp *= wrk2;
+	wYtmp *= wrk2;
+
+	// viscous fluxes derivative
+	derDevSharedV2y(&wrk1,s_u[sj],si);
+	uYtmp = uYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV2y(&wrk1,s_v[sj],si);
+	vYtmp = vYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV2y(&wrk1,s_w[sj],si);
+	wYtmp = wYtmp + wrk1*s_prop[sj][si];
+
+	//adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy)
+	eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp;
+
+	//dilation derivatives
+	derDevSharedV1y(&wrk2,s_dil[sj],si);
+	vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0;
+	eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si];
+
+	// pressure derivatives
+	s_dil[sj][si] = p[id.g];
+	__syncthreads();
+
+	// Boundary condition for pressure
+	BCyNumber2(s_dil[sj],p,id,si,my);
+	__syncthreads();
+
+	derDevShared1y(&wrk1,s_dil[sj],si);
+	vYtmp = vYtmp - wrk1;
+
+	// fourier terms
+	s_prop[sj][si] = lam[id.g];
+	s_dil[sj][si]  = t[id.g];
+	__syncthreads();
+
+	// Boundary condition for temperature and thermal conductivity
+	BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my);
+	__syncthreads();
+
+	derDevSharedV2y(&wrk1,s_dil[sj],si);
+	eYtmp = eYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy
+	derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy
+	eYtmp = eYtmp + wrk1*wrk2;
+
+	//Adding here the terms - d (ru phi) dy;
+	s_prop[sj][si] = r[id.g];
+	s_dil[sj][si]  = h[id.g];
+	__syncthreads();
+	// Boundary condition for density and enthalpy
+	BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my);
+	__syncthreads();
+	fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si);
+	rYtmp = wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si);
+	uYtmp = uYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si);
+	vYtmp = vYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si);
+	wYtmp = wYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si);
+	eYtmp = eYtmp + wrk1;
+	__syncthreads();
+
+#if useStreams
+	rY[id.g] = rYtmp;
+	uY[id.g] = uYtmp;
+	vY[id.g] = vYtmp;
+	wY[id.g] = wYtmp;
+	eY[id.g] = eYtmp;
+#else
+	rY[id.g] += rYtmp;
+	uY[id.g] += uYtmp;
+	vY[id.g] += vYtmp;
+	wY[id.g] += wYtmp;
+	eY[id.g] += eYtmp;
+#endif
+	__syncthreads();
+}
+
+__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ,
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
+		myprec *dil, myprec *dpdz, int kNum) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+
+	id.mkidZFlx(kNum);
+
+	int si = id.tix + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tiy;                   // local j for shared memory access
+
+	myprec rZtmp=0;
+	myprec uZtmp=0;
+	myprec vZtmp=0;
+	myprec wZtmp=0;
+	myprec eZtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+
+	myprec tmpreg1=0;
+	myprec tmpreg2=0;
+	myprec tmpreg3=0;
+	myprec tmpreg4=0;
+	myprec tmpreg5=0;
+
+	__shared__ myprec s_u[sPencils][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_v[sPencils][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_w[sPencils][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_prop[sPencils][mz/nDivZ+stencilSize*2];
+	__shared__ myprec s_dil[sPencils][mz/nDivZ+stencilSize*2];
+
+	s_u[sj][si] = u[id.g];
+	s_prop[sj][si] = mu[id.g];
+	s_dil[sj][si] = dil[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+        tmpreg1 = dudz[id.g];
+        tmpreg2 = dwdx[id.g];
+        tmpreg3 = dvdz[id.g];
+        tmpreg4 = dwdy[id.g];
+        tmpreg5 = dwdz[id.g];
+
+	uZtmp = (    tmpreg1 + tmpreg2        );
+	vZtmp = (    tmpreg3 + tmpreg4        );
+        //__syncthreads(); // for dil - not needed as s_dil is the one updated by the same thread and hence RAW dependent
+	wZtmp = (2 * tmpreg5 - 2./3.*s_dil[sj][si] );
+	eZtmp = s_prop[sj][si]*(uZtmp*tmpreg1 + vZtmp*tmpreg3 + wZtmp*tmpreg5);
+	
+        __syncthreads();
+	BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum);
+	__syncthreads();
+
+	derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dz
+        derDevSharedV2z(&tmpreg1,s_u[sj],si); //& will come or not?
+	derDevSharedV2z(&tmpreg2,s_v[sj],si);
+	derDevSharedV2z(&tmpreg3,s_w[sj],si);
+        derDevSharedV1z(&tmpreg5,s_dil[sj],si);
+
+        uZtmp *= wrk2;
+	vZtmp *= wrk2;
+	wZtmp *= wrk2;
+       
+        tmpreg4 = s_prop[sj][si];
+
+	uZtmp = uZtmp + tmpreg1*tmpreg4;
+	vZtmp = vZtmp + tmpreg2*tmpreg4;
+	wZtmp = wZtmp + tmpreg3*tmpreg4;
+	eZtmp = eZtmp + s_u[sj][si]*uZtmp + s_v[sj][si]*vZtmp + s_w[sj][si]*wZtmp;
+
+	wZtmp = wZtmp + tmpreg4*tmpreg5/3.0;
+	eZtmp = eZtmp + tmpreg4*tmpreg5/3.0*s_w[sj][si];
+
+
+
+
+
+	s_dil[sj][si] = p[id.g];
+	__syncthreads();
+	BCzNumber2(s_dil[sj],p,id,si,mz,kNum);
+
+	__syncthreads();
+	derDevShared1z(&wrk1,s_dil[sj],si);
+	wZtmp = wZtmp - wrk1;
+
+	// fourier terms
+	s_prop[sj][si] = lam[id.g];
+	s_dil[sj][si]  = t[id.g];
+	__syncthreads();
+
+	// Boundary conditions for thermal conductivity and temperature
+	BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum);
+	__syncthreads();
+
+	derDevSharedV2z(&wrk1,s_dil[sj],si);
+	eZtmp = eZtmp + wrk1*s_prop[sj][si];
+	derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dz
+	derDevSharedV1z(&wrk1,s_dil[sj],si); //wrk1 = d (t) dz
+	eZtmp = eZtmp + wrk1*wrk2;
+
+	//Adding here the terms - d (ru phi) dz;
+	s_prop[sj][si] = r[id.g];
+	s_dil[sj][si]  = h[id.g];
+	__syncthreads();
+
+	// Boundary conditions for denisty and enthalpy
+	BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum);
+	__syncthreads();
+
+	fluxQuadSharedz(&wrk1,s_prop[sj],s_w[sj],si);
+	rZtmp = wrk1;
+	__syncthreads();
+	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_u[sj],si);
+	uZtmp = uZtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_v[sj],si);
+	vZtmp = vZtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_w[sj],si);
+	wZtmp = wZtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_dil[sj],si);
+	eZtmp = eZtmp + wrk1;
+	__syncthreads();
+
+#if useStreams
+	rZ[id.g] = rZtmp;
+	uZ[id.g] = uZtmp;
+	vZ[id.g] = vZtmp;
+	wZ[id.g] = wZtmp + *dpdz;
+	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
+#else
+	rZ[id.g] += rZtmp;
+	uZ[id.g] += uZtmp;
+	vZ[id.g] += vZtmp;
+	wZ[id.g] += wZtmp + *dpdz;
+	eZ[id.g] += eZtmp + *dpdz*s_w[sj][si] ;
+#endif
+	__syncthreads();
+}
+
+//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
+//		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+//		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+//		myprec *dil, myprec *dpdz) {
+//
+//	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+//	id.mkidX();
+//
+//	int si = id.i + stencilSize;       // local i for shared memory access + halo offset
+//	int sj = id.tiy;                   // local j for shared memory access
+//
+//	myprec rXtmp=0;
+//	myprec uXtmp=0;
+//	myprec vXtmp=0;
+//	myprec wXtmp=0;
+//	myprec eXtmp=0;
+//
+//	myprec wrk1=0;
+//	myprec wrk2=0;
+//
+//	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_t[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_p[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_prop1[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_prop2[sPencils][mx+stencilSize*2];
+//#if !periodicX
+//	__shared__ myprec s_s0[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_s4[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_s8[sPencils][mx+stencilSize*2];
+//#endif
+//	__shared__ myprec s_dil[sPencils][mx+stencilSize*2];
+//
+//	s_u[sj][si] = u[id.g];
+//	s_v[sj][si] = v[id.g];
+//	s_w[sj][si] = w[id.g];
+//	s_t[sj][si] = t[id.g];
+//	s_p[sj][si] = p[id.g];
+//	s_prop1[sj][si] = mu[id.g];
+//	s_prop2[sj][si] = lam[id.g];
+//#if !periodicX
+//	s_s0[sj][si]= gij[0][id.g];
+//	s_s4[sj][si]= gij[4][id.g];
+//	s_s8[sj][si]= gij[8][id.g];
+//#endif
+//	s_dil[sj][si] = dil[id.g];
+//	__syncthreads();
+//
+//	// fill in periodic images in shared memory array
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si);
+//		perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si);
+//		perBCx(s_prop2[sj],si);
+//#else
+//		wallBCxMir(s_p[sj],si);
+//		wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si);
+//		wallBCxExt(s_t[sj],si,TwallTop,TwallBot);
+//		mlBoundPT(s_prop1[sj], s_prop2[sj],  s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si);
+//		wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si);  wallBCxVel(s_s8[sj],si);
+//#endif
+//	}
+//
+//	__syncthreads();
+//
+//	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+//	uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] );
+//	vXtmp = (     gij[1][id.g] + gij[3][id.g]  );
+//	wXtmp = (     gij[2][id.g] + gij[6][id.g]  );
+//
+//	//adding the viscous dissipation part duidx*mu*six
+//	eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]);
+//
+//	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
+//	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
+//    uXtmp *= wrk2;
+//	vXtmp *= wrk2;
+//	wXtmp *= wrk2;
+//
+//	// viscous fluxes derivative mu*d^2ui dx^2
+//	derDevSharedV2x(&wrk1,s_u[sj],si);
+//	uXtmp = uXtmp + wrk1*s_prop1[sj][si];
+//	derDevSharedV2x(&wrk1,s_v[sj],si);
+//	vXtmp = vXtmp + wrk1*s_prop1[sj][si];
+//	derDevSharedV2x(&wrk1,s_w[sj],si);
+//	wXtmp = wXtmp + wrk1*s_prop1[sj][si];
+//
+//	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
+//	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
+//
+//	//adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx)
+//	derDevSharedV2x(&wrk1,s_t[sj],si);
+//	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
+//	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
+//	derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx
+//	eXtmp = eXtmp + wrk1*wrk2;
+//
+//	// pressure and dilation derivatives
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_dil[sj],si);
+//#else
+//		wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si);
+//#endif
+//	}
+//	__syncthreads();
+//
+//	derDevSharedV1x(&wrk2,s_dil[sj],si);
+//	derDevShared1x(&wrk1 ,s_p[sj],si);
+//	uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0     - wrk1 ;
+//	eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si];
+//
+//	//Adding here the terms - d (ru phi) dx;
+//	s_prop1[sj][si] = r[id.g];
+//	s_prop2[sj][si] = h[id.g];
+//	__syncthreads();
+//	// fill in periodic images in shared memory array
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si);
+//#else
+//		rhBoundPT(s_prop1[sj], s_prop2[sj],  s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si);
+//#endif
+//	}
+//
+//	fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si);
+//	rXtmp = wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
+//	uXtmp = uXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si);
+//	vXtmp = vXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si);
+//	wXtmp = wXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
+//	eXtmp = eXtmp + wrk1;
+//	__syncthreads();
+//
+//	rX[id.g] = rXtmp;
+//	uX[id.g] = uXtmp;
+//	vX[id.g] = vXtmp;
+//	wX[id.g] = wXtmp;
+//	eX[id.g] = eXtmp;
+//}
diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu
index d8756d6..d34c620 100644
--- a/src/cuda_utils.cu
+++ b/src/cuda_utils.cu
@@ -34,8 +34,8 @@ __global__ void cpyCoefficients(myprec *tmpSx, myprec *tmpVSx) {
 
 __constant__ myprec d_dx, d_dy, d_dz, d_d2x, d_d2y, d_d2z, d_x[mx], d_xp[mx], d_dxv[mx];
 
-dim3 d_block[5], grid0, gridBC,  gridHalo,  gridHaloY,  gridHaloZ;
-dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ;
+dim3  d_grid[5], grid0, gridBC,  gridHalo,  gridHaloY,  gridHaloZ, d_gridx, gridBCw;
+dim3  d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw;
 
 void copyThreadGridsToDevice(Communicator rk);
 void sanityCheckThreadGrids(Communicator rk);
@@ -65,6 +65,8 @@ void setGPUParameters(Communicator rk)
 	myprec *h_xp  = new myprec[mx];
 	myprec *h_dxv = new myprec[mx];
 
+
+
 	myprec h_dpdz;
 	if(forcing)  h_dpdz = 0.00372;
 	if(!forcing) h_dpdz = 0.0;
@@ -79,11 +81,16 @@ void setGPUParameters(Communicator rk)
 		h_x[i]   = x[i];
 		h_xp[i]  = xp[i];
 	}
-	h_dxv[0] = (x[1]+x[0])/2.0;
+/*	h_dxv[0] = (x[1]+x[0])/2.0;
 	for (int i=1; i<mx-1; i++) {
 		h_dxv[i]   = (x[i+1]-x[i-1])/2.0;
 	}
-	h_dxv[mx-1] = Lx - (x[mx-1]+x[mx-2])/2.0;
+	h_dxv[mx-1] = Lx - (x[mx-1]+x[mx-2])/2.0; //SIMONE'S METHOD*/
+
+	// better way to compute dx_array
+	for (int i=0; i<mx; i++) {
+		h_dxv[i]   = dx/xp[i]; // dx_nu = dx_u * dx_nu/dx_u
+	}
 
 	myprec h_d2x = h_dx*h_dx;
 	myprec h_d2y = h_dy*h_dy;
@@ -186,6 +193,9 @@ void copyThreadGridsToDevice(Communicator rk) {
 	d_grid[0]  = dim3(2*my / sPencils, mz, 1);
 	d_block[0] = dim3(mx, sPencils/2, 1);
 
+	d_gridx  = dim3(2*my / sPencils, mz, nX);
+	d_blockx = dim3(mx/nX, sPencils/2, 1);
+
 	// Y-grid (1) for RHS calculation and (3) for velocity derivative
 	d_grid[3]  = dim3(nDivX, mz, nDivY);
 	d_block[3] = dim3(mx/nDivX, my/nDivY , 1);
@@ -215,6 +225,9 @@ void copyThreadGridsToDevice(Communicator rk) {
 	gridHaloZ  = dim3(mx / sPencils, my, 1);
 	blockHaloZ = dim3(stencilSize, sPencils, 1);
 
+	gridBCw = dim3(mz, 1 , 1);
+	blockBCw = dim3(my , 1 , 1);
+
 	if(rk.rank==0) {
 		printf("Grid configuration:\n");
 		printf("Grid 0:  {%d, %d, %d} blocks. Blocks 0:  {%d, %d, %d} threads.\n",d_grid[0].x, d_grid[0].y, d_grid[0].z, d_block[0].x, d_block[0].y, d_block[0].z);
@@ -226,6 +239,8 @@ void copyThreadGridsToDevice(Communicator rk) {
 		printf("Grid H:  {%d, %d, %d} blocks. Blocks H:  {%d, %d, %d} threads.\n",gridHalo.x, gridHalo.y, gridHalo.z, blockHalo.x, blockHalo.y, blockHalo.z);
 		printf("Grid HY: {%d, %d, %d} blocks. Blocks HY: {%d, %d, %d} threads.\n",gridHaloY.x, gridHaloY.y, gridHaloY.z, blockHaloY.x, blockHaloY.y, blockHaloY.z);
 		printf("Grid HZ: {%d, %d, %d} blocks. Blocks HZ: {%d, %d, %d} threads.\n",gridHaloZ.x, gridHaloZ.y, gridHaloZ.z, blockHaloZ.x, blockHaloZ.y, blockHaloZ.z);
+		printf("Grid X: {%d, %d, %d} blocks. Blocks X: {%d, %d, %d} threads.\n",d_gridx.x, d_gridx.y, d_gridx.z, d_blockx.x, d_blockx.y, d_blockx.z);
+
 		printf("\n");
 	}
 }
@@ -516,6 +531,35 @@ void initSolver(Communicator rk) {
 	checkCuda( cudaMallocHost(&rcvZm5, 5*bytesZ) );
 	checkCuda( cudaMallocHost(&rcvZp5, 5*bytesZ) );
 
+
+	checkCuda( cudaMalloc( (void**)&recy_r, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_u, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_v, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_w, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_e, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_h, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_p, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_t, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_m, stencilSize*mx_tot*my*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&recy_l, stencilSize*mx_tot*my*sizeof(myprec) ) );
+
+	checkCuda( cudaMalloc( (void**)&rm, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&um, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&wm, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&tm, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&rm_in, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&um_in, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&wm_in, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&tm_in, stencilSize*mx_tot*sizeof(myprec) ) );
+
+	checkCuda( cudaMalloc( (void**)&a_inpl, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&b_inpl, stencilSize*mx_tot*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&idxm, stencilSize*mx_tot*sizeof(int) ) );
+	checkCuda( cudaMalloc( (void**)&idxp, stencilSize*mx_tot*sizeof(int) ) );
+
+	checkCuda( cudaMalloc( (void**)&delta_rec, stencilSize*sizeof(myprec) ) );
+	checkCuda( cudaMalloc( (void**)&delta_in, stencilSize*sizeof(myprec) ) );
+
 }
 
 void clearSolver(Communicator rk) {
@@ -559,6 +603,35 @@ void clearSolver(Communicator rk) {
 	checkCuda( cudaFree(d_m) );
 	checkCuda( cudaFree(d_l) );
 
+
+	checkCuda( cudaFree( recy_r));
+	checkCuda( cudaFree( recy_u));
+	checkCuda( cudaFree( recy_v));
+	checkCuda( cudaFree( recy_w));
+	checkCuda( cudaFree( recy_e));
+	checkCuda( cudaFree( recy_h));
+	checkCuda( cudaFree( recy_p));
+	checkCuda( cudaFree( recy_t));
+	checkCuda( cudaFree( recy_m));
+	checkCuda( cudaFree( recy_l));
+
+	checkCuda( cudaFree( rm   ));
+	checkCuda( cudaFree( um   ));
+	checkCuda( cudaFree( wm   ));
+	checkCuda( cudaFree( tm   ));
+	checkCuda( cudaFree( rm_in));
+	checkCuda( cudaFree( um_in));
+	checkCuda( cudaFree( wm_in));
+	checkCuda( cudaFree( tm_in));
+
+	checkCuda( cudaFree( a_inpl));
+	checkCuda( cudaFree( b_inpl));
+	checkCuda( cudaFree( idxm  )); 
+	checkCuda( cudaFree( idxp  ));
+
+	checkCuda( cudaFree( delta_rec ));
+	checkCuda( cudaFree( delta_in  ));
+
 	if(!lowStorage) {
 		checkCuda( cudaFree(d_rO) );
 		checkCuda( cudaFree(d_eO) );
diff --git a/src/globals.h b/src/globals.h
index f50b4da..2246b79 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -14,50 +14,98 @@
 #define pRow 1
 #define pCol 1
 
+#define GPUperNode 4
+
 //Remember : viscous stencil should ALWAYS be smaller than the advective stencil!!! (otherwise errors in how you load global into shared memory)
 #define stencilSize 3  // the order is double the stencilSize (advective fluxes stencil)
 #define stencilVisc 2  // the order is double the stencilVisc (viscous fluxes stencil)
 
-#define Lx       (2.0)
-#define Ly       (3.0)
-#define Lz       (10.0)
-#define mx_tot   176
-#define my_tot   216 // 246
-#define mz_tot   432
-#define nsteps   2000
+#define boundaryLayer (true)
+#define channel (false)
+#define misc (false)
+
+#define Lx       (12.0)
+#define Ly       (10.0)
+#define Lz       (52.0)
+#define mx_tot   240
+#define my_tot   60 // 246
+#define mz_tot   2048
+#define nsteps   10
 #define nfiles   1
 #define CFL      0.75f
 #define restartFile -1
+#define spanshift (true)
+
+#define recstn   1500  
 
 #define lowStorage    (false)
-#define boundaryLayer (false)
 #define perturbed     (false)
-#define forcing       (true)
-#define periodicX     (false)
-#define nonUniformX   (true)
 
 #define checkCFLcondition 100
-#define checkBulk 100
+#define checkBulk         100
+#define checkMeanRec      1
 
-#define Re       7500.0
-#define Pr       0.70
-#define Ma       0.7
+#define Re       16609
+#define Pr       0.7
+#define Ma       1.9
 #define viscexp  0.75
 #define gam      1.4
 #define Ec       ((gam - 1.f)*Ma*Ma)
 #define Rgas     (1.f/(gam*Ma*Ma))
+#define pinf     (Rgas)
+#define rfac        0.89
+#define Trat     1
+#define Tr       (1 + 0.5*(gam-1)*rfac*Ma*Ma)
+
+#if boundaryLayer
+
+#define forcing       (false)
+#define periodicX     (false)
+#define nonUniformX   (true)
+
+//1 = periodic ; 2 = wall_staggered; 3 = wall_centered; 4= 0th order extrapolation 5 = Inflow
+#define inletbc   (5)
+#define outletbc  (4)
+#define bottombc  (3)
+#define topbc     (4)
+
+#define gridtype  (0) // 0 = BL; 1 = channel_centered; 2 = channel staggered, no wall ,etc.
+
+const myprec TwallTop = 0.0; //Holds no relevance for BL
+const myprec TwallBot = Trat*Tr;
+
+#elif channel
+
+#define forcing       (true)
+#define periodicX     (false)
+#define nonUniformX   (true)
+
+//1 = periodic ; 2 = wall_staggered; 3 = wall_centered; 4= 0th order extrapolation 5 = Inflow
+#define inletbc   (1)
+#define outletbc  (1)
+#define bottombc  (2)
+#define topbc     (2)
+
+#define gridtype  (2) // 0 = BL; 1 = channel_centered; 2 = channel staggered, no wall ,etc.
 
-const double stretch  = 3.0;
 const myprec TwallTop = 1.0;
 const myprec TwallBot = 1.0;
 
+#elif misc
+
+#endif
+
+const double stretch  = 5.0;
+
 #define mx (mx_tot)
 #define my (my_tot/pRow)
 #define mz (mz_tot/pCol)
 
-#define nDivX 	(11) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
-#define nDivY 	(9) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
-#define nDivZ 	(18) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nX 	(1) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+
+#define nDivX 	(15) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nDivY 	(16) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
+#define nDivZ 	(64) 	// never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations)
 
 #define idx(i,j,k) \
 		({ ( k )*mx*my +( j )*mx + ( i ); })
diff --git a/src/init.cpp b/src/init.cpp
index ec239b2..c64d556 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -16,9 +16,9 @@ void initField(int timestep, Communicator rk) {
 	readFileMPI('v',timestep,v,rk);
 	readFileMPI('w',timestep,w,rk);
 	readFileMPI('e',timestep,e,rk);
-    if(rk.rank==0) {
-    	printf("finished initializing field\n");
-    }
+	if(rk.rank==0) {
+		printf("finished initializing field\n");
+	}
 }
 
 void writeField(int timestep, Communicator rk) {
@@ -33,40 +33,100 @@ void initGrid(Communicator rk) {
 
 	//constant grid in y and z and stretched in x
 
-	dx = Lx*(1.0)/(mx_tot);
+	if (gridtype ==0) {
+
+		dx = Lx*(1.0)/(mx_tot-1);// because we have points at the domain boundaries.mx_tot is the total points and mx_tot-1 is the number of parts in which domain is divided
+		//dy = Ly*(1.0)/(my_tot);// Periodic direction
+		//dz = Lz*(1.0)/(mz_tot-1);// because we have points at the domain boundaries.
+
+		// y mesh - uniform
+		for(int j=0;j<my_tot;j++) {
+			y[j]=Ly*(0.5+j*1.0)/(my_tot);  } //staggered
+
+		// z mesh - uniform
+		for(int k=0;k<mz_tot;k++) {
+			z[k]=Lz*(k*1.0)/(mz_tot-1);  }
+
+
+		//stretching for wall-normal direction.
+
+		myprec tanhyp = 0.5; // for boundary layers. For channel tanhyp is 1
+		for(int i=0;i<mx_tot;i++) {
+			myprec fact    =  tanhyp*(i)/(mx_tot-1.0) - 0.5 ;
+			x[i]    =  0.5/tanhyp*(1.0+tanh(stretch*fact)/tanh(stretch*0.5))*(Lx) ;
+			xp[i]   =  0.5*stretch/tanh(stretch*0.5)/pow(cosh(stretch*fact), 2.0) ; // dx/dx_uni = dx/di * di/dx_uni = (dx/di) / dx_uni
+			xpp[i]  =  -tanhyp*pow(stretch,2.0)*tanh(stretch*fact)/tanh(stretch*0.5)/pow(cosh(stretch*fact),2.0) ; //d2x/dx2_uni
+
+			xp[i] = 1/xp[i]; // taking inverse because everywhere dx/dx_uni will come in the denominator. Thus doing this, we can just multiply by xp to save computation effort in division.
+		}
+
+	} else if (gridtype == 1) {
+
+		dx = Lx*(1.0)/(mx_tot-1);// because we have points at the domain boundaries.mx_tot is the total points and mx_tot-1 is the number of parts in which domain is divided
+		//dy = Ly*(1.0)/(my_tot);// Periodic direction
+		//dz = Lz*(1.0)/(mz_tot-1);// because we have points at the domain boundaries.
+
+		// y mesh - uniform
+		for(int j=0;j<my_tot;j++) {
+			y[j]=Ly*(0.5+j*1.0)/(my_tot);  } //staggered
 
-	double xn[mx_tot+1];
-	int denom = mx_tot;
-	myprec denom2 = 2.0;
-	if(boundaryLayer) {
+		// z mesh - uniform
+		for(int k=0;k<mz_tot;k++) {
+			z[k]=Lz*(0.5+k*1.0)/(mz_tot);  }
+
+
+		//stretching for wall-normal direction.
+
+		myprec tanhyp = 1; // for boundary layers. For channel tanhyp is 1
+		for(int i=0;i<mx_tot;i++) {
+			myprec fact    =  tanhyp*(i)/(mx_tot-1.0) - 0.5 ;
+			x[i]    =  0.5/tanhyp*(1.0+tanh(stretch*fact)/tanh(stretch*0.5))*(Lx) ;
+			xp[i]   =  0.5*stretch/tanh(stretch*0.5)/pow(cosh(stretch*fact), 2.0) ; // dx/dx_uni = dx/di * di/dx_uni = (dx/di) / dx_uni
+			xpp[i]  =  -tanhyp*pow(stretch,2.0)*tanh(stretch*fact)/tanh(stretch*0.5)/pow(cosh(stretch*fact),2.0) ; //d2x/dx2_uni
+
+			xp[i] = 1/xp[i]; // taking inverse because everywhere dx/dx_uni will come in the denominator. Thus doing this, we can just multiply by xp to save computation effort in division.
+
+		}
+
+	}else if (gridtype == 2){
+
+		dx = Lx*(1.0)/(mx_tot);
+
+		double xn[mx_tot+1];
+		int denom = mx_tot;
+		myprec denom2 = 2.0;
+		/*if(boundaryLayer) {
 		denom *= 2;
 		denom2/= 2;
-	}
+	}*/
 
-	for (int i=0; i<mx_tot+1; i++)
-		xn[i] = tanh(stretch*((i*1.0)/denom-0.5))/tanh(stretch*0.5);
+		for (int i=0; i<mx_tot+1; i++)
+			xn[i] = tanh(stretch*((i*1.0)/denom-0.5))/tanh(stretch*0.5)	;
 
-	for (int i=0; i<mx_tot; i++)
-		x[i] = Lx * (1.0 + (xn[i] + xn[i+1])/2.0)/denom2;
+		for (int i=0; i<mx_tot; i++)
+			x[i] = Lx * (1.0 + (xn[i] + xn[i+1])/2.0)/denom2;
 
-	derivGrid(xpp,xp,x,dx);
+		derivGrid(xpp,xp,x,dx);
 
-	for (int i=0; i<mx_tot; i++)
-		xp[i] = 1.0/xp[i];
+		for (int i=0; i<mx_tot; i++)
+			xp[i] = 1.0/xp[i];
 
 #if !nonUniformX
-	for(int i=0;i<mx_tot;i++) {
+		for(int i=0;i<mx_tot;i++) {
 			x[i]=Lx*(0.5+i*1.0)/(mx_tot);  }
 
-	dx = x[1] - x[0];
+		dx = x[1] - x[0];
 
 #endif
 
-	for(int j=0;j<my_tot;j++) {
-		y[j]=Ly*(0.5+j*1.0)/(my_tot);  }
+		for(int j=0;j<my_tot;j++) {
+			y[j]=Ly*(0.5+j*1.0)/(my_tot);  }
+
+		for(int k=0;k<mz_tot;k++) {
+			z[k]=Lz*(0.5+k*1.0)/(mz_tot);  }
+
+	}
 
-	for(int k=0;k<mz_tot;k++) {
-		z[k]=Lz*(0.5+k*1.0)/(mz_tot);  }
 
 	if(rk.rank==0) {
 
@@ -123,6 +183,59 @@ void initChannel(Communicator rk) {
 			} } }
 }
 
+void initBL(Communicator rk) {
+
+	readFileMPI('r',0,r,rk);
+	readFileMPI('u',0,u,rk);
+	readFileMPI('v',0,v,rk);
+	readFileMPI('w',0,w,rk);
+	readFileMPI('e',0,e,rk);
+	if(rk.rank==0) {
+		printf("finished Reading Streams outputted initial file\n");
+	}
+/*	double cv = Rgas/(gam-1.0);
+	myprec *t       = (myprec*)malloc(mx*my*mz*sizeof(myprec));
+	myprec *idx     = (myprec*)malloc(mz*sizeof(myprec));
+	myprec *delta99 = (myprec*)malloc(mz*sizeof(myprec));
+
+	readFileInitBL('r',r,rk); //all the variables are initialized with the mean
+	readFileInitBL('u',u,rk);
+	readFileInitBL('w',w,rk);
+	readFileInitBL('t',t,rk);
+	readFileInitBL1D('i',idx,rk); //idx indicates cell location corresponding to the BL thickness.         
+	readFileInitBL1D('d',delta99,rk);
+
+
+	for (int i=0; i<mx; i++) {
+		for (int j=0; j<my; j++) {
+			for (int k=0; k<mz; k++) {  
+				double rr1 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double rr2 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double rr3 = rand()*1.0/(RAND_MAX*1.0) - 0.5;
+				double ufluc  =  0.02*rr1;
+				double vfluc  =  0.02*rr2;
+				double wfluc  =  0.02*rr3;
+
+				//double wmean  = 1.5*U0*R0*x[i]*(1.0-x[i]/Lx);
+				/*if (i<=idx[k]) {
+				ufluc          = ufluc + 0.05*sin(0.5*M_PI*x[i]/delta99[k])*cos(2*M_PI*y[j+rk.jstart]);
+				vfluc          = vfluc + 0.05*sin(0.5*M_PI*x[i]/delta99[k])*sin(2*M_PI*y[j+rk.jstart]);
+				}
+
+				u[idx(i,j,k)] += ufluc;
+				v[idx(i,j,k)]  = vfluc;
+				w[idx(i,j,k)] += wfluc;
+				r[idx(i,j,k)] += 0    ; // only mean
+				e[idx(i,j,k)]  = r[idx(i,j,k)] *(cv*t[idx(i,j,k)] + 0.5 *  (pow(u[idx(i,j,k)],2) + pow(v[idx(i,j,k)],2) + pow(w[idx(i,j,k)],2)) );
+			} } }
+	free(t);
+	free(idx);
+	free(delta99);*/
+}
+
+
+
+
 void initCHIT(Communicator rk) {
 
 	double V0 = 1.0;
@@ -216,24 +329,24 @@ void printRes(Communicator rk) {
 
 	for (int k=0; k<mz; k++)
 		for (int j=0; j<my; j++) {
-			    temw   = 1.0;
-			    double suth = pow(temw,viscexp);
-			    muw      = suth/Re;
+			temw   = 1.0;
+			double suth = pow(temw,viscexp);
+			muw      = suth/Re;
 
-			    double ub[stencilSize*2+1];
-			    for (int i=stencilSize; i<stencilSize*2+1; i++)
-			    	ub[i] = w[i-stencilSize + j*mx + k*my*mx];
-			    for (int i=0; i<stencilSize; i++)
-			    	ub[i] = w[stencilSize-i-1 + j*mx + k*my*mx];
+			double ub[stencilSize*2+1];
+			for (int i=stencilSize; i<stencilSize*2+1; i++)
+				ub[i] = w[i-stencilSize + j*mx + k*my*mx];
+			for (int i=0; i<stencilSize; i++)
+				ub[i] = w[stencilSize-i-1 + j*mx + k*my*mx];
 
-			    double dudx = 0;
-			    for (int i=0; i<stencilSize; i++)
-			    	dudx += coeffF[i]*(ub[i]-ub[stencilSize*2-i])/dx;
+			double dudx = 0;
+			for (int i=0; i<stencilSize; i++)
+				dudx += coeffF[i]*(ub[i]-ub[stencilSize*2-i])/dx;
 
-			    dudx *= xp[0];
+			dudx *= xp[0];
 
-			    ut  = sqrt(muw*abs(dudx)/r[idx(0,j,k)]);
-			    Ret += ut*r[idx(0,j,k)]/muw;
+			ut  = sqrt(muw*abs(dudx)/r[idx(0,j,k)]);
+			Ret += ut*r[idx(0,j,k)]/muw;
 		}
 
 	Ret = Ret/my/mz;
@@ -291,20 +404,20 @@ void calcdt(Communicator rk) {
 
 		int i = gt%my;
 
-	    if(i==0) {
-	    	dx = (x[i+1] + x[i])/2.0;
-	    } else if (i==mx-1) {
-	    	dx = Lx - (x[i] + x[i-1])/2.0;
-	    } else {
-	    	dx = (x[i+1] - x[i-1])/2.0;
-	    }
+		if(i==0) {
+			dx = (x[i+1] + x[i])/2.0;
+		} else if (i==mx-1) {
+			dx = Lx - (x[i] + x[i-1])/2.0;
+		} else {
+			dx = (x[i+1] - x[i-1])/2.0;
+		}
 
-	    dx2 = dx*dx;
+		dx2 = dx*dx;
 
-	    double dtc1, dtv1;
+		double dtc1, dtv1;
 
-	    dtc1      =  MAX( (abs(u[gt]) + sos)/dx, MAX( (abs(v[gt]) + sos)/dy, (abs(w[gt]) + sos)/dz) );
-	    dtv1      =  MAX( 1.0/Re/dx2, MAX( 1.0/Re/dy2, 1.0/Re/dz2) );
+		dtc1      =  MAX( (abs(u[gt]) + sos)/dx, MAX( (abs(v[gt]) + sos)/dy, (abs(w[gt]) + sos)/dz) );
+		dtv1      =  MAX( 1.0/Re/dx2, MAX( 1.0/Re/dy2, 1.0/Re/dz2) );
 		dtConvInv =  MAX( dtConvInv, dtc1 );
 		dtViscInv =  MAX( dtViscInv, dtv1 );
 	}
@@ -315,12 +428,14 @@ void calcdt(Communicator rk) {
 }
 
 void restartWrapper(Communicator rk) {
-    if(restartFile<0) {
-    	if(forcing) {
-    		initChannel(rk);
-    	} else {
-    		if(!boundaryLayer) initCHIT(rk);
-    	}
-    } else {
-    	initField(restartFile,rk); }
+	if(restartFile<0) {
+		if(forcing) {
+			initChannel(rk);
+		}else if(boundaryLayer){
+			initBL(rk);
+		} else {
+			initCHIT(rk);
+		}
+	} else {
+		initField(restartFile,rk); }
 }
diff --git a/src/main.cpp b/src/main.cpp
index 23c381b..759f113 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,6 +7,7 @@
 #include "globals.h"
 #include "comm.h"
 
+
 using namespace std;
 
 double dt, h_dpdz;
@@ -48,9 +49,11 @@ int main(int argc, char** argv) {
     initGrid(rk);
 
     //If boundary layer set sponge on the GPU and initialize the solution field with references
-    if(boundaryLayer) calculateSponge(rk);
+    
+    //if(boundaryLayer) calculateSponge(rk);
 
     //Initialize the solution field
+
     restartWrapper(rk);
 
     //Output the initial field
diff --git a/src/main.h b/src/main.h
index bbc6a06..16638a4 100644
--- a/src/main.h
+++ b/src/main.h
@@ -9,7 +9,7 @@
 
 class Communicator {
 public:
-	int nodeRank, rank, jp, jm, kp, km;
+	int nodeRank, rank, jp, jm, kp, km, coodsend, coodrecv;
 	int jstart,jend,kstart,kend;
 	void myRank(int rk) {
 		rank = rk;
diff --git a/src/nrbcX.h b/src/nrbcX.h
new file mode 100644
index 0000000..d788e91
--- /dev/null
+++ b/src/nrbcX.h
@@ -0,0 +1,235 @@
+
+#ifndef NRBCX_H_
+#define NRBCX_H_
+
+extern __device__ __forceinline__ void IsothnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); 
+extern __device__ __forceinline__ void IsothnrbcX_bot(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); 
+extern __device__ __forceinline__ void FreestreamnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); 
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void IsothnrbcX_bot(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) {
+
+
+	myprec dp = 0 ;
+	myprec du = 0 ;
+	myprec dv = 0 ;
+	myprec dw = 0 ;
+	myprec dr = 0 ;
+
+	myprec wrk1 = 0;
+
+	derShared1x_FD(&dp, s_p, si) ;
+	derShared1x_FD(&du, s_u, si) ;
+	derShared1x_FD(&dv, s_v, si) ;
+	derShared1x_FD(&dw, s_w, si) ;
+	derShared1x_FD(&dr, s_r, si) ;
+
+	myprec sos = pow(gam*s_p[si]/s_r[si],0.5);
+
+	myprec ev[5];
+
+	ev[0] = s_u[si] - sos;
+	ev[1] = s_u[si]      ;
+	ev[2] = s_u[si]      ;
+	ev[3] = s_u[si]      ;
+	ev[4] = s_u[si] + sos;
+
+
+	myprec L[5];
+
+	if (ev[0]<0){
+		L[0] = ev[0]*(dp - s_r[si]*sos*du);
+	} else {
+		L[0] = 0; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) )
+	}
+	if (ev[1]<0){
+		L[1] = ev[1]*(sos*sos*dr - dp);
+	} else {
+		L[1] = 0;
+	}
+	if (ev[2]<0){
+		L[2] = ev[2]*(dv);
+	} else {
+		L[2] = 0;
+	}
+	if (ev[3]<0){
+		L[3] = ev[3]*(dw);
+	} else {
+		L[3] = 0;
+	}
+	if (ev[4]<0){
+		L[4] = ev[4]*(dp + s_r[si]*sos*du);
+	} else {
+		L[4] = L[0]; // used LODI relation eq 26, as du/dt = 0 at the boundary (no-slip)
+	}
+
+	myprec d[5];
+
+	d[0] = 1/(sos*sos)*( L[1] + 0.5*(L[4] + L[0]) ) ;
+	d[1] = 0.5*(  L[4] + L[0]  ) ;
+	d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ;
+	d[3] = L[2] ;
+	d[4] = L[3] ;
+
+	*rXtmp  = - d[0];
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_u,si);
+	*uXtmp = *uXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_v,si);
+	*vXtmp = *vXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_w,si);
+	*wXtmp = *wXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_h,si);
+	*eXtmp = *eXtmp + wrk1;
+}
+
+__device__ __forceinline__ __attribute__((always_inline)) void IsothnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) {
+
+
+	myprec dp = 0 ;
+	myprec du = 0 ;
+	myprec dv = 0 ;
+	myprec dw = 0 ;
+	myprec dr = 0 ;
+
+	myprec wrk1 = 0;
+
+	derShared1x_BD(&dp, s_p, si) ;
+	derShared1x_BD(&du, s_u, si) ;
+	derShared1x_BD(&dv, s_v, si) ;
+	derShared1x_BD(&dw, s_w, si) ;
+	derShared1x_BD(&dr, s_r, si) ;
+
+	myprec sos = pow(gam*s_p[si]/s_r[si],0.5);
+
+	myprec ev[5];
+
+	ev[0] = s_u[si] - sos;
+	ev[1] = s_u[si];
+	ev[2] = s_u[si];
+	ev[3] = s_u[si];
+	ev[4] = s_u[si] + sos;
+
+
+	myprec L[5];
+
+	if (ev[0]>0){
+		L[0] = ev[0]*(dp - s_r[si]*sos*du);
+	} else {
+		L[0] =  L[4] ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) )
+	}
+	if (ev[1]>0){
+		L[1] = ev[1]*(sos*sos*dr - dp);
+	} else {
+		L[1] = 0;
+	}
+	if (ev[2]>0){
+		L[2] = ev[2]*(dv);
+	} else {
+		L[2] = 0;
+	}
+	if (ev[3]>0){
+		L[3] = ev[3]*(dw);
+	} else {
+		L[3] = 0;
+	}
+	if (ev[4]>0){
+		L[4] = ev[4]*(dp + s_r[si]*sos*du);
+	} else {
+		L[4] = 0;
+	}
+
+	myprec d[5];
+
+	d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ;
+	d[1] = 0.5*( L[4] + L[0] ) ;
+	d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ;
+	d[3] = L[2] ;
+	d[4] = L[3] ;
+
+	*rXtmp  = - d[0];
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_u,si);
+	*uXtmp = *uXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_v,si);
+	*vXtmp = *vXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_w,si);
+	*wXtmp = *wXtmp + wrk1;
+	fluxCubeSharedx(&wrk1,s_r,s_u,s_h,si);
+	*eXtmp = *eXtmp + wrk1;
+
+
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void FreestreamnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) {
+
+	myprec dp = 0 ;
+	myprec du = 0 ;
+	myprec dv = 0 ;
+	myprec dw = 0 ;
+	myprec dr = 0 ;
+
+	derShared1x_BD(&dp, s_p, si) ;
+	derShared1x_BD(&du, s_u, si) ;
+	derShared1x_BD(&dv, s_v, si) ;
+	derShared1x_BD(&dw, s_w, si) ;
+	derShared1x_BD(&dr, s_r, si) ;
+
+	myprec sos = pow(gam*s_p[si]/s_r[si],0.5);
+
+	myprec ev[5];
+
+	ev[0] = s_u[si] - sos;
+	ev[1] = s_u[si];
+	ev[2] = s_u[si];
+	ev[3] = s_u[si];
+	ev[4] = s_u[si] + sos;
+
+	myprec sig = 0.25;
+	myprec K = sig * (1-Ma*Ma) * sos / Lx; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model.
+
+	myprec L[5];
+
+	if (ev[0]>0){
+		L[0] = ev[0]*(dp - s_r[si]*sos*du);
+	} else {
+		L[0] =  K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) )
+	}
+	if (ev[1]>0){
+		L[1] = ev[1]*(sos*sos*dr - dp);
+	} else {
+		L[1] = 0;
+	}
+	if (ev[2]>0){
+		L[2] = ev[2]*(dv);
+	} else {
+		L[2] = 0;
+	}
+	if (ev[3]>0){
+		L[3] = ev[3]*(dw);
+	} else {
+		L[3] = 0;
+	}
+	if (ev[4]>0){
+		L[4] = ev[4]*(dp + s_r[si]*sos*du);
+	} else {
+		L[4] = 0;
+	}
+
+	myprec d[5];
+
+	d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ;
+	d[1] = 0.5*( L[4] + L[0] ) ;
+	d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ;
+	d[3] = L[2] ;
+	d[4] = L[3] ;
+
+	*rXtmp   = - d[0];
+	*uXtmp  += - s_u[si]*d[0] - s_r[si]*d[2];
+	*vXtmp  += - s_v[si]*d[0] - s_r[si]*d[3];
+	*wXtmp  += - s_w[si]*d[0] - s_r[si]*d[4];
+	*eXtmp  += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[2] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[4]  )     ;
+
+}
+
+
+#endif
diff --git a/src/nrbcZ.h b/src/nrbcZ.h
new file mode 100644
index 0000000..a1d7481
--- /dev/null
+++ b/src/nrbcZ.h
@@ -0,0 +1,163 @@
+
+#ifndef NRBCZ_H_
+#define NRBCZ_H_
+
+extern __device__ __forceinline__ void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); 
+extern __device__ __forceinline__ void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); 
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) {
+
+
+	myprec dpf = 0 ;
+	myprec duf = 0 ;
+	myprec dvf = 0 ;
+	myprec dwf = 0 ;
+	myprec drf = 0 ;
+
+	myprec dpb = 0 ;
+	myprec dub = 0 ;
+	myprec dvb = 0 ;
+	myprec dwb = 0 ;
+	myprec drb = 0 ;
+
+
+	derShared1z_FD(&dpf, s_p, si) ;
+	derShared1z_FD(&duf, s_u, si) ;
+	derShared1z_FD(&dvf, s_v, si) ;
+	derShared1z_FD(&dwf, s_w, si) ;
+	derShared1z_FD(&drf, s_r, si) ;
+
+	derShared1z_BD(&dpb, s_p, si) ;
+	derShared1z_BD(&dub, s_u, si) ;
+	derShared1z_BD(&dvb, s_v, si) ;
+	derShared1z_BD(&dwb, s_w, si) ;
+	derShared1z_BD(&drb, s_r, si) ;
+
+	myprec sos = pow(gam*s_p[si]/s_r[si],0.5);
+
+	myprec ev[5];
+
+	ev[0] = s_w[si] - sos;
+	ev[1] = s_w[si]      ;
+	ev[2] = s_w[si]      ;
+	ev[3] = s_w[si]      ;
+	ev[4] = s_w[si] + sos;
+
+	myprec L[5];
+
+	if (ev[0]<0){
+		L[0] = ev[0]*(dpf - s_r[si]*sos*dwf);
+	} else {
+		L[0] = ev[0]*(dpb - s_r[si]*sos*dwb);
+	}
+	if (ev[1]<0){
+		L[1] = ev[1]*(sos*sos*drf - dpf);
+	} else {
+		L[1] = ev[1]*(sos*sos*drb - dpb);
+	}
+	if (ev[2]<0){
+		L[2] = ev[2]*(dvf);
+	} else {
+		L[2] = ev[2]*(dvb);
+	}
+	if (ev[3]<0){
+		L[3] = ev[3]*(duf);
+	} else {
+		L[3] = ev[3]*(dub);
+	}
+	if (ev[4]<0){
+		L[4] = ev[4]*(dpf + s_r[si]*sos*dwf);
+	} else {
+		L[4] = ev[4]*(dpb + s_r[si]*sos*dwb);
+	}
+
+	myprec d[5];
+
+	d[0] = 1/(sos*sos)*( L[1] + 0.5*(L[4] + L[0]) ) ;
+	d[1] = 0.5*(  L[4] + L[0]  ) ;
+	d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ;
+	d[3] = L[2] ;
+	d[4] = L[3] ;
+
+	*rZtmp   = - d[0];
+	*uZtmp  += - s_u[si]*d[0] - s_r[si]*d[4];
+	*vZtmp  += - s_v[si]*d[0] - s_r[si]*d[3];
+	*wZtmp  += - s_w[si]*d[0] - s_r[si]*d[2];
+	*eZtmp  += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2]  )     ;
+}
+
+
+__device__ __forceinline__ __attribute__((always_inline)) void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) {
+
+	myprec dp = 0 ;
+	myprec du = 0 ;
+	myprec dv = 0 ;
+	myprec dw = 0 ;
+	myprec dr = 0 ;
+
+	derShared1z_BD(&dp, s_p, si) ;
+	derShared1z_BD(&du, s_u, si) ;
+	derShared1z_BD(&dv, s_v, si) ;
+	derShared1z_BD(&dw, s_w, si) ;
+	derShared1z_BD(&dr, s_r, si) ;
+
+	myprec sos = pow(gam*s_p[si]/s_r[si],0.5);
+
+	myprec ev[5];
+
+	ev[0] = s_w[si] - sos;
+	ev[1] = s_w[si];
+	ev[2] = s_w[si];
+	ev[3] = s_w[si];
+	ev[4] = s_w[si] + sos;
+
+	myprec sig = 0.25;
+	myprec K = sig * (1-Ma*Ma) * sos / Lz; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model.
+
+	myprec L[5];
+
+	if (ev[0]>0){
+		L[0] = ev[0]*(dp - s_r[si]*sos*dw);
+	} else {
+		L[0] =  K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) )
+	}
+	if (ev[1]>0){
+		L[1] = ev[1]*(sos*sos*dr - dp);
+	} else {
+		L[1] = 0;
+	}
+	if (ev[2]>0){
+		L[2] = ev[2]*(dv);
+	} else {
+		L[2] = 0;
+	}
+	if (ev[3]>0){
+		L[3] = ev[3]*(du);
+	} else {
+		L[3] = 0;
+	}
+	if (ev[4]>0){
+		L[4] = ev[4]*(dp + s_r[si]*sos*dw);
+	} else {
+		L[4] = 0;
+	}
+
+	myprec d[5];
+
+	d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ;
+	d[1] = 0.5*( L[4] + L[0] ) ;
+	d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ;
+	d[3] = L[2] ;
+	d[4] = L[3] ;
+
+	*rZtmp   = - d[0];
+	*uZtmp  += - s_u[si]*d[0] - s_r[si]*d[4];
+	*vZtmp  += - s_v[si]*d[0] - s_r[si]*d[3];
+	*wZtmp  += - s_w[si]*d[0] - s_r[si]*d[2];
+	*eZtmp  += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2]  )     ;
+
+}
+
+
+#endif
diff --git a/src/temporary.cu b/src/temporary.cu
new file mode 100644
index 0000000..acada91
--- /dev/null
+++ b/src/temporary.cu
@@ -0,0 +1,608 @@
+#include "globals.h"
+#include "cuda_functions.h"
+#include "cuda_math.h"
+#include "sponge.h"
+#include "boundary_condition_x.h"
+#include "boundary_condition_y.h"
+#include "boundary_condition_z.h"
+
+__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz,
+		myprec *dil, myprec *dpdz, int iNum) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	id.mkidX();
+
+	int si = id.tix + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tiy;                   // local j for shared memory access
+
+	myprec rXtmp=0;
+	myprec uXtmp=0;
+	myprec vXtmp=0;
+	myprec wXtmp=0;
+	myprec eXtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+	myprec wrk3=0;
+
+        
+	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_t[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_p[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_prop1[sPencils][mx+stencilSize*2];
+	__shared__ myprec s_prop2[sPencils][mx+stencilSize*2];
+
+	s_u[sj][si] = u[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+	s_t[sj][si] = t[id.g];
+	s_p[sj][si] = p[id.g];
+	s_prop1[sj][si] = mu[id.g];
+	s_prop2[sj][si] = lam[id.g];
+	__syncthreads();
+
+	// Boundary conditions in the x-direction are in boundary_condition_x.h
+	// these are the BCs for u,v,w,p,t,mu,lambda
+	BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	derDevSharedV1x(&wrk1,s_u[sj],si);
+	derDevSharedV1x(&wrk2,s_v[sj],si);
+	derDevSharedV1x(&wrk3,s_w[sj],si);
+	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+	uXtmp = ( 2 * wrk1 - 2./3.*dil[id.g] );
+	vXtmp = (     wrk2 + dudy[id.g]  );
+	wXtmp = (     wrk3 + dudz[id.g]  );
+
+	//adding the viscous dissipation part duidx*mu*six
+	eXtmp = s_prop1[sj][si]*(uXtmp*wrk1 + vXtmp*wrk2 + wXtmp*wrk3);
+
+	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
+	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
+    uXtmp *= wrk2;
+	vXtmp *= wrk2;
+	wXtmp *= wrk2;
+
+	// viscous fluxes derivative mu*d^2ui dx^2
+	derDevSharedV2x(&wrk1,s_u[sj],si);
+	uXtmp = uXtmp + wrk1*s_prop1[sj][si];
+	derDevSharedV2x(&wrk2,s_v[sj],si);
+	vXtmp = vXtmp + wrk2*s_prop1[sj][si];
+	derDevSharedV2x(&wrk3,s_w[sj],si);
+	wXtmp = wXtmp + wrk3*s_prop1[sj][si];
+
+	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
+	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
+
+	//adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx)
+	derDevSharedV2x(&wrk1,s_t[sj],si);
+	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
+	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
+	derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx
+	eXtmp = eXtmp + wrk2*wrk3;
+
+	// pressure and dilation derivatives
+	s_prop2[sj][si] = dil[id.g];
+	__syncthreads();
+
+	// these are the BCs for the dilatation
+	BCxNumber2(s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	derDevSharedV1x(&wrk2,s_prop2[sj],si);
+	derDevShared1x(&wrk1 ,s_p[sj],si);
+	uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0     - wrk1 ;
+	eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si];
+
+	//Adding here the terms - d (ru phi) dx;
+	s_prop1[sj][si] = r[id.g];
+	s_prop2[sj][si] = h[id.g];
+	__syncthreads();
+
+	// fill in periodic images in shared memory array
+	// these are the BCs for rho (prop1) and enthalpy (prop2)
+	BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx);
+	__syncthreads();
+
+	fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si);
+	rXtmp = wrk3;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
+	uXtmp = uXtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si);
+	vXtmp = vXtmp + wrk2;
+	__syncthreads();
+	fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si);
+	wXtmp = wXtmp + wrk3;
+	__syncthreads();
+	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
+	eXtmp = eXtmp + wrk1;
+	__syncthreads();
+
+	rX[id.g] = rXtmp;
+	uX[id.g] = uXtmp;
+	vX[id.g] = vXtmp;
+	wX[id.g] = wXtmp;
+	eX[id.g] = eXtmp;
+}
+
+__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY,
+		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+		myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz,
+		myprec *dil, myprec *dpdz) {
+
+	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+	int jNum = blockIdx.z;
+	id.mkidYFlx(jNum);
+
+	int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+	int sj = id.tix;                   // local j for shared memory access
+
+	myprec rYtmp=0;
+	myprec uYtmp=0;
+	myprec vYtmp=0;
+	myprec wYtmp=0;
+	myprec eYtmp=0;
+
+	myprec wrk1=0;
+	myprec wrk2=0;
+	myprec wrk3=0;
+
+	__shared__ myprec s_u[lPencils][my/nDivY+stencilSize*2];
+	__shared__ myprec s_v[lPencils][my/nDivY+stencilSize*2];
+	__shared__ myprec s_w[lPencils][my/nDivY+stencilSize*2];
+	__shared__ myprec s_prop[lPencils][my/nDivY+stencilSize*2];
+	__shared__ myprec s_dil[lPencils][my/nDivY+stencilSize*2];
+
+	s_u[sj][si] = u[id.g];
+	s_v[sj][si] = v[id.g];
+	s_w[sj][si] = w[id.g];
+	s_prop[sj][si] = mu[id.g];
+	s_dil[sj][si] = dil[id.g];
+	__syncthreads();
+
+	// fill in periodic images in shared memory array
+	// these are boundary conditions for u,v,w,mu and dilatation
+	BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum);
+	__syncthreads();
+
+	derDevSharedV1y(&wrk1,s_u[sj],si); 
+	derDevSharedV1y(&wrk2,s_v[sj],si); 
+	derDevSharedV1y(&wrk3,s_w[sj],si); 
+	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+	uYtmp = (    wrk1 + dvdx[id.g]        );
+	vYtmp = (2 * wrk2 - 2./3.*s_dil[sj][si] );
+	wYtmp = (    wrk3 + dvdz[id.g]        );
+
+	//adding the viscous dissipation part duidy*mu*siy
+	eYtmp = s_prop[sj][si]*(uYtmp*wrk1 + vYtmp*wrk2 + wYtmp*wrk3);
+
+	//Adding here the terms d (mu) dy * syj;
+	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy
+        uYtmp *= wrk2;
+	vYtmp *= wrk2;
+	wYtmp *= wrk2;
+
+	// viscous fluxes derivative
+	derDevSharedV2y(&wrk1,s_u[sj],si);
+	uYtmp = uYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV2y(&wrk1,s_v[sj],si);
+	vYtmp = vYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV2y(&wrk1,s_w[sj],si);
+	wYtmp = wYtmp + wrk1*s_prop[sj][si];
+
+	//adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy)
+	eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp;
+
+	//dilation derivatives
+	derDevSharedV1y(&wrk2,s_dil[sj],si);
+	vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0;
+	eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si];
+
+	// pressure derivatives
+	s_dil[sj][si] = p[id.g];
+	__syncthreads();
+
+	// Boundary condition for pressure
+	BCyNumber2(s_dil[sj],p,id,si,my, jNum);
+	__syncthreads();
+
+	derDevShared1y(&wrk1,s_dil[sj],si);
+	vYtmp = vYtmp - wrk1;
+
+	// fourier terms
+	s_prop[sj][si] = lam[id.g];
+	s_dil[sj][si]  = t[id.g];
+	__syncthreads();
+
+	// Boundary condition for temperature and thermal conductivity
+	BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum);
+	__syncthreads();
+
+	derDevSharedV2y(&wrk1,s_dil[sj],si);
+	eYtmp = eYtmp + wrk1*s_prop[sj][si];
+	derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy
+	derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy
+	eYtmp = eYtmp + wrk1*wrk2;
+
+	//Adding here the terms - d (ru phi) dy;
+	s_prop[sj][si] = r[id.g];
+	s_dil[sj][si]  = h[id.g];
+	__syncthreads();
+	// Boundary condition for density and enthalpy
+	BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum);
+	__syncthreads();
+	fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si);
+	rYtmp = wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si);
+	uYtmp = uYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si);
+	vYtmp = vYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si);
+	wYtmp = wYtmp + wrk1;
+	__syncthreads();
+	fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si);
+	eYtmp = eYtmp + wrk1;
+	__syncthreads();
+
+#if useStreams
+	rY[id.g] = rYtmp;
+	uY[id.g] = uYtmp;
+	vY[id.g] = vYtmp;
+	wY[id.g] = wYtmp;
+	eY[id.g] = eYtmp;
+#else
+	rY[id.g] += rYtmp;
+	uY[id.g] += uYtmp;
+	vY[id.g] += vYtmp;
+	wY[id.g] += wYtmp;
+	eY[id.g] += eYtmp;
+#endif
+	__syncthreads();
+}
+
+__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ,
+        myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+        myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+        myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz,
+        myprec *dil, myprec *dpdz) {
+
+    Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+    int kNum = blockIdx.z;
+    id.mkidZFlx(kNum);
+       
+    int tid = id.tix + id.bdx*id.tiy;
+
+    int si = id.tiy + stencilSize;       // local i for shared memory access + halo offset
+    int sj = id.tix;                   // local j for shared memory access
+    int si1 = tid%id.bdy +  stencilSize;       // local i for shared memory access + halo offset
+    int sj1 = tid/id.bdy;
+
+    Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
+    id1.mkidZFlx(kNum);
+
+    myprec rZtmp=0;
+    myprec uZtmp=0;
+    myprec vZtmp=0;
+    myprec wZtmp=0;
+    myprec eZtmp=0;
+
+    myprec wrk1=0;
+    myprec wrk2=0;
+    myprec wrk3=0;
+
+        //myprec tmpreg1[stencilSize*2+1];
+        //myprec tmpreg2[stencilSize*2+1];
+        //myprec tmpreg3[stencilSize*2+1];
+
+    __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2];
+    __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    s_u[sj][si] = u[id.g];
+    s_v[sj][si] = v[id.g];
+    s_w[sj][si] = w[id.g];
+    s_prop[sj][si] = mu[id.g];
+    s_dil[sj][si] = dil[id.g];
+    __syncthreads();
+
+    // fill in periodic images in shared memory array
+    // these are boundary conditions for u,v,w,mu and dilatation
+
+   
+        BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum);
+    __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+////BY COMPUTATION THREAD ARRANGEMENT
+    derDevSharedV1z(&wrk1,s_u[sj1],si1);
+    derDevSharedV1z(&wrk2,s_v[sj1],si1);
+    derDevSharedV1z(&wrk3,s_w[sj1],si1);
+    //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+
+// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT.
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        s_tmp[sj][si] = dwdx[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    uZtmp = (    wrk1 + s_tmp[sj1][si1]        );
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        __syncthreads();
+        s_tmp[sj][si] = dwdy[id.g];
+        __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    vZtmp = (    wrk2 + s_tmp[sj1][si1]        );
+    wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] );
+
+    //adding the viscous dissipation part duidz*mu*siz
+    eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3);
+
+    //Adding here the terms d (mu) dz * szj;
+    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz
+    uZtmp *= wrk2;
+    vZtmp *= wrk2;
+    wZtmp *= wrk2;
+
+    //viscous fluxes derivative
+    derDevSharedV2z(&wrk1,s_u[sj1],si1);
+    uZtmp = uZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV2z(&wrk1,s_v[sj1],si1);
+    vZtmp = vZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV2z(&wrk1,s_w[sj1],si1);
+    wZtmp = wZtmp + wrk1*s_prop[sj1][si1];
+
+    //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz)
+    eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp;
+
+    //dilation derivatives
+    derDevSharedV1z(&wrk2,s_dil[sj1],si1);
+    wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0;
+    eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1];
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+//SHARED MEM LOADING OPERATION. - ID
+
+    // pressure derivatives
+    __syncthreads();
+    s_dil[sj][si] = p[id.g];
+    __syncthreads();
+    BCzNumber2(s_dil[sj],p,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    derDevShared1z(&wrk1,s_dil[sj1],si1);
+    wZtmp = wZtmp - wrk1;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    // fourier terms
+    __syncthreads();   
+        s_prop[sj][si] = lam[id.g];
+    s_dil[sj][si]  = t[id.g];
+    __syncthreads();
+    // Boundary conditions for thermal conductivity and temperature
+    BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+
+    derDevSharedV2z(&wrk1,s_dil[sj1],si1);
+    eZtmp = eZtmp + wrk1*s_prop[sj1][si1];
+    derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz
+    derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz
+    eZtmp = eZtmp + wrk1*wrk2;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+    //Adding here the terms - d (ru phi) dz;
+    s_prop[sj][si] = r[id.g];
+    s_dil[sj][si]  = h[id.g];
+    __syncthreads();
+    // Boundary conditions for denisty and enthalpy
+    BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum);
+    __syncthreads();
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP
+    fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1);
+    rZtmp = wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1);
+    uZtmp = uZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1);
+    vZtmp = vZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1);
+    wZtmp = wZtmp + wrk1;
+    __syncthreads();
+    fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1);
+    eZtmp = eZtmp + wrk1;
+    __syncthreads();
+//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE.      
+    s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ;
+    __syncthreads();
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM
+        
+    rZ[id.g] += s_prop[sj][si];
+    uZ[id.g] += s_u[sj][si];
+    vZ[id.g] += s_v[sj][si];
+    wZ[id.g] += s_tmp[sj][si] + *dpdz;
+    eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si];
+
+
+//#if useStreams
+//	rZ[id.g] = rZtmp;
+//	uZ[id.g] = uZtmp;
+//	vZ[id.g] = vZtmp;
+//	wZ[id.g] = wZtmp + *dpdz;
+//	eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ;
+//#else
+
+
+
+
+
+//#endif
+	__syncthreads();
+}
+
+//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX,
+//		myprec *r,  myprec *u,  myprec *v,  myprec *w,  myprec *h ,
+//		myprec *t,  myprec *p,  myprec *mu, myprec *lam,
+//		myprec *dil, myprec *dpdz) {
+//
+//	Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y);
+//	id.mkidX();
+//
+//	int si = id.i + stencilSize;       // local i for shared memory access + halo offset
+//	int sj = id.tiy;                   // local j for shared memory access
+//
+//	myprec rXtmp=0;
+//	myprec uXtmp=0;
+//	myprec vXtmp=0;
+//	myprec wXtmp=0;
+//	myprec eXtmp=0;
+//
+//	myprec wrk1=0;
+//	myprec wrk2=0;
+//
+//	__shared__ myprec s_u[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_v[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_w[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_t[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_p[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_prop1[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_prop2[sPencils][mx+stencilSize*2];
+//#if !periodicX
+//	__shared__ myprec s_s0[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_s4[sPencils][mx+stencilSize*2];
+//	__shared__ myprec s_s8[sPencils][mx+stencilSize*2];
+//#endif
+//	__shared__ myprec s_dil[sPencils][mx+stencilSize*2];
+//
+//	s_u[sj][si] = u[id.g];
+//	s_v[sj][si] = v[id.g];
+//	s_w[sj][si] = w[id.g];
+//	s_t[sj][si] = t[id.g];
+//	s_p[sj][si] = p[id.g];
+//	s_prop1[sj][si] = mu[id.g];
+//	s_prop2[sj][si] = lam[id.g];
+//#if !periodicX
+//	s_s0[sj][si]= gij[0][id.g];
+//	s_s4[sj][si]= gij[4][id.g];
+//	s_s8[sj][si]= gij[8][id.g];
+//#endif
+//	s_dil[sj][si] = dil[id.g];
+//	__syncthreads();
+//
+//	// fill in periodic images in shared memory array
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si);
+//		perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si);
+//		perBCx(s_prop2[sj],si);
+//#else
+//		wallBCxMir(s_p[sj],si);
+//		wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si);
+//		wallBCxExt(s_t[sj],si,TwallTop,TwallBot);
+//		mlBoundPT(s_prop1[sj], s_prop2[sj],  s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si);
+//		wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si);  wallBCxVel(s_s8[sj],si);
+//#endif
+//	}
+//
+//	__syncthreads();
+//
+//	//initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms
+//	uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] );
+//	vXtmp = (     gij[1][id.g] + gij[3][id.g]  );
+//	wXtmp = (     gij[2][id.g] + gij[6][id.g]  );
+//
+//	//adding the viscous dissipation part duidx*mu*six
+//	eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]);
+//
+//	//Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse);
+//	derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx
+//    uXtmp *= wrk2;
+//	vXtmp *= wrk2;
+//	wXtmp *= wrk2;
+//
+//	// viscous fluxes derivative mu*d^2ui dx^2
+//	derDevSharedV2x(&wrk1,s_u[sj],si);
+//	uXtmp = uXtmp + wrk1*s_prop1[sj][si];
+//	derDevSharedV2x(&wrk1,s_v[sj],si);
+//	vXtmp = vXtmp + wrk1*s_prop1[sj][si];
+//	derDevSharedV2x(&wrk1,s_w[sj],si);
+//	wXtmp = wXtmp + wrk1*s_prop1[sj][si];
+//
+//	//adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six)
+//	eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp;
+//
+//	//adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx)
+//	derDevSharedV2x(&wrk1,s_t[sj],si);
+//	eXtmp = eXtmp + wrk1*s_prop2[sj][si];
+//	derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx
+//	derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx
+//	eXtmp = eXtmp + wrk1*wrk2;
+//
+//	// pressure and dilation derivatives
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_dil[sj],si);
+//#else
+//		wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si);
+//#endif
+//	}
+//	__syncthreads();
+//
+//	derDevSharedV1x(&wrk2,s_dil[sj],si);
+//	derDevShared1x(&wrk1 ,s_p[sj],si);
+//	uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0     - wrk1 ;
+//	eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si];
+//
+//	//Adding here the terms - d (ru phi) dx;
+//	s_prop1[sj][si] = r[id.g];
+//	s_prop2[sj][si] = h[id.g];
+//	__syncthreads();
+//	// fill in periodic images in shared memory array
+//	if (id.i < stencilSize) {
+//#if periodicX
+//		perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si);
+//#else
+//		rhBoundPT(s_prop1[sj], s_prop2[sj],  s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si);
+//#endif
+//	}
+//
+//	fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si);
+//	rXtmp = wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si);
+//	uXtmp = uXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si);
+//	vXtmp = vXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si);
+//	wXtmp = wXtmp + wrk1;
+//	__syncthreads();
+//	fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si);
+//	eXtmp = eXtmp + wrk1;
+//	__syncthreads();
+//
+//	rX[id.g] = rXtmp;
+//	uX[id.g] = uXtmp;
+//	vX[id.g] = vXtmp;
+//	wX[id.g] = wXtmp;
+//	eX[id.g] = eXtmp;
+//}