From cb9cd7aad3a20aa9b0e4b3c2d3cbb26ac3e2f760 Mon Sep 17 00:00:00 2001 From: Asif Manzoor Hasan Date: Wed, 1 Dec 2021 18:32:47 +0100 Subject: [PATCH 1/2] Optimization for channel --- .gitignore | 2 + Makefile | 6 +- src/boundary.h | 22 ++ src/boundary_condition_y.h | 107 ++++++-- src/boundary_condition_z.h | 8 +- src/calc_stress.cu | 120 ++++++++- src/cuda_derivs.h | 66 ++--- src/cuda_functions.h | 23 +- src/cuda_main.cu | 203 ++++++++++---- src/cuda_main.h | 13 + src/cuda_rhs.cu | 526 +++++++++++++++++++++++-------------- src/cuda_utils.cu | 24 +- src/globals.h | 40 +-- src/main.h | 2 +- 14 files changed, 808 insertions(+), 354 deletions(-) diff --git a/.gitignore b/.gitignore index 7f02fbe..4fd798e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.ncu-rep *.bin *.txt *.o @@ -6,3 +7,4 @@ logfile *.xmf *.mp4 ns +tmp diff --git a/Makefile b/Makefile index 6485a4b..f5a4d34 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ GEN = /usr #MPI = /usr/local/openmpi-4.1.1 -CUDA = /usr/local/cuda-11.2 +CUDA = /usr/local/cuda-11.4 GPU_ARCHITECTURE = 70 @@ -38,6 +38,7 @@ INC = -I$(CUDA)/include -I$(GEN)/include $(GLB) #INC += -I$(MPI)/include LIB = -L$(CUDA)/lib64 -L$(GEN)/lib -lc -lstdc++ -lcuda -lcudart -lcudadevrt #LIB += -L$(MPI)/lib +#NVCC = $(CUDA)/bin/nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # NVCC = nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # MAXREG = # --maxrregcount=82 --ptxas-options=-v ifeq ($(DBG),) @@ -104,3 +105,6 @@ $(OBJ)cuda_link.o: $(OBJ_CUDA) clean: rm -rf $(TARGET) $(OBJ)*.o + + + diff --git a/src/boundary.h b/src/boundary.h index 3fb8755..8c9e4dc 100644 --- a/src/boundary.h +++ b/src/boundary.h @@ -12,7 +12,9 @@ extern __device__ __forceinline__ void perBCx(myprec *s_f, int si); extern __device__ __forceinline__ void perBCy(myprec *s_f, int si); extern __device__ __forceinline__ void perBCz(myprec *s_f, int si); extern __device__ __forceinline__ void perBCzBot(myprec *s_f, myprec *f, int si, Indices id); +extern __device__ __forceinline__ void perBCyBot(myprec *s_f, myprec *f, int si, Indices id); extern __device__ __forceinline__ void perBCzTop(myprec *s_f, myprec *f, int si, Indices id); +extern __device__ __forceinline__ void perBCyTop(myprec *s_f, myprec *f, int si, Indices id); extern __device__ __forceinline__ void haloBCy(myprec *s_f, myprec *f, int si, Indices id); extern __device__ __forceinline__ void haloBCz(myprec *s_f, myprec *f, int si, Indices id); extern __device__ __forceinline__ void haloBCzBot(myprec *s_f, myprec *f, int si, Indices id); @@ -33,7 +35,9 @@ extern __device__ __forceinline__ void botBCxExt(myprec *s_f, int si, myprec Bcb extern __device__ __forceinline__ void topBCzExt(myprec *s_f, int si); extern __device__ __forceinline__ void botBCzExt(myprec *s_f, int si); extern __device__ __forceinline__ void botBCzCpy(myprec *s_f, myprec *f, int si, Indices id); +extern __device__ __forceinline__ void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id); extern __device__ __forceinline__ void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id); +extern __device__ __forceinline__ void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id); __device__ __forceinline__ __attribute__((always_inline)) void perBCx(myprec *s_f, int si) { s_f[si-stencilSize] = s_f[si+mx-stencilSize]; @@ -54,10 +58,20 @@ __device__ __forceinline__ __attribute__((always_inline)) void perBCzBot(myprec s_f[si-stencilSize] = f[idx(id.i,id.j,id.k+mz-stencilSize)]; } +__device__ __forceinline__ __attribute__((always_inline)) void perBCyBot(myprec *s_f, myprec *f, int si, Indices id) { + s_f[si-stencilSize] = f[idx(id.i,id.j+my-stencilSize,id.k)]; +} + + __device__ __forceinline__ __attribute__((always_inline)) void perBCzTop(myprec *s_f, myprec *f, int si, Indices id) { s_f[si+mz/nDivZ] = f[idx(id.i,id.j,si-stencilSize)]; } +__device__ __forceinline__ __attribute__((always_inline)) void perBCyTop(myprec *s_f, myprec *f, int si, Indices id) { + s_f[si+my/nDivY] = f[idx(id.i,si-stencilSize, id.k)]; +} + + __device__ __forceinline__ __attribute__((always_inline)) void haloBCy(myprec *s_f, myprec *f, int si, Indices id) { s_f[si-stencilSize] = f[mx*my*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize]; s_f[si+my] = f[mx*my*mz + stencilSize*mx*mz + id.j + id.i*stencilSize + id.k*mx*stencilSize]; @@ -183,9 +197,17 @@ __device__ __forceinline__ __attribute__((always_inline)) void botBCzCpy(myprec s_f[si-stencilSize] = f[idx(id.i,id.j,id.k-stencilSize)]; } +__device__ __forceinline__ __attribute__((always_inline)) void botBCyCpy(myprec *s_f, myprec *f, int si, Indices id) { +s_f[si-stencilSize] = f[idx(id.i,id.j-stencilSize,id.k)]; +} + __device__ __forceinline__ __attribute__((always_inline)) void topBCzCpy(myprec *s_f, myprec *f, int si, Indices id) { s_f[si+mz/nDivZ] = f[idx(id.i,id.j,id.k+mz/nDivZ)]; } +__device__ __forceinline__ __attribute__((always_inline)) void topBCyCpy(myprec *s_f, myprec *f, int si, Indices id) { +s_f[si+my/nDivY] = f[idx(id.i,id.j+my/nDivY,id.k)]; +} + #endif /* BOUNDARY_H_ */ diff --git a/src/boundary_condition_y.h b/src/boundary_condition_y.h index 419b18c..cade0f3 100644 --- a/src/boundary_condition_y.h +++ b/src/boundary_condition_y.h @@ -8,52 +8,125 @@ extern __device__ __forceinline__ void BCyNumber1(myprec *s_u, myprec *s_v, mypr myprec *s_m, myprec *s_dil, myprec *u, myprec *v, myprec *w, myprec *m, myprec *dil, - Indices id, int si, int n); + Indices id, int si, int n, int jNum); -extern __device__ __forceinline__ void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m); +extern __device__ __forceinline__ void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int m, int jNum); extern __device__ __forceinline__ void BCyNumber3(myprec *s_f, myprec *s_g, myprec *f, myprec *g, - Indices id, int si, int n); + Indices id, int si, int n, int jNum); __device__ __forceinline__ __attribute__((always_inline)) void BCyNumber1(myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_m, myprec *s_dil, myprec *u, myprec *v, myprec *w, myprec *m, myprec *dil, - Indices id, int si, int n) { - if (id.j < stencilSize) { - if(multiGPU) { + Indices id, int si, int n, int jNum) { + if (id.tiy < stencilSize) { + if(nDivY==1) { + if(multiGPU) { haloBCy(s_u,u,si,id); haloBCy(s_v,v,si,id); haloBCy(s_w,w,si,id); haloBCy(s_m,m,si,id); haloBCy(s_dil,dil,si,id); - } else { + } else { perBCy(s_u,si); perBCy(s_v,si); perBCy(s_w,si); perBCy(s_m,si); perBCy(s_dil,si); } + } + else { + if(jNum==0) { + topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id); + topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id); + + perBCyBot(s_u,u,si,id); + perBCyBot(s_v,v,si,id); + perBCyBot(s_w,w,si,id); + perBCyBot(s_m,m,si,id); + perBCyBot(s_dil,dil,si,id); + + + } else if (jNum==nDivY-1) { + botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id); + botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id); + + perBCyTop(s_u,u,si,id); + perBCyTop(s_v,v,si,id); + perBCyTop(s_w,w,si,id); + perBCyTop(s_m,m,si,id); + perBCyTop(s_dil,dil,si,id); + + } else { + topBCyCpy(s_u,u,si,id); topBCyCpy(s_v,v,si,id); topBCyCpy(s_w,w,si,id); + topBCyCpy(s_m,m,si,id); topBCyCpy(s_dil,dil,si,id); + botBCyCpy(s_u,u,si,id); botBCyCpy(s_v,v,si,id); botBCyCpy(s_w,w,si,id); + botBCyCpy(s_m,m,si,id); botBCyCpy(s_dil,dil,si,id); + } + } } + + + + __syncthreads(); } -__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n) { - if (id.j < stencilSize) { - if(multiGPU) { +__device__ __forceinline__ __attribute__((always_inline)) void BCyNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int jNum) { + if (id.tiy < stencilSize) { + if(nDivY==1) { + if(multiGPU) { haloBCy(s_p,p,si,id); - } else { + } else { perBCy(s_p,si); } + } + else { + if(jNum==0) { + topBCyCpy(s_p,p,si,id); + perBCyBot(s_p,p,si,id); + + + } else if (jNum==nDivY-1) { + botBCyCpy(s_p,p,si,id); + perBCyTop(s_p,p,si,id); + + } else { + topBCyCpy(s_p,p,si,id); botBCyCpy(s_p,p,si,id); + } + } } + + + + __syncthreads(); + } __device__ __forceinline__ __attribute__((always_inline)) void BCyNumber3(myprec *s_f, myprec *s_g, myprec *f, myprec *g, - Indices id, int si, int n) { - if (id.j < stencilSize) { - if(multiGPU) { - haloBCy(s_f,f,si,id); haloBCy(s_g,g,si,id); - } else { - perBCy(s_f,si); perBCy(s_g,si); + Indices id, int si, int n, int jNum) { + if (id.tiy < stencilSize) { + if(nDivY==1) { + if(multiGPU) { + haloBCy(s_f,f,si,id); haloBCy(s_g,g,si,id); + } else { + perBCy(s_f,si); perBCy(s_g,si); + } + } + else { + if(jNum==0) { + topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); perBCyBot(s_f,f,si,id); + perBCyBot(s_g,g,si,id); + + + } else if (jNum==nDivY-1) { + botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id); perBCyTop(s_f,f,si,id); + perBCyTop(s_g,g,si,id); + + } else { + topBCyCpy(s_f,f,si,id); topBCyCpy(s_g,g,si,id); botBCyCpy(s_f,f,si,id); botBCyCpy(s_g,g,si,id); + } } } + __syncthreads(); } diff --git a/src/boundary_condition_z.h b/src/boundary_condition_z.h index 75cff94..11f0769 100644 --- a/src/boundary_condition_z.h +++ b/src/boundary_condition_z.h @@ -83,7 +83,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber1(myprec myprec *m, myprec *dil, Indices id, int si, int n, int kNum) { - if (id.tix < stencilSize) { + if (id.tiy < stencilSize) { if(nDivZ==1) { if(multiGPU) { haloBCz(s_u,u,si,id); haloBCz(s_v,v,si,id); haloBCz(s_w,w,si,id); @@ -164,7 +164,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber1(myprec } __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber2(myprec *s_p, myprec *p, Indices id, int si, int n, int kNum) { - if (id.tix < stencilSize) { + if (id.tiy < stencilSize) { if(nDivZ==1) { if(multiGPU) { haloBCz(s_p,p,si,id); @@ -212,7 +212,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber2(myprec __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber3(myprec *s_l, myprec *s_t, myprec *l, myprec *t, Indices id, int si, int n, int kNum) { - if (id.tix < stencilSize) { + if (id.tiy < stencilSize) { if(nDivZ==1) { if(multiGPU) { haloBCz(s_l,l,si,id); haloBCz(s_t,t,si,id); @@ -273,7 +273,7 @@ __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber3(myprec __device__ __forceinline__ __attribute__((always_inline)) void BCzNumber4(myprec *s_r, myprec *s_h, myprec *r, myprec *h, Indices id, int si, int n, int kNum) { - if (id.tix < stencilSize) { + if (id.tiy < stencilSize) { if(nDivZ==1) { if(multiGPU) { haloBCz(s_h,h,si,id); haloBCz(s_r,r,si,id); diff --git a/src/calc_stress.cu b/src/calc_stress.cu index 9cf9a49..5fcff95 100644 --- a/src/calc_stress.cu +++ b/src/calc_stress.cu @@ -6,6 +6,8 @@ #include "cuda_globals.h" #include "cuda_math.h" #include "boundary_condition_x.h" +#include "boundary_condition_y.h" +#include "boundary_condition_z.h" #include "comm.h" #include "sponge.h" @@ -14,7 +16,7 @@ __global__ void deviceAdvanceTime(myprec *dt) { } __global__ void deviceCalcPress(myprec *a, myprec *b, myprec *c) { - *a = 0.99*(*b) - 0.5*(*a/(*c)-1); + *a = 0.99*(*b) - 0.5*( *a/(*c) - 1 ); //*a/(*c)-1 //a here is rw_bulk // changed from rw_bulk/r_bulk to rw_bulk } __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *dvdx, myprec *dwdx) { @@ -46,20 +48,105 @@ __global__ void derVelX(myprec *u, myprec *v, myprec *w, myprec *dudx, myprec *d __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy) { - Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - - derDevV1yL(dudy,u,id); + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int jNum = blockIdx.z; + + id.mkidYFlx(jNum); + + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidYFlx(jNum); + + myprec wrk1=0; + myprec wrk2=0; + //myprec wrk3=0; + + __shared__ myprec s_u[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][my/nDivY+stencilSize*2]; + //__shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2]; + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + //s_w[sj][si] = w[id.g]; + __syncthreads(); + BCyNumber2(s_u[sj],u,id,si,my,jNum); + BCyNumber2(s_v[sj],v,id,si,my,jNum); + //BCyNumber2(s_w[sj],w,id,si,my,jNum); + __syncthreads(); +// COMP Tile sj1 and si1 + derDevSharedV1y(&wrk1,s_u[sj1],si1); + derDevSharedV1y(&wrk2,s_v[sj1],si1); + //derDevSharedV1y(&wrk3,s_w[sj1],si1); + __syncthreads(); + s_u[sj1][si1] = wrk1; + s_v[sj1][si1] = wrk2; + // s_w[sj1][si1] = wrk3; + __syncthreads(); +//MEM tile sj si + dudy[id.g] = s_u[sj][si]; + dvdy[id.g] = s_v[sj][si]; + //dwdy[id.g] = s_w[sj][si]; + __syncthreads(); + +/* derDevV1yL(dudy,u,id); derDevV1yL(dvdy,v,id); - derDevV1yL(dwdy,w,id); + //derDevV1yL(dwdy,w,id);*/ } -__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int kNum) { - - Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - - derDevV1zL(dudz,u,uInit,id,kNum); +__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; + + id.mkidZFlx(kNum); + + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + __syncthreads(); + BCzNumber2(s_u[sj],u,id,si,mz,kNum); + BCzNumber2(s_v[sj],v,id,si,mz,kNum); + BCzNumber2(s_w[sj],w,id,si,mz,kNum); + __syncthreads(); +// COMP Tile sj1 and si1 + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); + __syncthreads(); + s_u[sj1][si1] = wrk1; + s_v[sj1][si1] = wrk2; + s_w[sj1][si1] = wrk3; + __syncthreads(); +//MEM tile sj si + dudz[id.g] = s_u[sj][si]; + dvdz[id.g] = s_v[sj][si]; + dwdz[id.g] = s_w[sj][si]; + __syncthreads(); + /*derDevV1zL(dudz,u,uInit,id,kNum); derDevV1zL(dvdz,v,vInit,id,kNum); - derDevV1zL(dwdz,w,wInit,id,kNum); + derDevV1zL(dwdz,w,wInit,id,kNum);*/ } __global__ void derVelYBC(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy, int direction) { @@ -112,7 +199,7 @@ void calcPressureGrad(myprec *dpdz, myprec *r, myprec *w, Communicator rk) { deviceMul<<>>(workA,r,w); hostVolumeIntegral(dpdz,workA,rk); - deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk); + deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk); //I changed here from rbulk to workA. // i CHANGED TO 1 checkCuda( cudaFree(workA) ); checkCuda( cudaFree(dpdz_prev) ); @@ -159,15 +246,17 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v, } -void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Communicator rk) { +void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *dtC, myprec *dpdz, int file, int istep, Communicator rk) { cudaSetDevice(rk.nodeRank); myprec *workA, *rbulk; myprec *hostWork = (myprec*)malloc(sizeof(myprec)); + dim3 gr0 = dim3(my / sPencils, mz, 1); dim3 bl0 = dim3(mx, sPencils, 1); + checkCuda( cudaMalloc((void**)&workA,mx*my*mz*sizeof(myprec)) ); checkCuda( cudaMalloc((void**)&rbulk , sizeof(myprec)) ); @@ -176,6 +265,8 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre checkCuda( cudaMemcpy(hostWork, rbulk, sizeof(myprec), cudaMemcpyDeviceToHost) ); allReduceSum(hostWork,1); checkCuda( cudaMemcpy(rbulk, hostWork, sizeof(myprec), cudaMemcpyHostToDevice) ); + if(rk.rank==0) printf("step number %d with %le %le %3.10le\n",nsteps*(file-1) + istep ,*dtC,*dpdz, *hostWork); + deviceMul<<>>(workA,r,w); hostVolumeIntegral(par1,workA,rk); @@ -198,4 +289,7 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre free(hostWork); checkCuda( cudaFree(workA) ); checkCuda( cudaFree(rbulk) ); + + + } diff --git a/src/cuda_derivs.h b/src/cuda_derivs.h index fe8670f..ad8040f 100644 --- a/src/cuda_derivs.h +++ b/src/cuda_derivs.h @@ -35,12 +35,10 @@ __device__ __forceinline__ __attribute__((always_inline)) void fluxQuadSharedx(m flxm = 0.0; __syncthreads(); - for (int lt=1; lt>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,0); //here 0 means interior points - derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); + //derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); derVelY<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]); - for (int kNum=0; kNum>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],kNum); + //for (int kNum=0; kNum<1; kNum++) //CHANGED!!!!! + derVelZ<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]); if(multiGPU) { updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); cudaDeviceSynchronize(); calcState<<>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,1); //here 1 means halo points @@ -27,33 +28,34 @@ inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, mypr derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1); //here 1 means upper boundary (mz-index) } cudaDeviceSynchronize(); - calcDil<<>>(d_dil,gij[0],gij[4],gij[8]); + //calcDil<<>>(d_dil,gij[0],gij[4],gij[8]); cudaDeviceSynchronize(); if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) - deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],d_dil,dpdz,0); + deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz,0); if(multiGPU) updateHalo(d_dil,rk); cudaDeviceSynchronize(); - deviceRHSY<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz,0); - for (int kNum=0; kNum>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz,kNum); + deviceRHSY<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz); + cudaDeviceSynchronize(); + deviceRHSZ<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz); cudaDeviceSynchronize(); if(boundaryLayer) addSponge<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e); cudaDeviceSynchronize(); } -void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { - - cudaSetDevice(rk.nodeRank); - - myprec h_dt,h_dpdz; - - for (int istep = 0; istep < nsteps; istep++) { - if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); - if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); - if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); - deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); - +void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communicator rk) { + + cudaSetDevice(rk.nodeRank); + myprec h_dt,h_dpdz; + + for (int istep = 0; istep < nsteps; istep++) { + + if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); + if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); + if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); + deviceAdvanceTime<<<1,1>>>(dtC); + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk); + + deviceMul<<>>(d_uO,d_r,d_u); deviceMul<<>>(d_vO,d_r,d_v); deviceMul<<>>(d_wO,d_r,d_w); @@ -64,12 +66,15 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { //runge kutta step 1 calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - eulerSum<<>>(d_r,d_rO,d_rhsr1,dtC); - eulerSum<<>>(d_e,d_eO,d_rhse1,dtC); - cudaDeviceSynchronize(); - eulerSumR<<>>(d_u,d_uO,d_rhsu1,d_r,dtC); - eulerSumR<<>>(d_v,d_vO,d_rhsv1,d_r,dtC); - eulerSumR<<>>(d_w,d_wO,d_rhsw1,d_r,dtC); + //eulerSum<<>>(d_r,d_rO,d_rhsr1,dtC); + //eulerSum<<>>(d_e,d_eO,d_rhse1,dtC); + + eulerSumAll<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC); + //cudaDeviceSynchronize(); + //eulerSumR<<>>(d_u,d_uO,d_rhsu1,d_r,dtC); + //eulerSumR<<>>(d_v,d_vO,d_rhsv1,d_r,dtC); + //eulerSumR<<>>(d_w,d_wO,d_rhsw1,d_r,dtC); cudaDeviceSynchronize(); if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer @@ -80,12 +85,14 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { //runge kutta step 2 calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); - eulerSum3<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC); - eulerSum3<<>>(d_e,d_eO,d_rhse1,d_rhse2,dtC); - cudaDeviceSynchronize(); - eulerSum3R<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC); - eulerSum3R<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC); - eulerSum3R<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC); + //eulerSum3<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC); + //eulerSum3<<>>(d_e,d_eO,d_rhse1,d_rhse2,dtC); + eulerSumAll2<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC); + //cudaDeviceSynchronize(); + //eulerSum3R<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC); + //eulerSum3R<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC); + //eulerSum3R<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC); cudaDeviceSynchronize(); if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer @@ -96,12 +103,16 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { //runge kutta step 3 calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk); - rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); - rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); - cudaDeviceSynchronize(); - rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); - rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); - rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); +// rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); +// rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); + eulerSumAll3<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, + d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC); + +// cudaDeviceSynchronize(); +// rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); +// rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); +// rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); cudaDeviceSynchronize(); } } @@ -113,12 +124,11 @@ void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communica myprec h_dt,h_dpdz; for (int istep = 0; istep < nsteps; istep++) { - if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); +/* if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); - + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);*/ //Starting the Runge-Kutta Steps @@ -185,6 +195,55 @@ void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communica } } +__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1 + ,myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = (r0[id.g] + rhsr1[id.g]*(*dt)/2.0); + r[id.g] = tmp1; + u[id.g] = (u0[id.g] + rhsu1[id.g]*(*dt)/2.0)/tmp1; + v[id.g] = (v0[id.g] + rhsv1[id.g]*(*dt)/2.0)/tmp1; + w[id.g] = (w0[id.g] + rhsw1[id.g]*(*dt)/2.0)/tmp1; + e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0); + +} + + +__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = ( r0[id.g] + (2*rhsr2[id.g] - rhsr1[id.g])*(*dt) ); + r[id.g] = tmp1; + u[id.g] = ( u0[id.g] + (2*rhsu2[id.g] - rhsu1[id.g])*(*dt) )/tmp1; + v[id.g] = ( v0[id.g] + (2*rhsv2[id.g] - rhsv1[id.g])*(*dt) )/tmp1; + w[id.g] = ( w0[id.g] + (2*rhsw2[id.g] - rhsw1[id.g])*(*dt) )/tmp1; + e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) ); + +} + + +__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, + myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = r0[id.g] + (*dt)*( rhsr1[id.g] + 4*rhsr2[id.g] + rhsr3[id.g])/6.; + r[id.g] = tmp1; + u[id.g] = ( u0[id.g] + (*dt)*( rhsu1[id.g] + 4*rhsu2[id.g] + rhsu3[id.g])/6.)/tmp1; + v[id.g] = ( v0[id.g] + (*dt)*( rhsv1[id.g] + 4*rhsv2[id.g] + rhsv3[id.g])/6.)/tmp1; + w[id.g] = ( w0[id.g] + (*dt)*( rhsw1[id.g] + 4*rhsw2[id.g] + rhsw3[id.g])/6.)/tmp1; + e[id.g] = ( e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.); + +} + + __global__ void eulerSum(myprec *a, myprec *b, myprec *c, myprec *dt) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); a[id.g] = (b[id.g] + c[id.g]*(*dt)/2.0); @@ -217,7 +276,7 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec * __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc) { - int threadsPerBlock = blockDim.x * blockDim.y; +/* int threadsPerBlock = blockDim.x * blockDim.y; int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; int blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; @@ -237,6 +296,34 @@ __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec suth = pow(tem[gl],viscexp); mu[gl] = suth/Re; lam[gl] = suth/Re/Pr/Ec; + __syncthreads();*/ + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + int gl = id.g; + if(bc==1) gl += mx*my*mz; + + myprec cvInv = (gam - 1.0)/Rgas; + myprec r = rho[gl]; + myprec u = uvel[gl]; + myprec v = vvel[gl]; + myprec w = wvel[gl]; + myprec etot = ret[gl]; + + myprec invrho = 1.0/r; + myprec en = etot*invrho - 0.5*(u*u + v*v + w*w); + + myprec t = cvInv*en; + myprec p = r*Rgas*t; //tmp1 is temperature // tmp2 is pressure + tem[gl] = t; + pre[gl] = p; + + ht[gl] = (etot + p)*invrho; + + myprec suth = pow(t,viscexp); + mu[gl] = suth/Re; + lam[gl] = suth/Re/Pr/Ec; + __syncthreads(); } @@ -255,13 +342,13 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m mpiBarrier(); cudaMemcpy(dtC , h_dt , sizeof(myprec), cudaMemcpyHostToDevice); if(forcing) { - calcPressureGrad(dpdz,d_r,d_w,rk); + calcPressureGrad(dpdz,d_r,d_w,rk); //Changed here dpdz_new = dpdz_old - 0.5*(rw_bulk - 1 ) cudaMemcpy(h_dpdz, dpdz, sizeof(myprec), cudaMemcpyDeviceToHost); allReduceArray(h_dpdz,1); mpiBarrier(); cudaMemcpy(dpdz, h_dpdz, sizeof(myprec), cudaMemcpyHostToDevice); } - if(rk.rank==0) printf("step number %d with %le %le\n",istep,*h_dt,*h_dpdz); + //if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file) + istep ,*h_dt,*h_dpdz); } void solverWrapper(Communicator rk) { @@ -283,20 +370,36 @@ void solverWrapper(Communicator rk) { //check the memory usage of the GPU checkGpuMem(rk); + + /*derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); + derVelY<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]); + derVelZ<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]); + calcDil<<>>(d_dil,gij[0],gij[4],gij[8]);*/ // In Y and Z derivatives are turned off. so need to fix that or create a new kernel. + + + if(restartFile<0) { start=0; } else { start=restartFile; } + for(int file = start+1; file mx mod sPencils!=0\n"); } @@ -293,7 +293,7 @@ void sanityCheckThreadGrids(Communicator rk) { } mpiBarrier(); exit(1); - } + }*/ } void copyField(int direction, Communicator rk) { diff --git a/src/globals.h b/src/globals.h index b374a59..f50b4da 100644 --- a/src/globals.h +++ b/src/globals.h @@ -18,36 +18,36 @@ #define stencilSize 3 // the order is double the stencilSize (advective fluxes stencil) #define stencilVisc 2 // the order is double the stencilVisc (viscous fluxes stencil) -#define Lx (20.0) -#define Ly (7.0) -#define Lz (500.0) -#define mx_tot 240 -#define my_tot 64 -#define mz_tot 2048 -#define nsteps 1001 -#define nfiles 100 +#define Lx (2.0) +#define Ly (3.0) +#define Lz (10.0) +#define mx_tot 176 +#define my_tot 216 // 246 +#define mz_tot 432 +#define nsteps 2000 +#define nfiles 1 #define CFL 0.75f -#define restartFile -1 +#define restartFile -1 -#define lowStorage (true) -#define boundaryLayer (true) -#define perturbed (true) -#define forcing (false) +#define lowStorage (false) +#define boundaryLayer (false) +#define perturbed (false) +#define forcing (true) #define periodicX (false) #define nonUniformX (true) #define checkCFLcondition 100 #define checkBulk 100 -#define Re 1500.0 -#define Pr 0.75 -#define Ma 0.35 -#define viscexp 1.5 +#define Re 7500.0 +#define Pr 0.70 +#define Ma 0.7 +#define viscexp 0.75 #define gam 1.4 #define Ec ((gam - 1.f)*Ma*Ma) #define Rgas (1.f/(gam*Ma*Ma)) -const double stretch = 5.0; +const double stretch = 3.0; const myprec TwallTop = 1.0; const myprec TwallBot = 1.0; @@ -55,7 +55,9 @@ const myprec TwallBot = 1.0; #define my (my_tot/pRow) #define mz (mz_tot/pCol) -#define nDivZ (8) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nDivX (11) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nDivY (9) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nDivZ (18) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) #define idx(i,j,k) \ ({ ( k )*mx*my +( j )*mx + ( i ); }) diff --git a/src/main.h b/src/main.h index 5b6b35a..bbc6a06 100644 --- a/src/main.h +++ b/src/main.h @@ -32,7 +32,7 @@ void setGPUParameters(Communicator rk); void setDevice(int device); void copyField(int direction, Communicator rk); void checkGpuMem(Communicator rk); -void runSimulation(myprec *kin, myprec *enst, myprec *time, Communicator rk); +void runSimulation(int file, myprec *kin, myprec *enst, myprec *time, Communicator rk); void initSolver(Communicator rk); void clearSolver(Communicator rk); void calculateSponge(Communicator rk); From eccac7bc44a59dc0826e15968e8fabeacfd92b27 Mon Sep 17 00:00:00 2001 From: Asif Manzoor Hasan Date: Wed, 2 Feb 2022 17:22:12 +0100 Subject: [PATCH 2/2] Latest Version --- src/.cproject | 138 ++++ src/.project | 27 + src/.settings/language.settings.xml | 44 ++ src/backinit.cpp | 354 +++++++++ src/boundary.h | 271 +++++-- src/boundary_condition_x.h | 314 +++++--- src/boundary_condition_y.h | 153 ++-- src/boundary_condition_z.h | 488 +++++-------- src/calc_stress.cu | 154 +++- src/comm.cpp | 321 ++++++-- src/comm.h | 7 + src/cuda_derivs.cu | 1 - src/cuda_derivs.h | 82 ++- src/cuda_functions.h | 70 +- src/cuda_globals.h | 16 +- src/cuda_main.cu | 978 ++++++++++++++++++------- src/cuda_main.h | 29 + src/cuda_math.cu | 36 +- src/cuda_math.h | 3 + src/cuda_rhs.cu | 1046 ++++++++++++++++++++------- src/cuda_rhs_backup_0511.cu | 554 ++++++++++++++ src/cuda_utils.cu | 81 ++- src/globals.h | 86 ++- src/init.cpp | 225 ++++-- src/main.cpp | 5 +- src/main.h | 2 +- src/nrbcX.h | 235 ++++++ src/nrbcZ.h | 163 +++++ src/temporary.cu | 608 ++++++++++++++++ 29 files changed, 5203 insertions(+), 1288 deletions(-) create mode 100644 src/.cproject create mode 100644 src/.project create mode 100644 src/.settings/language.settings.xml create mode 100644 src/backinit.cpp create mode 100644 src/cuda_rhs_backup_0511.cu create mode 100644 src/nrbcX.h create mode 100644 src/nrbcZ.h create mode 100644 src/temporary.cu diff --git a/src/.cproject b/src/.cproject new file mode 100644 index 0000000..c6790cb --- /dev/null +++ b/src/.cproject @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/.project b/src/.project new file mode 100644 index 0000000..65be2a9 --- /dev/null +++ b/src/.project @@ -0,0 +1,27 @@ + + + MPI_test foler + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + diff --git a/src/.settings/language.settings.xml b/src/.settings/language.settings.xml new file mode 100644 index 0000000..4053a67 --- /dev/null +++ b/src/.settings/language.settings.xml @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/backinit.cpp b/src/backinit.cpp new file mode 100644 index 0000000..da1fdca --- /dev/null +++ b/src/backinit.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include +#include +#include "comm.h" +#include "globals.h" +#include "main.h" + +void derivGrid(double *d2f, double *df, double *f, double dx); + +void initField(int timestep, Communicator rk) { + readFileMPI('r',timestep,r,rk); + readFileMPI('u',timestep,u,rk); + readFileMPI('v',timestep,v,rk); + readFileMPI('w',timestep,w,rk); + readFileMPI('e',timestep,e,rk); + if(rk.rank==0) { + printf("finished initializing field\n"); + } +} + +void writeField(int timestep, Communicator rk) { + saveFileMPI('r',timestep,r,rk); + saveFileMPI('u',timestep,u,rk); + saveFileMPI('v',timestep,v,rk); + saveFileMPI('w',timestep,w,rk); + saveFileMPI('e',timestep,e,rk); +} + +void initGrid(Communicator rk) { + + //constant grid in y and z and stretched in x + + if (boundaryLayer) { + myprec tmp = 1/( (mxwr-1.0) - 1.0); + for (int i=0; i1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); + } else{ + TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); + BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); + + if(pRow>1) { + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); + } else{ + BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); + + } else { + TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); + } + } else { + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); + + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); + } + } + } __syncthreads(); // COMP Tile sj1 and si1 derDevSharedV1y(&wrk1,s_u[sj1],si1); @@ -98,7 +103,7 @@ __global__ void derVelY(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *d //derDevV1yL(dwdy,w,id);*/ } -__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz) { +__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, recycle rec, Communicator rk) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); int kNum = blockIdx.z; @@ -126,10 +131,78 @@ __global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *d s_v[sj][si] = v[id.g]; s_w[sj][si] = w[id.g]; __syncthreads(); - BCzNumber2(s_u[sj],u,id,si,mz,kNum); - BCzNumber2(s_v[sj],v,id,si,mz,kNum); - BCzNumber2(s_w[sj],w,id,si,mz,kNum); + + + + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } else { // all internal blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } + } + + } else{ + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } else { // all other blocks + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } + } + + } else{ + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } else { // all other blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + } + } + } else{ + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } + } else { + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + } + } + } + __syncthreads(); + // COMP Tile sj1 and si1 derDevSharedV1z(&wrk1,s_u[sj1],si1); derDevSharedV1z(&wrk2,s_v[sj1],si1); @@ -232,14 +305,16 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v, myprec dtViscInv = 0.0; myprec ien = e[id.g]/r[id.g] - 0.5*(u[id.g]*u[id.g] + v[id.g]*v[id.g] + w[id.g]*w[id.g]); - myprec sos = pow(gam*(gam-1)*ien,0.5); + myprec sos = pow(abs(gam*(gam-1)*ien),0.5); myprec dx,d2x; dx = d_dxv[id.i]; d2x = dx*dx; + myprec nu = mu[id.g]/r[id.g]; + dtConvInv = MAX( (abs(u[id.g]) + sos)/dx, MAX( (abs(v[id.g]) + sos)*d_dy, (abs(w[id.g]) + sos)*d_dz) ); - dtViscInv = MAX( mu[id.g]/d2x, MAX( mu[id.g]*d_d2y, mu[id.g]*d_d2z) ); + dtViscInv = MAX( nu/d2x, MAX( nu*d_d2y, nu*d_d2z) ); wrkArray[id.g] = CFL/MAX(dtConvInv, dtViscInv); __syncthreads(); @@ -265,8 +340,6 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre checkCuda( cudaMemcpy(hostWork, rbulk, sizeof(myprec), cudaMemcpyDeviceToHost) ); allReduceSum(hostWork,1); checkCuda( cudaMemcpy(rbulk, hostWork, sizeof(myprec), cudaMemcpyHostToDevice) ); - if(rk.rank==0) printf("step number %d with %le %le %3.10le\n",nsteps*(file-1) + istep ,*dtC,*dpdz, *hostWork); - deviceMul<<>>(workA,r,w); hostVolumeIntegral(par1,workA,rk); @@ -290,6 +363,7 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre checkCuda( cudaFree(workA) ); checkCuda( cudaFree(rbulk) ); + if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file-1) + istep ,*dtC,*dpdz); } diff --git a/src/comm.cpp b/src/comm.cpp index 0d29ee8..a53037b 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -19,6 +19,8 @@ myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm; myprec *senYp5,*senYm5,*senZp5,*senZm5; myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5; +MPI_Comm comm_col; + void updateHaloTest(myprec *var, Communicator rk) { int ierr; @@ -133,6 +135,30 @@ void updateHaloFive(myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Commu fillBoundariesFive(rcvYm5,rcvYp5,rcvZm5,rcvZp5,r,u,v,w,e,1,rk); } +void TransferToInlet(myprec *recy_r, myprec *recy_u,myprec *recy_v, myprec *recy_w, myprec *recy_t, Communicator rk) { + + int ierr; + MPI_Status status; + + if ( (rk.kstart <= recstn) && ( rk.kend >= recstn) ) { //The processor row consisting of recycling station co-od = [y=0:pRow-1, z=Recyc] + ierr = MPI_Send(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 0, MPI_COMM_WORLD); // A processor with [5, Recyc] will send data to [5, 0] + ierr = MPI_Send(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 1, MPI_COMM_WORLD); + ierr = MPI_Send(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 2, MPI_COMM_WORLD); + ierr = MPI_Send(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 3, MPI_COMM_WORLD); + ierr = MPI_Send(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 4, MPI_COMM_WORLD); + } + + if (rk.kstart == 0) { //co-od = [y=0:pRow-1, z=0] + ierr = MPI_Recv(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 0, MPI_COMM_WORLD, &status);// A processor with [8, 0] will recieve data from [8, Recyc] + ierr = MPI_Recv(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 1, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 2, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 3, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 4, MPI_COMM_WORLD, &status); + } + + ierr = MPI_Barrier(MPI_COMM_WORLD); +} + long int nameToHash(char *name, int length) { long int hash = 0; for (int i=0; imyRank(myRank); - - const int dimens[] = {pRow,pCol}; - const int periods[] = {1,1}; - int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2]; - int ierr; - - MPI_Comm comm_cart; - - ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart); - ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord); - - cZPos[0] = coord[0]; - cZNeg[0] = coord[0]; - cYPos[1] = coord[1]; - cYNeg[1] = coord[1]; - - cZPos[1] = coord[1]+1; - if(cZPos[1]>pCol-1) - cZPos[1] = 0; - - cZNeg[1] = coord[1]-1; - if(cZNeg[1]<0) - cZNeg[1] = pCol-1; - - cYPos[0] = coord[0]+1; - if(cYPos[0]>pRow-1) - cYPos[0] = 0; - - cYNeg[0] = coord[0]-1; - if(cYNeg[0]<0) - cYNeg[0] = pRow-1; + rk->myRank(myRank); + + const int dimens[] = {pRow,pCol}; + const int periods[] = {1,1}; + int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2], cZSenRR[2], cZRecRR[2], color; + int ierr; + + MPI_Comm comm_cart; + + ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart); + ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord); + + cZPos[0] = coord[0]; + cZNeg[0] = coord[0]; + cYPos[1] = coord[1]; + cYNeg[1] = coord[1]; + + cZPos[1] = coord[1]+1; + if(cZPos[1]>pCol-1) + cZPos[1] = 0; + + cZNeg[1] = coord[1]-1; + if(cZNeg[1]<0) + cZNeg[1] = pCol-1; + + cYPos[0] = coord[0]+1; + if(cYPos[0]>pRow-1) + cYPos[0] = 0; + + cYNeg[0] = coord[0]-1; + if(cYNeg[0]<0) + cYNeg[0] = pRow-1; + + ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp); + ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km); + ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp); + ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm); + + rk->jstart = coord[0]*my; + rk->jend = rk->jstart + my; + rk->kstart = coord[1]*mz; + rk->kend = rk->kstart + mz; + + int kr1,kr2; + int RecyZRank = 0; + for(int i = 0; i= recstn)) { + RecyZRank = i; + } + + } + + cZSenRR[0] = coord[0]; + cZSenRR[1] = RecyZRank; + cZRecRR[0] = coord[0]; + cZRecRR[1] = 0; + + ierr = MPI_Cart_rank(comm_cart, cZSenRR, &rk->coodsend); + ierr = MPI_Cart_rank(comm_cart, cZRecRR, &rk->coodrecv); + + + long int hash; + MPI_Comm comm_node; + int procnamesize; + char procname[MPI_MAX_PROCESSOR_NAME]; + + ierr = MPI_Get_processor_name(procname, &procnamesize); + + hash = (rk->rank)/GPUperNode ; // nameToHash(procname, procnamesize); + + ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node); + ierr = MPI_Comm_rank(comm_node, &rk->nodeRank); + + // MPI_Comm comm_col; // All rows in a particular column. Used to communicate across Y direction. [0:pRow-1, z] + color = coord[1]; // All processors in a particular column will be bundled in the communicator called comm_col; + ierr = MPI_Comm_split(MPI_COMM_WORLD, color, rk->rank, &comm_col); // refer https://mpitutorial.com/tutorials/introduction-to-groups-and-communicators/ + int rankcol; + ierr = MPI_Comm_rank(comm_col, &rankcol); + + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank); + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("SendingRank %d, Recieving Rank %d\n",rk->coodsend,rk->coodrecv); + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("Neighbours %d\n",rk -> km); + ierr = MPI_Barrier(MPI_COMM_WORLD); - ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp); - ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km); - ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp); - ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm); - - rk->jstart = coord[0]*my; - rk->jend = rk->jstart + my; - rk->kstart = coord[1]*mz; - rk->kend = rk->kstart + mz; - - long int hash; - MPI_Comm comm_node; - int procnamesize; - char procname[MPI_MAX_PROCESSOR_NAME]; - - ierr = MPI_Get_processor_name(procname, &procnamesize); - - hash = 1; // nameToHash(procname, procnamesize); - - ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node); - ierr = MPI_Comm_rank(comm_node , &rk->nodeRank); - ierr = MPI_Barrier(MPI_COMM_WORLD); - printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank); - ierr = MPI_Barrier(MPI_COMM_WORLD); } void saveFileMPI(char filename, int timestep, double *var, Communicator rk) { @@ -277,6 +335,144 @@ void readFileMPI(char filename, int timestep, double *var, Communicator rk) { var[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot]; delete [] var_tot; } +void readFileMPIInit(char filename, int timestep, double *r, double *u, double *v, double *w, double *ret, Communicator rk){ + double *var_tot = new double[mx_tot*my_tot*mz_tot*5]; + int ierr; + int lSize = mx_tot*my_tot*mz_tot*5; + + if(rk.rank==0) { + char str[80]; + size_t result; + sprintf(str, "fields/%c.%07d.bin",filename,timestep); + FILE *fb = fopen(str,"rb"); + result = fread(var_tot , sizeof(myprec) , lSize , fb ); + fclose(fb); + } + ierr = MPI_Barrier(MPI_COMM_WORLD); + + ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD); + + ierr = MPI_Barrier(MPI_COMM_WORLD); + + for (int m=0; m<5; m++){ + for (int k=rk.kstart; k 279 || my+2*stencilSize>600 || mz/nDivZ+2*stencilSize>600 @@ -64,11 +64,13 @@ const int lPencils = 32; #endif #endif + extern __constant__ myprec dcoeffF[stencilSize]; extern __constant__ myprec dcoeffS[stencilSize+1]; extern __constant__ myprec dcoeffVF[stencilVisc]; extern __constant__ myprec dcoeffVS[stencilVisc+1]; + #if mx<=546 //limit on the GPU constant memory usage (655356 bytes) extern __constant__ myprec dcoeffSx[mx*(2*stencilSize+1)]; extern __constant__ myprec dcoeffVSx[mx*(2*stencilVisc+1)]; @@ -82,8 +84,8 @@ extern __device__ myprec time_on_GPU; extern __device__ Communicator rkGPU; -extern dim3 d_block[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ; -extern dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ; +extern dim3 d_grid[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ, d_gridx, gridBCw; +extern dim3 d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw; extern cudaStream_t s[8+nDivZ]; @@ -138,4 +140,12 @@ extern myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm; extern myprec *senYp5,*senYm5,*senZp5,*senZm5; extern myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5; +extern myprec *rm, *um, *wm, *tm; +extern myprec *rm_in, *um_in, *wm_in, *tm_in; +extern myprec *a_inpl, *b_inpl; +extern int *idxm, *idxp; +extern myprec *delta_rec, *delta_in ; +extern myprec *recy_r, *recy_u, *recy_v, *recy_w, *recy_e, *recy_h, *recy_p, *recy_t, *recy_m, *recy_l; + + #endif diff --git a/src/cuda_main.cu b/src/cuda_main.cu index 4f04436..f031113 100644 --- a/src/cuda_main.cu +++ b/src/cuda_main.cu @@ -6,56 +6,182 @@ #include "comm.h" #include "sponge.h" - - -__device__ __constant__ myprec alpha[] = {0. , -17./60., -5./12.}; +__device__ __constant__ myprec alpha[] = {0., -17./60., -5./12.}; __device__ __constant__ myprec beta[] = {8./15., 5./12., 3./4. }; cudaStream_t s[8+nDivZ]; -inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk) { +inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk, recycle rec) { + if (topbc == 3) { + BCwallCenteredTop<<>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho). + // So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at + // boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC). + } + if (bottombc == 3) { + BCwallCenteredBot<<>>(d_r,d_u,d_v,d_w,d_e); + + } + if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) calcState<<>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,0); //here 0 means interior points - //derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); - derVelY<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]); - //for (int kNum=0; kNum<1; kNum++) //CHANGED!!!!! - derVelZ<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]); + // if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) + deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz); + if(multiGPU) { - updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); cudaDeviceSynchronize(); + updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); + cudaDeviceSynchronize(); calcState<<>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,1); //here 1 means halo points - derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0); //here 0 means lower boundary (0-index) - derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0); //here 0 means lower boundary (0-index) - derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1); //here 1 means upper boundary (my-index) - derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1); //here 1 means upper boundary (mz-index) + // derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0); //here 0 means lower boundary (0-index) + // derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0); //here 0 means lower boundary (0-index) + // derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1); //here 1 means upper boundary (my-index) + // derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1); //here 1 means upper boundary (mz-index) } - cudaDeviceSynchronize(); - //calcDil<<>>(d_dil,gij[0],gij[4],gij[8]); - cudaDeviceSynchronize(); - if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) - deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz,0); + if(multiGPU) updateHalo(d_dil,rk); cudaDeviceSynchronize(); deviceRHSY<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz); - cudaDeviceSynchronize(); - deviceRHSZ<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz); cudaDeviceSynchronize(); - if(boundaryLayer) addSponge<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e); + deviceRHSZ<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz, rk,rec); + cudaDeviceSynchronize(); + //if(boundaryLayer) addSponge<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e); cudaDeviceSynchronize(); } -void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communicator rk) { - - cudaSetDevice(rk.nodeRank); - myprec h_dt,h_dpdz; - - for (int istep = 0; istep < nsteps; istep++) { - - if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); - if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); - if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); - deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk); - - +void runSimulation(int file, myprec *par1, myprec *par2, myprec *time, Communicator rk) { + + cudaSetDevice(rk.nodeRank); + myprec h_dt,h_dpdz; + + for (int istep = 0; istep < nsteps; istep++) { + if (topbc == 3) { + BCwallCenteredTop<<>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho). + // So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at + // boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC). + } + if (bottombc == 3) { + BCwallCenteredBot<<>>(d_r,d_u,d_v,d_w,d_e); + + } + cudaDeviceSynchronize(); + + if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); + if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1],dtC); + if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); + deviceAdvanceTime<<<1,1>>>(dtC); + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk); + + int notanum=0; + if (h_dt != h_dt) notanum = 1; + if (notanum>0){ + if(rk.rank==0) printf("Encountered a NAN in dt\n"); + exit(1); + } + + //Recycling-Rescaling + if (inletbc == 5) { + if (( rk.kstart <= recstn) && ( rk.kend >= recstn) ) { // this section will be executed only by the processors that includes recycling stn. + if (abs(recstn - rk.kstart) < stencilSize-1) { + printf("The Recycling station is too close to the processor left boundary."); + exit(1); + } + if(istep%checkMeanRec==0) { + calcMeanRec(d_r, d_u, d_w, d_t, rm, um, wm, tm, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk); + } + cudaDeviceSynchronize(); + Recycle_Rescale(d_r, d_u, d_v, d_w, d_t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r, + recy_u, recy_v, recy_w, recy_t, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk); + cudaDeviceSynchronize(); + } + if (pCol > 1) { + TransferToInlet(recy_r, recy_u, recy_v, recy_w, recy_t, rk); + cudaDeviceSynchronize(); + } + + if (pRow==1 && spanshift) { + Spanwiseshift(recy_r, recy_u, recy_v, recy_w, recy_t, rk); + cudaDeviceSynchronize(); + } + + + + + dim3 gridcalcstateRR; + gridcalcstateRR = dim3(my, stencilSize, 1); + calcStateRR<<>>(recy_r, recy_u, recy_v, recy_w, recy_e, recy_h, recy_t, recy_p, recy_m, recy_l); + } + cudaDeviceSynchronize(); + + if ((istep%100==0)){ + /*myprec *h_wminl = (myprec*)malloc(mx*sizeof(myprec)); + myprec *wminl; + checkCuda( cudaMalloc( (void**)&wminl, mx_tot*sizeof(myprec) ) ); + int gridYAvg = mx_tot; + int blockYAvg = my; + YAvg<<>>(recy_w, wminl, stencilSize-1, 0); + cudaDeviceSynchronize(); + checkCuda( cudaMemcpy(h_wminl, wminl, mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(h_wminl,mx_tot);*/ + + myprec *h_delre = (myprec*)malloc(stencilSize*sizeof(myprec)); + checkCuda( cudaMemcpy(h_delre, delta_rec, stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) ); + + myprec *h_wm = (myprec*)malloc(stencilSize*mx*sizeof(myprec)); + checkCuda( cudaMemcpy(h_wm, wm, mx*stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) ); + + myprec deltastar=0; + int kn = stencilSize-1; + myprec Uinv = 1/h_wm[mx-1 + kn*mx]; + for (int i=0; i>>(d_uO,d_r,d_u); deviceMul<<>>(d_vO,d_r,d_v); deviceMul<<>>(d_wO,d_r,d_w); @@ -65,181 +191,186 @@ void runSimulation(int file , myprec *par1, myprec *par2, myprec *time, Communic //Starting the Runge-Kutta Steps //runge kutta step 1 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk,rec); + //eulerSum<<>>(d_r,d_rO,d_rhsr1,dtC); //eulerSum<<>>(d_e,d_eO,d_rhse1,dtC); - eulerSumAll<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, - d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC); + eulerSumAll<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC); //cudaDeviceSynchronize(); //eulerSumR<<>>(d_u,d_uO,d_rhsu1,d_r,dtC); //eulerSumR<<>>(d_v,d_vO,d_rhsv1,d_r,dtC); //eulerSumR<<>>(d_w,d_wO,d_rhsw1,d_r,dtC); cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } + /* if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + }*/ //runge kutta step 2 - calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); + calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk,rec); //eulerSum3<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC); //eulerSum3<<>>(d_e,d_eO,d_rhse1,d_rhse2,dtC); - eulerSumAll2<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, - d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC); - //cudaDeviceSynchronize(); + eulerSumAll2<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC); + //cudaDeviceSynchronize(); //eulerSum3R<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC); //eulerSum3R<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC); //eulerSum3R<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC); cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } + /* if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + }*/ //runge kutta step 3 - calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk); -// rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); -// rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); - eulerSumAll3<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, - d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, - d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC); - -// cudaDeviceSynchronize(); -// rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); -// rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); -// rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); + calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk,rec); + // rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); + // rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); + eulerSumAll3<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, + d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC); + + // cudaDeviceSynchronize(); + // rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); + // rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); + // rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); cudaDeviceSynchronize(); } } void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communicator rk) { - cudaSetDevice(rk.nodeRank); - - myprec h_dt,h_dpdz; - - for (int istep = 0; istep < nsteps; istep++) { -/* if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); - if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); - if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); - deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk);*/ - - //Starting the Runge-Kutta Steps - - //runge kutta step 1 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr1, dtC, 0); - sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu1, dtC, 0); - sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv1, dtC, 0); - sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw1, dtC, 0); - sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse1, dtC, 0); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } - - - //runge kutta step 2 - calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr2, dtC, 1); - sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu2, dtC, 1); - sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv2, dtC, 1); - sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw2, dtC, 1); - sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse2, dtC, 1); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } - - //runge kutta step 3 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr2, d_rhsr1, dtC, 2); - sumLowStorageRK3<<>>(d_u, d_rhsu2, d_rhsu1, dtC, 2); - sumLowStorageRK3<<>>(d_v, d_rhsv2, d_rhsv1, dtC, 2); - sumLowStorageRK3<<>>(d_w, d_rhsw2, d_rhsw1, dtC, 2); - sumLowStorageRK3<<>>(d_e, d_rhse2, d_rhse1, dtC, 2); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - } + /* cudaSetDevice(rk.nodeRank); + + myprec h_dt,h_dpdz; + + for (int istep = 0; istep < nsteps; istep++) { + if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); + if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); + if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); + deviceAdvanceTime<<<1,1>>>(dtC); + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); + + //Starting the Runge-Kutta Steps + + //runge kutta step 1 + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr1, dtC, 0); + sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu1, dtC, 0); + sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv1, dtC, 0); + sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw1, dtC, 0); + sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse1, dtC, 0); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + + if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + } + + + //runge kutta step 2 + calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr2, dtC, 1); + sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu2, dtC, 1); + sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv2, dtC, 1); + sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw2, dtC, 1); + sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse2, dtC, 1); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + + if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + } + + //runge kutta step 3 + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr2, d_rhsr1, dtC, 2); + sumLowStorageRK3<<>>(d_u, d_rhsu2, d_rhsu1, dtC, 2); + sumLowStorageRK3<<>>(d_v, d_rhsv2, d_rhsv1, dtC, 2); + sumLowStorageRK3<<>>(d_w, d_rhsw2, d_rhsw1, dtC, 2); + sumLowStorageRK3<<>>(d_e, d_rhse2, d_rhse1, dtC, 2); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + }*/ } -__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, - myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1 - ,myprec *dt) { +__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1 + ,myprec *dt) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); - myprec tmp1 = 0; + id.mkidX(); + myprec tmp1 = 0; tmp1 = (r0[id.g] + rhsr1[id.g]*(*dt)/2.0); - r[id.g] = tmp1; + r[id.g] = tmp1; u[id.g] = (u0[id.g] + rhsu1[id.g]*(*dt)/2.0)/tmp1; v[id.g] = (v0[id.g] + rhsv1[id.g]*(*dt)/2.0)/tmp1; w[id.g] = (w0[id.g] + rhsw1[id.g]*(*dt)/2.0)/tmp1; - e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0); + e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0); } -__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, - myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, - myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) { +__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); - myprec tmp1 = 0; + id.mkidX(); + myprec tmp1 = 0; tmp1 = ( r0[id.g] + (2*rhsr2[id.g] - rhsr1[id.g])*(*dt) ); - r[id.g] = tmp1; + r[id.g] = tmp1; u[id.g] = ( u0[id.g] + (2*rhsu2[id.g] - rhsu1[id.g])*(*dt) )/tmp1; v[id.g] = ( v0[id.g] + (2*rhsv2[id.g] - rhsv1[id.g])*(*dt) )/tmp1; w[id.g] = ( w0[id.g] + (2*rhsw2[id.g] - rhsw1[id.g])*(*dt) )/tmp1; - e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) ); + e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) ); } -__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, - myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, - myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, - myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) { +__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, + myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); - myprec tmp1 = 0; + id.mkidX(); + myprec tmp1 = 0; tmp1 = r0[id.g] + (*dt)*( rhsr1[id.g] + 4*rhsr2[id.g] + rhsr3[id.g])/6.; - r[id.g] = tmp1; + r[id.g] = tmp1; u[id.g] = ( u0[id.g] + (*dt)*( rhsu1[id.g] + 4*rhsu2[id.g] + rhsu3[id.g])/6.)/tmp1; v[id.g] = ( v0[id.g] + (*dt)*( rhsv1[id.g] + 4*rhsv2[id.g] + rhsv3[id.g])/6.)/tmp1; w[id.g] = ( w0[id.g] + (*dt)*( rhsw1[id.g] + 4*rhsw2[id.g] + rhsw3[id.g])/6.)/tmp1; - e[id.g] = ( e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.); + e[id.g] = ( e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.); } @@ -276,58 +407,131 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec * __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc) { -/* int threadsPerBlock = blockDim.x * blockDim.y; - int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; - int blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; - int gl = blockNumInGrid * threadsPerBlock + threadNumInBlock; + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + int gl = id.g; + if(bc==1) { + int threadsPerBlock = blockDim.x * blockDim.y; + int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; + int blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; + gl = blockNumInGrid * threadsPerBlock + threadNumInBlock; + gl += mx*my*mz; + } + myprec cvInv = (gam - 1.0)/Rgas; + myprec r = rho[gl]; + myprec u = uvel[gl]; + myprec v = vvel[gl]; + myprec w = wvel[gl]; + myprec etot = ret[gl]; - if(bc==1) gl += mx*my*mz; + myprec invrho = 1.0/r; + myprec en = etot*invrho - 0.5*(u*u + v*v + w*w); - myprec cvInv = (gam - 1.0)/Rgas; + myprec t = cvInv*en; + myprec p = r*Rgas*t; + tem[gl] = t; + pre[gl] = p; - myprec invrho = 1.0/rho[gl]; + ht[gl] = (etot + p)*invrho; - myprec en = ret[gl]*invrho - 0.5*(uvel[gl]*uvel[gl] + vvel[gl]*vvel[gl] + wvel[gl]*wvel[gl]); - tem[gl] = cvInv*en; - pre[gl] = rho[gl]*Rgas*tem[gl]; - ht[gl] = (ret[gl] + pre[gl])*invrho; + myprec suth = pow(t,viscexp); + mu[gl] = suth/Re; + lam[gl] = suth/Re/Pr/Ec; + + __syncthreads(); + +} + +__global__ void calcStateRR(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam) { - myprec suth = pow(tem[gl],viscexp); - mu[gl] = suth/Re; - lam[gl] = suth/Re/Pr/Ec; - __syncthreads();*/ Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); - int gl = id.g; - if(bc==1) gl += mx*my*mz; + id.mkidBCRR(blockIdx.y); + int gl = id.g; - myprec cvInv = (gam - 1.0)/Rgas; - myprec r = rho[gl]; - myprec u = uvel[gl]; - myprec v = vvel[gl]; - myprec w = wvel[gl]; - myprec etot = ret[gl]; + myprec cv = Rgas/(gam - 1.0) ; + myprec r = rho[gl]; + myprec u = uvel[gl]; + myprec v = vvel[gl]; + myprec w = wvel[gl]; + myprec t = tem[gl]; - myprec invrho = 1.0/r; - myprec en = etot*invrho - 0.5*(u*u + v*v + w*w); + myprec en = cv*t ; - myprec t = cvInv*en; - myprec p = r*Rgas*t; //tmp1 is temperature // tmp2 is pressure - tem[gl] = t; - pre[gl] = p; + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ) ; + myprec p = r*Rgas*t; + ret[gl] = etot; + pre[gl] = p; - ht[gl] = (etot + p)*invrho; + ht[gl] = (etot + p)/r; - myprec suth = pow(t,viscexp); - mu[gl] = suth/Re; - lam[gl] = suth/Re/Pr/Ec; + myprec suth = pow(t,viscexp); + mu[gl] = suth/Re; + lam[gl] = suth/Re/Pr/Ec; - __syncthreads(); + __syncthreads(); } +__global__ void BCwallCenteredTop(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) { + + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCwallTop(); + + + myprec u = 0.0; + myprec v = 0.0; + myprec w = 0.0; + myprec t = TwallTop; + + uvel[id.g] = u; + vvel[id.g] = v; + wvel[id.g] = w; + + myprec cv = Rgas/(gam - 1.0); + myprec r = rho[id.g]; + + myprec en = cv*t; + + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ); + + ret[id.g] = etot; + + __syncthreads(); + +} +__global__ void BCwallCenteredBot(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) { + + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCwallBot(); + + + myprec u = 0.0; + myprec v = 0.0; + myprec w = 0.0; + myprec t = TwallBot; + + uvel[id.g] = u; + vvel[id.g] = v; + wvel[id.g] = w; + + myprec cv = Rgas/(gam - 1.0); + myprec r = rho[id.g]; + + myprec en = cv*t; + + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ); + + ret[id.g] = etot; + + __syncthreads(); + +} + + __global__ void sumLowStorageRK3(myprec *var, myprec *rhs1, myprec *rhs2, myprec *dt, int step) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); var[id.g] = var[id.g] + (*dt)*(alpha[step]*rhs1[id.g] + beta[step]*rhs2[id.g]); @@ -337,10 +541,10 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m cudaSetDevice(rk.nodeRank); calcTimeStep(dtC,d_r,d_u,d_v,d_w,d_e,d_m,rk); - cudaMemcpy(h_dt , dtC , sizeof(myprec), cudaMemcpyDeviceToHost); + cudaMemcpy(h_dt, dtC, sizeof(myprec), cudaMemcpyDeviceToHost); allReduceToMin(h_dt,1); mpiBarrier(); - cudaMemcpy(dtC , h_dt , sizeof(myprec), cudaMemcpyHostToDevice); + cudaMemcpy(dtC, h_dt, sizeof(myprec), cudaMemcpyHostToDevice); if(forcing) { calcPressureGrad(dpdz,d_r,d_w,rk); //Changed here dpdz_new = dpdz_old - 0.5*(rw_bulk - 1 ) cudaMemcpy(h_dpdz, dpdz, sizeof(myprec), cudaMemcpyDeviceToHost); @@ -351,79 +555,357 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m //if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file) + istep ,*h_dt,*h_dpdz); } +void calcMeanRec(myprec *r, myprec *u,myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in,Communicator rk) { + + myprec *hostYmean = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + + int gridYAvg = mx_tot; + int blockYAvg = my; + + for (int k = 0; k < stencilSize; k++){ + int krec = recstn - rk.kstart - (stencilSize-1) + k; // stencilSize-1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn + YAvg<<>>(r, rm, krec, k);// *2 because the next power of 2 shall be <= 2*my. Eg: 139.. next power = 256 < 278 + YAvg<<>>(u, um, krec, k); //wall normal velocity + YAvg<<>>(w, wm, krec, k); //streamwise velocity + YAvg<<>>(t, tm, krec, k); + cudaDeviceSynchronize(); + } + + checkCuda( cudaMemcpy(hostYmean, rm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); // check if mpi_barrier is needed with comm_col communicator? + checkCuda( cudaMemcpy(rm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, um, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(um, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, wm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(wm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, tm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(tm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + deltaRR<<<1, stencilSize,0,s[0]>>>(wm, delta_rec); + interpRR<<>>(a, b, idxm, idxp, delta_rec, delta_in) ; + + free(hostYmean); +} + +void Recycle_Rescale(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, + myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u, + myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) { + + dim3 gridRR; + gridRR = dim3(my,stencilSize,1); + int blockRR = mx_tot; + + Recy_Resc<<>>(r, u, v, w, t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r, recy_u, recy_v, recy_w, recy_t, + a, b, idxm, idxp, delta_rec, delta_in, rk); + + +} + + +void InletMeanUpdate(myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *delta_in, Communicator rk) { + + myprec *um_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *wm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *tm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *rm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *delta99_inRR = (myprec*)malloc(stencilSize*sizeof(myprec)); + + readFileInitBL_inRR('r',rm_inRR,rk); + readFileInitBL_inRR('u',um_inRR,rk); + readFileInitBL_inRR('w',wm_inRR,rk); + readFileInitBL_inRR('t',tm_inRR,rk); + readFileInitBL1D_inRR('d',delta99_inRR,rk); + + checkCuda( cudaMemcpy(rm_in, rm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(um_in, um_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(wm_in, wm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(tm_in, tm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(delta_in, delta99_inRR, stencilSize*sizeof(myprec), cudaMemcpyHostToDevice) ); + + free(um_inRR); + free(wm_inRR); + free(tm_inRR); + free(rm_inRR); + free(delta99_inRR); +} + +void Spanwiseshift(myprec *recy_r,myprec *recy_u,myprec *recy_v,myprec *recy_w,myprec *recy_t, Communicator rk){ + + dim3 grid; + grid = dim3(mx_tot, stencilSize, 1); + + spanshifting<<>>(recy_r); + spanshifting<<>>(recy_u); + spanshifting<<>>(recy_v); + spanshifting<<>>(recy_w); + spanshifting<<>>(recy_t); + +} + +__global__ void spanshifting(myprec *var){ + + int k = blockIdx.y; + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidSpanShift(k,0); + __shared__ myprec s_var[my]; + + s_var[id.tix] = var[id.g]; + __syncthreads(); + int shift = my/2; + Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id1.mkidSpanShift(k,shift); + var[id1.g] = s_var[id.tix] ; + __syncthreads(); + +} + + +__global__ void Recy_Resc(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, + myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u, + myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) { + + int k = blockIdx.y; + int krec = (recstn-stencilSize+1) - rk.kstart + k; // stencilSize+1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCRR(krec); + + Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id1.mkidBCRR(k); + + int id_mean = id.i + mx_tot*k ; // + + int idm = idxm[id_mean]; + int idp = idxp[id_mean]; + + myprec delratio = delta_rec[k]/delta_in[k] ; + myprec beta_rec = pow(abs(delratio),0.1); // (delta_rec / delta_in)^(1/10) -- delta_in = 1; + + __shared__ myprec s_fluc[mx_tot]; + __shared__ myprec s_mean[mx_tot]; + +/* s_mean[id.i] = um[id_mean] ; + __syncthreads(); + um_in[id_mean] = 1*( a[id_mean] * s_mean[idm] + (1.0-a[id_mean])* s_mean[idp] ) ;// not using b as I dont want um_in=0 which will happen when a=0&b=0. +*/ + + __syncthreads(); + s_fluc[id.i] = r[id.g] - rm[id_mean] ; + __syncthreads(); + myprec flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_r[id1.g] = rm_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = u[id.g] - um[id_mean] ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_u[id1.g] = um_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = v[id.g] - 0 ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_v[id1.g] = 0 + flucinlet; + + __syncthreads(); + s_fluc[id.i] = w[id.g] - wm[id_mean] ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_w[id1.g] = wm_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = t[id.g] - tm[id_mean] ; + __syncthreads(); + flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_t[id1.g] = tm_in[id_mean] + flucinlet; + +} + + +__global__ void YAvg(myprec *f, myprec *fm, int krec, int kRR) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCRecycAvg(krec); + + unsigned int n = id.bdx; + n = findNextPowerOf2(n); // to perform the reduction fast we need an array that is of the power of 2 + + extern __shared__ myprec s_avg[] ; + + for(int i=id.tix; iid.bdx to be a power of 2. + } + } + __syncthreads(); + + for (int size = n/2; size>0; size/=2) { + if (id.tix 1) { + k = (khi+klo)/2; + if(wm[k + id.tix*mx_tot] > winf) { + khi=k; + } else { + klo=k; + } + } + + + myprec a = (wm[khi+id.tix*mx_tot] - winf)/(wm[khi+id.tix*mx_tot] - wm[klo+id.tix*mx_tot]); + + delta[id.tix] = d_x[khi] - a*(d_x[khi] - d_x[klo]) ; +} + +__global__ void interpRR(myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int idii = id.tix + mx_tot*id.bix ; + + __shared__ myprec s_x[mx_tot]; + s_x[id.tix] = d_x[id.tix]/delta_rec[id.bix]; + __syncthreads(); + + int khi = mx_tot-1; + int klo = 0; + int k; + + myprec xi = d_x[id.tix]/delta_in[id.bix]; //inlet x/delta + + // In the following lines of code we will find that what indices on the recycling station corresponds to the x/delta of the inlet. + // This will help in interpolating the data at the recycling station so as to be recycled to the inlet. + // Suppose we have x/delta_in = 0.35. But at the recycling end we have x/delta_re = 0.32 and 0.38. Thus, first using the while loop + // we compute the khi and klo correcponding to x=0.32 and x=0.38. Then we compute the weights as b=(0.38-0.35)/(0.38-0.32) & a=1-b; + // a and b are the weights which will be used for interpolation. + while (khi-klo > 1) { + k = (khi+klo)/2; + if( s_x[k] > xi) { // compare with the recycling station x/delta; + khi=k; + } else { + klo=k; + } + } + if (xi <= s_x[id.bdx-1]) { //id.bdx implies last value + idxp[idii] = khi ; + idxm[idii] = klo ; + + a[idii] = (s_x[khi] - xi)/(s_x[khi] - s_x[klo]) ; + b[idii] = 1.0 - a[idii]; + } else { + idxp[idii] = khi ; + idxm[idii] = klo ; + + a[idii] = 0.0 ; + b[idii] = 0.0; // This implies that for all the cells at the inlet that are beyond the x/delta_re|max, + //we do the interpolation such that u[x/deltain> x/delta_remax] = beta_rec * (b * u[idp]) where idp is khi. + //The while loop should ensure that the khi is maintained to its set value of mx_tot-1. + } +} + + + void solverWrapper(Communicator rk) { cudaSetDevice(rk.nodeRank); int start; - myprec *dpar1, *dpar2, *dtime; - myprec *hpar1 = new myprec[nsteps]; - myprec *hpar2 = new myprec[nsteps]; - myprec *htime = new myprec[nsteps]; - - checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) ); - checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) ); - checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) ); + myprec *dpar1, *dpar2, *dtime; + myprec *hpar1 = new myprec[nsteps]; + myprec *hpar2 = new myprec[nsteps]; + myprec *htime = new myprec[nsteps]; - FILE *fp; + checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) ); + checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) ); + checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) ); - //check the memory usage of the GPU - checkGpuMem(rk); + FILE *fp; + //check the memory usage of the GPU + checkGpuMem(rk); + if (inletbc == 5) { + InletMeanUpdate(rm_in, um_in, wm_in, tm_in, delta_in, rk); // The means at the inflow bc (ghost cells of inflow) will remain constant for the entire simulation and hence it is set once outside the loop + } - /*derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); - derVelY<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]); - derVelZ<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8]); - calcDil<<>>(d_dil,gij[0],gij[4],gij[8]);*/ // In Y and Z derivatives are turned off. so need to fix that or create a new kernel. + if(rk.rank==5){ + FILE *fdel = fopen("delstar.txt","w+"); + fclose(fdel);} + if(restartFile<0) { + start=0; + } else { + start=restartFile; + } + for(int file = start+1; file>= 1; + count += 1; + } + + n = 1 << count; + return n; +} + unsigned int hostFindPreviousPowerOf2(unsigned int n) { while (n & n - 1) { @@ -305,6 +318,20 @@ unsigned int hostFindPreviousPowerOf2(unsigned int n) return n; } +unsigned int hostfindNextPowerOf2(unsigned int n) +{ + unsigned count = 0; + while(n != 0) + { + n >>= 1; + count += 1; + } + + n = 1 << count; + return n; +} + + void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk) { cudaSetDevice(rk.nodeRank); @@ -341,13 +368,12 @@ void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk) { int tot = mx*my*mz; int gr = my * mz; - int bl = mx ; - - checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) ); - cudaDeviceSynchronize(); + int bl = mx ; bl = hostFindPreviousPowerOf2(bl); + checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) ); + cudaDeviceSynchronize(); integrateThreads<<>>(dwrkM, var , tot); cudaDeviceSynchronize(); reduceThreads<<< 1 , bl, bl*sizeof(myprec)>>>(dwrkM, dwrkM, gr); diff --git a/src/cuda_math.h b/src/cuda_math.h index e1aab99..c962bae 100644 --- a/src/cuda_math.h +++ b/src/cuda_math.h @@ -29,8 +29,11 @@ __global__ void reduceThreads (myprec *gOut, myprec *gArr, int arraySize); __global__ void minOfThreads (myprec *gOut, myprec *gArr, int arraySize); __global__ void maxOfThreads (myprec *gOut, myprec *gArr, int arraySize); __device__ unsigned int findPreviousPowerOf2(unsigned int n); +__device__ unsigned int findNextPowerOf2(unsigned int n); void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk); void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk); void hostVolumeAverage(myprec *gOut, myprec *var, Communicator rk); +unsigned int hostfindNextPowerOf2(unsigned int n); +unsigned int hostFindPreviousPowerOf2(unsigned int n); #endif /* CUDA_MATH_H_ */ diff --git a/src/cuda_rhs.cu b/src/cuda_rhs.cu index 225f99b..c24122d 100644 --- a/src/cuda_rhs.cu +++ b/src/cuda_rhs.cu @@ -5,19 +5,30 @@ #include "boundary_condition_x.h" #include "boundary_condition_y.h" #include "boundary_condition_z.h" +#include "boundary.h" +#include "nrbcX.h" +#include "nrbcZ.h" + __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , myprec *t, myprec *p, myprec *mu, myprec *lam, myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, - myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz, int iNum) { + myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); + int iNum = blockIdx.z ; + id.mkidXFlx(iNum); int si = id.tix + stencilSize; // local i for shared memory access + halo offset int sj = id.tiy; // local j for shared memory access + bool boolIsothnrbc_top = ((topbc==3) && (id.i == mx_tot-1)); + bool boolIsothnrbc_bot = ((bottombc==3) && (id.i == 0)); + bool boolFreestreamnrbc_top = ((topbc==4) && (id.i == mx_tot-1)); + bool boolAdiabnrbc_top = ((topbc==6) && (id.i == mx_tot-1)); + bool boolAdiabnrbc_bot = ((bottombc==6) && (id.i == 0)); + myprec rXtmp=0; myprec uXtmp=0; myprec vXtmp=0; @@ -28,14 +39,13 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre myprec wrk2=0; myprec wrk3=0; - - __shared__ myprec s_u[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_v[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_w[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_t[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_p[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_prop1[sPencils/2][mx+stencilSize*2]; - __shared__ myprec s_prop2[sPencils/2][mx+stencilSize*2]; + __shared__ myprec s_u[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_v[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_w[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_t[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_p[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_prop1[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_prop2[sPencils/2][mx/nX+stencilSize*2]; s_u[sj][si] = u[id.g]; s_v[sj][si] = v[id.g]; @@ -48,19 +58,43 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre // Boundary conditions in the x-direction are in boundary_condition_x.h // these are the BCs for u,v,w,p,t,mu,lambda - BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc); + BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id); + TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id); + + BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id); + BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id); + + TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc); + } else { + TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id); + TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id); + + BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id); + BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id); + } + } + } + __syncthreads(); derDevSharedV1x(&wrk1,s_u[sj],si); derDevSharedV1x(&wrk2,s_v[sj],si); derDevSharedV1x(&wrk3,s_w[sj],si); - dudx[id.g] = wrk1; - dvdx[id.g] = wrk2; - dwdx[id.g] = wrk3; + dudx[id.g] = wrk1; + dvdx[id.g] = wrk2; + dwdx[id.g] = wrk3; - rXtmp = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable. - dil[id.g] = rXtmp; + rXtmp = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable. + dil[id.g] = rXtmp; //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms uXtmp = ( 2 * wrk1 - 2./3.*rXtmp ); //rXtmp stores dilatation vXtmp = ( wrk2 + dudy[id.g] ); @@ -71,7 +105,7 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx - uXtmp *= wrk2; + uXtmp *= wrk2; vXtmp *= wrk2; wXtmp *= wrk2; @@ -83,29 +117,60 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre derDevSharedV2x(&wrk3,s_w[sj],si); wXtmp = wXtmp + wrk3*s_prop1[sj][si]; + if ( boolFreestreamnrbc_top ) {// This is what the NSCBC says (to put derivative of tangential stress ==0. BUT NOTE THAT WE ARE not setting THE CROSS DERIVATIVE TERMS COMING OUT OF TAU12 AND TAU 13 I.E. mu d2u/dxdy and mu d2u/dxdz equal to zero. This is because to do it we need to edit the Y and Z kernel which will make the code complicated.; + vXtmp = 0; + wXtmp = 0; + } //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) - derDevSharedV2x(&wrk1,s_t[sj],si); - eXtmp = eXtmp + wrk1*s_prop2[sj][si]; - derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx - derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx - eXtmp = eXtmp + wrk2*wrk3; + if ( boolFreestreamnrbc_top ) { // dqdx = 0; + eXtmp = eXtmp + 0; + } else { + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk2*wrk3; + } // pressure and dilation derivatives + __syncthreads(); s_prop2[sj][si] = rXtmp; // rXtmp stores dilatation temporarily __syncthreads(); // these are the BCs for the dilatation - BCxNumber2(s_prop2[sj],id,si,mx); + + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc); + BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_prop2[sj],dil,si,id); + BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_prop2[sj],dil,si,id); + TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc); + } else { + TopBCxCpy(s_prop2[sj],dil,si,id); + BotBCxCpy(s_prop2[sj],dil,si,id); + } + } + } __syncthreads(); derDevSharedV1x(&wrk2,s_prop2[sj],si); derDevShared1x(&wrk1 ,s_p[sj],si); + + if (boolFreestreamnrbc_top){ // pressure is taken care in the nrbc + wrk1 = 0; + } uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + //Adding here the terms - d (ru phi) dx; s_prop1[sj][si] = r[id.g]; s_prop2[sj][si] = h[id.g]; @@ -113,30 +178,62 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre // fill in periodic images in shared memory array // these are the BCs for rho (prop1) and enthalpy (prop2) - BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); - __syncthreads(); - fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); - rXtmp = wrk3; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); - uXtmp = uXtmp + wrk1; - __syncthreads(); - fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si); - vXtmp = vXtmp + wrk2; - __syncthreads(); - fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si); - wXtmp = wXtmp + wrk3; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); - eXtmp = eXtmp + wrk1; + + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc); + BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_prop1[sj],r,si,id);TopBCxCpy(s_prop2[sj],h,si,id); + BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_prop1[sj],r,si,id);BotBCxCpy(s_prop2[sj],h,si,id); + TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc); + } else { + TopBCxCpy(s_prop1[sj],r,si,id); TopBCxCpy(s_prop2[sj],h,si,id); + BotBCxCpy(s_prop1[sj],r,si,id); BotBCxCpy(s_prop2[sj],h,si,id); + } + } + } __syncthreads(); + + if ( boolIsothnrbc_top){ + IsothnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolIsothnrbc_bot){ + IsothnrbcX_bot(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolFreestreamnrbc_top){ + FreestreamnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolAdiabnrbc_top){ + + } else if (boolAdiabnrbc_bot){ + + } else { + + fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); // DONT FORGET TO REMOVE SYNCTHREADS FROM THESE FUNCTIONS AS IT IS PLACED IN A DIVERGENT PATH. + rXtmp = wrk3; + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk2; + fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk3; + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + } + + + rX[id.g] = rXtmp; uX[id.g] = uXtmp; vX[id.g] = vXtmp; wX[id.g] = wXtmp; eX[id.g] = eXtmp; + + __syncthreads(); + } __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, @@ -149,15 +246,17 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre int jNum = blockIdx.z; id.mkidYFlx(jNum); - int tid = id.tix + id.bdx*id.tiy; + int tid = id.tix + id.bdx*id.tiy; int si = id.tiy + stencilSize; // local i for shared memory access + halo offset int sj = id.tix; // local j for shared memory access - int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset - int sj1 = tid/id.bdy; + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + - // Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); - // id1.mkidYFlx(jNum); + // Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + // id1.mkidYFlx(jNum); myprec rYtmp=0; myprec uYtmp=0; @@ -174,9 +273,9 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre __shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2]; __shared__ myprec s_prop[mx/nDivX][my/nDivY+stencilSize*2]; __shared__ myprec s_dil[mx/nDivX][my/nDivY+stencilSize*2]; - __shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2]; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM s_u[sj][si] = u[id.g]; s_v[sj][si] = v[id.g]; s_w[sj][si] = w[id.g]; @@ -187,45 +286,93 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre // fill in periodic images in shared memory array // these are boundary conditions for u,v,w,mu and dilatation - BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum); + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id); + haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id); + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id); + haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id); + } else{ + TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id); + TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id); + + if(pRow>1) { + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id); + haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id); + } else{ + BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id); + BotBCyCpy(s_prop[sj],mu,si,id); BotBCyCpy(s_dil[sj],dil,si,id); + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id); + haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id); + + } else { + TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else { + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id); + TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id); + + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id); + BotBCyCpy(s_prop[sj],mu,si,id);BotBCyCpy(s_dil[sj],dil,si,id); + } + } + } + __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP derDevSharedV1y(&wrk1,s_u[sj1],si1); derDevSharedV1y(&wrk2,s_v[sj1],si1); derDevSharedV1y(&wrk3,s_w[sj1],si1); //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms - - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - s_tmp[sj][si] = dvdx[id.g]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dvdx[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uYtmp = ( wrk1 + s_tmp[sj1][si1] ); + __syncthreads(); + s_tmp[sj1][si1] = wrk1; __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - uYtmp = ( wrk1 + s_tmp[sj1][si1] ); - vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] ); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dudy[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP __syncthreads(); - s_tmp[sj][si] = dvdz[id.g]; - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] ); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dvdz[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP wYtmp = ( wrk3 + s_tmp[sj1][si1] ); - /*__syncthreads(); + __syncthreads(); s_tmp[sj1][si1] = wrk2; __syncthreads(); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - dvdy[id.g] = s_tmp[sj][si];*/ -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - __syncthreads(); - s_tmp[sj1][si1] = wrk3; - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - dwdy[id.g] = s_tmp[sj][si]; + dvdy[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + s_tmp[sj1][si1] = wrk3; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dwdy[id.g] = s_tmp[sj][si]; + __syncthreads(); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - //adding the viscous dissipation part duidy*mu*siy @@ -233,7 +380,7 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre //Adding here the terms d (mu) dy * syj; derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dy - uYtmp *= wrk2; + uYtmp *= wrk2; vYtmp *= wrk2; wYtmp *= wrk2; @@ -253,33 +400,94 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre vYtmp = vYtmp + s_prop[sj1][si1]*wrk2/3.0; eYtmp = eYtmp + s_prop[sj1][si1]*wrk2/3.0*s_v[sj1][si1]; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM // pressure derivatives - __syncthreads(); + __syncthreads(); s_dil[sj][si] = p[id.g]; __syncthreads(); // Boundary condition for pressure - BCyNumber2(s_dil[sj],p,id,si,my, jNum); + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_dil[sj],p,si,id); + haloBCyBot(s_dil[sj],p,si,id); + } else { + TopBCyNumber2(s_dil[sj],p,id,si,my); + BotBCyNumber2(s_dil[sj],p,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_dil[sj],p,si,id); + if(pRow>1) { + haloBCyBot(s_dil[sj],p,si,id); + } else { + BotBCyNumber2(s_dil[sj],p,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_dil[sj],p,si,id); + if(pRow>1) { + haloBCyTop(s_dil[sj],p,si,id); + } else { + TopBCyNumber2(s_dil[sj],p,id,si,my); + } + } else { + TopBCyCpy(s_dil[sj],p,si,id); + BotBCyCpy(s_dil[sj],p,si,id); + } + } + } __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP derDevShared1y(&wrk1,s_dil[sj1],si1); vYtmp = vYtmp - wrk1; // fourier terms -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); s_prop[sj][si] = lam[id.g]; s_dil[sj][si] = t[id.g]; __syncthreads(); // Boundary condition for temperature and thermal conductivity - BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum); + + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id); + + haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else { + if (jNum ==0){ + TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id); + if(pRow>1) { + haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id); + } else { + BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id); + if(pRow>1) { + haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id); + } else { + TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else { + TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id); + BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id); + } + } + } __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP derDevSharedV2y(&wrk1,s_dil[sj1],si1); eYtmp = eYtmp + wrk1*s_prop[sj1][si1]; @@ -287,17 +495,46 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre derDevSharedV1y(&wrk1,s_dil[sj1] ,si1); //wrk1 = d (t) dy eYtmp = eYtmp + wrk1*wrk2; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM //Adding here the terms - d (ru phi) dy; __syncthreads(); s_prop[sj][si] = r[id.g]; s_dil[sj][si] = h[id.g]; __syncthreads(); // Boundary condition for density and enthalpy - BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum); + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id); + haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else { + if (jNum ==0){ + TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id); + if(pRow>1) { + haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id); + } else{ + BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id); + if(pRow>1) { + haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else { + TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id); + BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id); + } + } + } __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP fluxQuadSharedy(&wrk1,s_prop[sj1],s_v[sj1],si1); rYtmp = wrk1; __syncthreads(); @@ -313,10 +550,10 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_dil[sj1],si1); eYtmp = eYtmp + wrk1; __syncthreads(); -//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. - s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ; - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. + s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM #if useStreams rY[id.g] = rYtmp; @@ -335,202 +572,503 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre } __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ, - myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , - myprec *t, myprec *p, myprec *mu, myprec *lam, - myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, - myprec *dil, myprec *dpdz) { - - Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - int kNum = blockIdx.z; - id.mkidZFlx(kNum); - - int tid = id.tix + id.bdx*id.tiy; - - int si = id.tiy + stencilSize; // local i for shared memory access + halo offset - int sj = id.tix; // local j for shared memory access - int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset - int sj1 = tid/id.bdy; - - //Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); - //id1.mkidZFlx(kNum); - - myprec rZtmp=0; - myprec uZtmp=0; - myprec vZtmp=0; - myprec wZtmp=0; - myprec eZtmp=0; - - myprec wrk1=0; - myprec wrk2=0; - myprec wrk3=0; - - //myprec tmpreg1[stencilSize*2+1]; - //myprec tmpreg2[stencilSize*2+1]; - //myprec tmpreg3[stencilSize*2+1]; - - __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2]; + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, + myprec *dil, myprec *dpdz , Communicator rk, recycle rec) { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - s_u[sj][si] = u[id.g]; - s_v[sj][si] = v[id.g]; - s_w[sj][si] = w[id.g]; - s_prop[sj][si] = mu[id.g]; - s_dil[sj][si] = dil[id.g]; - __syncthreads(); - - // fill in periodic images in shared memory array - // these are boundary conditions for u,v,w,mu and dilatation - - - BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP -////BY COMPUTATION THREAD ARRANGEMENT - derDevSharedV1z(&wrk1,s_u[sj1],si1); - derDevSharedV1z(&wrk2,s_v[sj1],si1); - derDevSharedV1z(&wrk3,s_w[sj1],si1); - //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms - -// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT. + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; + id.mkidZFlx(kNum); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - s_tmp[sj][si] = dwdx[id.g]; - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - uZtmp = ( wrk1 + s_tmp[sj1][si1] ); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + bool CellAtOutlet = ( (rk.kp == mz_tot - 1) && (id1.k == mz-1) ); + bool CellAtInlet = ( (rk.km == 0) && (id1.k == 0) ); + bool boolOutflownrbc_top = ( (outletbc==4) && CellAtOutlet ); + bool boolInflownrbc_bot = ( (inletbc ==5) && CellAtInlet ); + + myprec rZtmp=0; + myprec uZtmp=0; + myprec vZtmp=0; + myprec wZtmp=0; + myprec eZtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + //myprec tmpreg1[stencilSize*2+1]; + //myprec tmpreg2[stencilSize*2+1]; + //myprec tmpreg3[stencilSize*2+1]; + + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2]; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } + } + + } else{ + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } + } + + } else{ + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + } + } + } else{ + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id); + + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id); + } + } + } + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + ////BY COMPUTATION THREAD ARRANGEMENT + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + + // WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT. + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dwdx[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uZtmp = ( wrk1 + s_tmp[sj1][si1] ); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dwdy[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + vZtmp = ( wrk2 + s_tmp[sj1][si1] ); + wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] ); + + __syncthreads(); + s_tmp[sj1][si1] = wrk1; __syncthreads(); - s_tmp[sj][si] = dwdy[id.g]; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dudz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + s_tmp[sj1][si1] = wrk2; __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - vZtmp = ( wrk2 + s_tmp[sj1][si1] ); - wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] ); - - /*__syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dvdz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); s_tmp[sj1][si1] = wrk3; __syncthreads(); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - dwdz[id.g] = s_tmp[sj][si];*/ -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + dwdz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); - //adding the viscous dissipation part duidz*mu*siz - eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3); + //adding the viscous dissipation part duidz*mu*siz + eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3); - //Adding here the terms d (mu) dz * szj; - derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz - uZtmp *= wrk2; - vZtmp *= wrk2; - wZtmp *= wrk2; + //Adding here the terms d (mu) dz * szj; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz + uZtmp *= wrk2; + vZtmp *= wrk2; + wZtmp *= wrk2; - //viscous fluxes derivative - derDevSharedV2z(&wrk1,s_u[sj1],si1); - uZtmp = uZtmp + wrk1*s_prop[sj1][si1]; - derDevSharedV2z(&wrk1,s_v[sj1],si1); - vZtmp = vZtmp + wrk1*s_prop[sj1][si1]; - derDevSharedV2z(&wrk1,s_w[sj1],si1); - wZtmp = wZtmp + wrk1*s_prop[sj1][si1]; + //viscous fluxes derivative + derDevSharedV2z(&wrk1,s_u[sj1],si1); + uZtmp = uZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_v[sj1],si1); + vZtmp = vZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_w[sj1],si1); + wZtmp = wZtmp + wrk1*s_prop[sj1][si1]; - //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz) - eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp; + if ( boolOutflownrbc_top ) { + uZtmp = 0; // derivative of tangential stress = 0 as per NSCBC + vZtmp = 0; + } + //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz) + eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp; - //dilation derivatives - derDevSharedV1z(&wrk2,s_dil[sj1],si1); - wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0; - eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1]; + //dilation derivatives + derDevSharedV1z(&wrk2,s_dil[sj1],si1); + wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0; + eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1]; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM -//SHARED MEM LOADING OPERATION. - ID + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //SHARED MEM LOADING OPERATION. - ID - // pressure derivatives - __syncthreads(); - s_dil[sj][si] = p[id.g]; - __syncthreads(); - BCzNumber2(s_dil[sj],p,id,si,mz,kNum); - __syncthreads(); + // pressure derivatives + __syncthreads(); + s_tmp[sj][si] = p[id.g]; + __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - derDevShared1z(&wrk1,s_dil[sj1],si1); - wZtmp = wZtmp - wrk1; + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_tmp[sj],p,si,id); + haloBCzBot(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_tmp[sj],p,si,id); + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_tmp[sj],p,si,id); + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_tmp[sj],p,si,id); + haloBCzBot(s_tmp[sj],p,si,id); + } + } + } else{ + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_tmp[sj],p,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_tmp[sj],p,si,id); + } + } + } else{ + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_tmp[sj],p,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_tmp[sj],p,si,id); + } + } + } else{ + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_tmp[sj],p,si,id); + BotBCzCpy(s_tmp[sj],p,si,id); + + } + } + } + __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - // fourier terms - __syncthreads(); - s_prop[sj][si] = lam[id.g]; - s_dil[sj][si] = t[id.g]; - __syncthreads(); - // Boundary conditions for thermal conductivity and temperature - BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); - __syncthreads(); - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - - derDevSharedV2z(&wrk1,s_dil[sj1],si1); - eZtmp = eZtmp + wrk1*s_prop[sj1][si1]; - derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz - derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz - eZtmp = eZtmp + wrk1*wrk2; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevShared1z(&wrk1,s_tmp[sj1],si1); + if ( boolOutflownrbc_top || boolInflownrbc_bot) { + wrk1 = 0; // pressure derivative taken care in the nrbc computation. + } + wZtmp = wZtmp - wrk1; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - //Adding here the terms - d (ru phi) dz; - __syncthreads(); - s_prop[sj][si] = r[id.g]; - s_dil[sj][si] = h[id.g]; - __syncthreads(); - // Boundary conditions for denisty and enthalpy - BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); - __syncthreads(); - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP - fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1); - rZtmp = wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1); - uZtmp = uZtmp + wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1); - vZtmp = vZtmp + wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1); - wZtmp = wZtmp + wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1); - eZtmp = eZtmp + wrk1; - __syncthreads(); -//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. - s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ; - __syncthreads(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM - - rZ[id.g] += s_prop[sj][si]; - uZ[id.g] += s_u[sj][si]; - vZ[id.g] += s_v[sj][si]; - wZ[id.g] += s_tmp[sj][si] + *dpdz; - eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si]; - //dil[id.g] = dudx[id.g] + dvdy[id.g] + dwdz[id.g]; - -//#if useStreams -// rZ[id.g] = rZtmp; -// uZ[id.g] = uZtmp; -// vZ[id.g] = vZtmp; -// wZ[id.g] = wZtmp + *dpdz; -// eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; -//#else + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + // fourier terms + __syncthreads(); + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + // Boundary conditions for thermal conductivity and temperature + + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } + } + } else{ + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } + } + } else{ + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + } + } + } else{ + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id); + + BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id); + } + } + } + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + if ( boolOutflownrbc_top ) { + eZtmp = eZtmp + 0.0; // dqdz = 0 at the outflow as per NSCBC + } else { + derDevSharedV2z(&wrk1,s_dil[sj1],si1); + eZtmp = eZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //Adding here the terms - d (ru phi) dz; + __syncthreads(); + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary conditions for denisty and enthalpy + + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } + } + } else{ + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } + } + } else{ + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + } + } + } else{ + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id); + BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id); + } + } + } -//#endif + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + if ( boolOutflownrbc_top){ + OutflownrbcZ_top(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1); + } else if (boolInflownrbc_bot){ + InflownrbcZ_bot(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1); + } else { + fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1); + rZtmp = wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1); + uZtmp = uZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1); + vZtmp = vZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1); + wZtmp = wZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1); + eZtmp = eZtmp + wrk1; + } + + //USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. + __syncthreads(); + s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + + if (forcing) { + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] + *dpdz; + eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si]; + } else { + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] ; + eZ[id.g] += s_dil[sj][si] ; + } + //#if useStreams + // rZ[id.g] = rZtmp; + // uZ[id.g] = uZtmp; + // vZ[id.g] = vZtmp; + // wZ[id.g] = wZtmp + *dpdz; + // eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; + //#else + + + + + + //#endif __syncthreads(); } + + + + + //__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, // myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , // myprec *t, myprec *p, myprec *mu, myprec *lam, diff --git a/src/cuda_rhs_backup_0511.cu b/src/cuda_rhs_backup_0511.cu new file mode 100644 index 0000000..7d3d2e7 --- /dev/null +++ b/src/cuda_rhs_backup_0511.cu @@ -0,0 +1,554 @@ +#include "globals.h" +#include "cuda_functions.h" +#include "cuda_math.h" +#include "sponge.h" +#include "boundary_condition_x.h" +#include "boundary_condition_y.h" +#include "boundary_condition_z.h" + +__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, + myprec *dil, myprec *dpdz, int iNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + + int si = id.i + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rXtmp=0; + myprec uXtmp=0; + myprec vXtmp=0; + myprec wXtmp=0; + myprec eXtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + __shared__ myprec s_u[sPencils][mx+stencilSize*2]; + __shared__ myprec s_v[sPencils][mx+stencilSize*2]; + __shared__ myprec s_w[sPencils][mx+stencilSize*2]; + __shared__ myprec s_t[sPencils][mx+stencilSize*2]; + __shared__ myprec s_p[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_t[sj][si] = t[id.g]; + s_p[sj][si] = p[id.g]; + s_prop1[sj][si] = mu[id.g]; + s_prop2[sj][si] = lam[id.g]; + __syncthreads(); + + // Boundary conditions in the x-direction are in boundary_condition_x.h + // these are the BCs for u,v,w,p,t,mu,lambda + BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uXtmp = ( 2 * dudx[id.g] - 2./3.*dil[id.g] ); + vXtmp = ( dvdx[id.g] + dudy[id.g] ); + wXtmp = ( dwdx[id.g] + dudz[id.g] ); + + //adding the viscous dissipation part duidx*mu*six + eXtmp = s_prop1[sj][si]*(uXtmp*dudx[id.g] + vXtmp*dvdx[id.g] + wXtmp*dwdx[id.g]); + + //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); + derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx + uXtmp *= wrk2; + vXtmp *= wrk2; + wXtmp *= wrk2; + + // viscous fluxes derivative mu*d^2ui dx^2 + derDevSharedV2x(&wrk1,s_u[sj],si); + uXtmp = uXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk1,s_v[sj],si); + vXtmp = vXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk1,s_w[sj],si); + wXtmp = wXtmp + wrk1*s_prop1[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) + eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; + + //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk1*wrk2; + + // pressure and dilation derivatives + s_prop2[sj][si] = dil[id.g]; + __syncthreads(); + + // these are the BCs for the dilatation + BCxNumber2(s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk2,s_prop2[sj],si); + derDevShared1x(&wrk1 ,s_p[sj],si); + uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; + eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + + //Adding here the terms - d (ru phi) dx; + s_prop1[sj][si] = r[id.g]; + s_prop2[sj][si] = h[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are the BCs for rho (prop1) and enthalpy (prop2) + BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); + rXtmp = wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + __syncthreads(); + + rX[id.g] = rXtmp; + uX[id.g] = uXtmp; + vX[id.g] = vXtmp; + wX[id.g] = wXtmp; + eX[id.g] = eXtmp; +} + +__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz, + myprec *dil, myprec *dpdz, int jNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidYFlx(); + + int si = id.j + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rYtmp=0; + myprec uYtmp=0; + myprec vYtmp=0; + myprec wYtmp=0; + myprec eYtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + __shared__ myprec s_u[sPencils][my+stencilSize*2]; + __shared__ myprec s_v[sPencils][my+stencilSize*2]; + __shared__ myprec s_w[sPencils][my+stencilSize*2]; + __shared__ myprec s_prop[sPencils][my+stencilSize*2]; + __shared__ myprec s_dil[sPencils][my+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + __syncthreads(); + + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uYtmp = ( dudy[id.g] + dvdx[id.g] ); + vYtmp = (2 * dvdy[id.g] - 2./3.*s_dil[sj][si] ); + wYtmp = ( dwdy[id.g] + dvdz[id.g] ); + + //adding the viscous dissipation part duidy*mu*siy + eYtmp = s_prop[sj][si]*(uYtmp*dudy[id.g] + vYtmp*dvdy[id.g] + wYtmp*dvdz[id.g]); + + //Adding here the terms d (mu) dy * syj; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy + uYtmp *= wrk2; + vYtmp *= wrk2; + wYtmp *= wrk2; + + // viscous fluxes derivative + derDevSharedV2y(&wrk1,s_u[sj],si); + uYtmp = uYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_v[sj],si); + vYtmp = vYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_w[sj],si); + wYtmp = wYtmp + wrk1*s_prop[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy) + eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp; + + //dilation derivatives + derDevSharedV1y(&wrk2,s_dil[sj],si); + vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0; + eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si]; + + // pressure derivatives + s_dil[sj][si] = p[id.g]; + __syncthreads(); + + // Boundary condition for pressure + BCyNumber2(s_dil[sj],p,id,si,my); + __syncthreads(); + + derDevShared1y(&wrk1,s_dil[sj],si); + vYtmp = vYtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary condition for temperature and thermal conductivity + BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my); + __syncthreads(); + + derDevSharedV2y(&wrk1,s_dil[sj],si); + eYtmp = eYtmp + wrk1*s_prop[sj][si]; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy + derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy + eYtmp = eYtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dy; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary condition for density and enthalpy + BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my); + __syncthreads(); + fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si); + rYtmp = wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si); + uYtmp = uYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si); + vYtmp = vYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si); + wYtmp = wYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si); + eYtmp = eYtmp + wrk1; + __syncthreads(); + +#if useStreams + rY[id.g] = rYtmp; + uY[id.g] = uYtmp; + vY[id.g] = vYtmp; + wY[id.g] = wYtmp; + eY[id.g] = eYtmp; +#else + rY[id.g] += rYtmp; + uY[id.g] += uYtmp; + vY[id.g] += vYtmp; + wY[id.g] += wYtmp; + eY[id.g] += eYtmp; +#endif + __syncthreads(); +} + +__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, + myprec *dil, myprec *dpdz, int kNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + + id.mkidZFlx(kNum); + + int si = id.tix + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rZtmp=0; + myprec uZtmp=0; + myprec vZtmp=0; + myprec wZtmp=0; + myprec eZtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + myprec tmpreg1=0; + myprec tmpreg2=0; + myprec tmpreg3=0; + myprec tmpreg4=0; + myprec tmpreg5=0; + + __shared__ myprec s_u[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[sPencils][mz/nDivZ+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + tmpreg1 = dudz[id.g]; + tmpreg2 = dwdx[id.g]; + tmpreg3 = dvdz[id.g]; + tmpreg4 = dwdy[id.g]; + tmpreg5 = dwdz[id.g]; + + uZtmp = ( tmpreg1 + tmpreg2 ); + vZtmp = ( tmpreg3 + tmpreg4 ); + //__syncthreads(); // for dil - not needed as s_dil is the one updated by the same thread and hence RAW dependent + wZtmp = (2 * tmpreg5 - 2./3.*s_dil[sj][si] ); + eZtmp = s_prop[sj][si]*(uZtmp*tmpreg1 + vZtmp*tmpreg3 + wZtmp*tmpreg5); + + __syncthreads(); + BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); + __syncthreads(); + + derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dz + derDevSharedV2z(&tmpreg1,s_u[sj],si); //& will come or not? + derDevSharedV2z(&tmpreg2,s_v[sj],si); + derDevSharedV2z(&tmpreg3,s_w[sj],si); + derDevSharedV1z(&tmpreg5,s_dil[sj],si); + + uZtmp *= wrk2; + vZtmp *= wrk2; + wZtmp *= wrk2; + + tmpreg4 = s_prop[sj][si]; + + uZtmp = uZtmp + tmpreg1*tmpreg4; + vZtmp = vZtmp + tmpreg2*tmpreg4; + wZtmp = wZtmp + tmpreg3*tmpreg4; + eZtmp = eZtmp + s_u[sj][si]*uZtmp + s_v[sj][si]*vZtmp + s_w[sj][si]*wZtmp; + + wZtmp = wZtmp + tmpreg4*tmpreg5/3.0; + eZtmp = eZtmp + tmpreg4*tmpreg5/3.0*s_w[sj][si]; + + + + + + s_dil[sj][si] = p[id.g]; + __syncthreads(); + BCzNumber2(s_dil[sj],p,id,si,mz,kNum); + + __syncthreads(); + derDevShared1z(&wrk1,s_dil[sj],si); + wZtmp = wZtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary conditions for thermal conductivity and temperature + BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); + __syncthreads(); + + derDevSharedV2z(&wrk1,s_dil[sj],si); + eZtmp = eZtmp + wrk1*s_prop[sj][si]; + derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj],si); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dz; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + + // Boundary conditions for denisty and enthalpy + BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); + __syncthreads(); + + fluxQuadSharedz(&wrk1,s_prop[sj],s_w[sj],si); + rZtmp = wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_u[sj],si); + uZtmp = uZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_v[sj],si); + vZtmp = vZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_w[sj],si); + wZtmp = wZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_dil[sj],si); + eZtmp = eZtmp + wrk1; + __syncthreads(); + +#if useStreams + rZ[id.g] = rZtmp; + uZ[id.g] = uZtmp; + vZ[id.g] = vZtmp; + wZ[id.g] = wZtmp + *dpdz; + eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; +#else + rZ[id.g] += rZtmp; + uZ[id.g] += uZtmp; + vZ[id.g] += vZtmp; + wZ[id.g] += wZtmp + *dpdz; + eZ[id.g] += eZtmp + *dpdz*s_w[sj][si] ; +#endif + __syncthreads(); +} + +//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, +// myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , +// myprec *t, myprec *p, myprec *mu, myprec *lam, +// myprec *dil, myprec *dpdz) { +// +// Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); +// id.mkidX(); +// +// int si = id.i + stencilSize; // local i for shared memory access + halo offset +// int sj = id.tiy; // local j for shared memory access +// +// myprec rXtmp=0; +// myprec uXtmp=0; +// myprec vXtmp=0; +// myprec wXtmp=0; +// myprec eXtmp=0; +// +// myprec wrk1=0; +// myprec wrk2=0; +// +// __shared__ myprec s_u[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_v[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_w[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_t[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_p[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; +//#if !periodicX +// __shared__ myprec s_s0[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s4[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s8[sPencils][mx+stencilSize*2]; +//#endif +// __shared__ myprec s_dil[sPencils][mx+stencilSize*2]; +// +// s_u[sj][si] = u[id.g]; +// s_v[sj][si] = v[id.g]; +// s_w[sj][si] = w[id.g]; +// s_t[sj][si] = t[id.g]; +// s_p[sj][si] = p[id.g]; +// s_prop1[sj][si] = mu[id.g]; +// s_prop2[sj][si] = lam[id.g]; +//#if !periodicX +// s_s0[sj][si]= gij[0][id.g]; +// s_s4[sj][si]= gij[4][id.g]; +// s_s8[sj][si]= gij[8][id.g]; +//#endif +// s_dil[sj][si] = dil[id.g]; +// __syncthreads(); +// +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si); +// perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si); +// perBCx(s_prop2[sj],si); +//#else +// wallBCxMir(s_p[sj],si); +// wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si); +// wallBCxExt(s_t[sj],si,TwallTop,TwallBot); +// mlBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +// wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si); wallBCxVel(s_s8[sj],si); +//#endif +// } +// +// __syncthreads(); +// +// //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms +// uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] ); +// vXtmp = ( gij[1][id.g] + gij[3][id.g] ); +// wXtmp = ( gij[2][id.g] + gij[6][id.g] ); +// +// //adding the viscous dissipation part duidx*mu*six +// eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]); +// +// //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); +// derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx +// uXtmp *= wrk2; +// vXtmp *= wrk2; +// wXtmp *= wrk2; +// +// // viscous fluxes derivative mu*d^2ui dx^2 +// derDevSharedV2x(&wrk1,s_u[sj],si); +// uXtmp = uXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_v[sj],si); +// vXtmp = vXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_w[sj],si); +// wXtmp = wXtmp + wrk1*s_prop1[sj][si]; +// +// //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) +// eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; +// +// //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) +// derDevSharedV2x(&wrk1,s_t[sj],si); +// eXtmp = eXtmp + wrk1*s_prop2[sj][si]; +// derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx +// derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx +// eXtmp = eXtmp + wrk1*wrk2; +// +// // pressure and dilation derivatives +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_dil[sj],si); +//#else +// wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si); +//#endif +// } +// __syncthreads(); +// +// derDevSharedV1x(&wrk2,s_dil[sj],si); +// derDevShared1x(&wrk1 ,s_p[sj],si); +// uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; +// eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; +// +// //Adding here the terms - d (ru phi) dx; +// s_prop1[sj][si] = r[id.g]; +// s_prop2[sj][si] = h[id.g]; +// __syncthreads(); +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si); +//#else +// rhBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +//#endif +// } +// +// fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); +// rXtmp = wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); +// uXtmp = uXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); +// vXtmp = vXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); +// wXtmp = wXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); +// eXtmp = eXtmp + wrk1; +// __syncthreads(); +// +// rX[id.g] = rXtmp; +// uX[id.g] = uXtmp; +// vX[id.g] = vXtmp; +// wX[id.g] = wXtmp; +// eX[id.g] = eXtmp; +//} diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu index d8756d6..d34c620 100644 --- a/src/cuda_utils.cu +++ b/src/cuda_utils.cu @@ -34,8 +34,8 @@ __global__ void cpyCoefficients(myprec *tmpSx, myprec *tmpVSx) { __constant__ myprec d_dx, d_dy, d_dz, d_d2x, d_d2y, d_d2z, d_x[mx], d_xp[mx], d_dxv[mx]; -dim3 d_block[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ; -dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ; +dim3 d_grid[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ, d_gridx, gridBCw; +dim3 d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw; void copyThreadGridsToDevice(Communicator rk); void sanityCheckThreadGrids(Communicator rk); @@ -65,6 +65,8 @@ void setGPUParameters(Communicator rk) myprec *h_xp = new myprec[mx]; myprec *h_dxv = new myprec[mx]; + + myprec h_dpdz; if(forcing) h_dpdz = 0.00372; if(!forcing) h_dpdz = 0.0; @@ -79,11 +81,16 @@ void setGPUParameters(Communicator rk) h_x[i] = x[i]; h_xp[i] = xp[i]; } - h_dxv[0] = (x[1]+x[0])/2.0; +/* h_dxv[0] = (x[1]+x[0])/2.0; for (int i=1; i0){ + L[0] = ev[0]*(dp - s_r[si]*sos*du); + } else { + L[0] = L[4] ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(dw); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*du); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rXtmp = - d[0]; + fluxCubeSharedx(&wrk1,s_r,s_u,s_u,si); + *uXtmp = *uXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_v,si); + *vXtmp = *vXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_w,si); + *wXtmp = *wXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_h,si); + *eXtmp = *eXtmp + wrk1; + + +} + + +__device__ __forceinline__ __attribute__((always_inline)) void FreestreamnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + myprec dp = 0 ; + myprec du = 0 ; + myprec dv = 0 ; + myprec dw = 0 ; + myprec dr = 0 ; + + derShared1x_BD(&dp, s_p, si) ; + derShared1x_BD(&du, s_u, si) ; + derShared1x_BD(&dv, s_v, si) ; + derShared1x_BD(&dw, s_w, si) ; + derShared1x_BD(&dr, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_u[si] - sos; + ev[1] = s_u[si]; + ev[2] = s_u[si]; + ev[3] = s_u[si]; + ev[4] = s_u[si] + sos; + + myprec sig = 0.25; + myprec K = sig * (1-Ma*Ma) * sos / Lx; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model. + + myprec L[5]; + + if (ev[0]>0){ + L[0] = ev[0]*(dp - s_r[si]*sos*du); + } else { + L[0] = K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(dw); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*du); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rXtmp = - d[0]; + *uXtmp += - s_u[si]*d[0] - s_r[si]*d[2]; + *vXtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wXtmp += - s_w[si]*d[0] - s_r[si]*d[4]; + *eXtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[2] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[4] ) ; + +} + + +#endif diff --git a/src/nrbcZ.h b/src/nrbcZ.h new file mode 100644 index 0000000..a1d7481 --- /dev/null +++ b/src/nrbcZ.h @@ -0,0 +1,163 @@ + +#ifndef NRBCZ_H_ +#define NRBCZ_H_ + +extern __device__ __forceinline__ void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); +extern __device__ __forceinline__ void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); + + +__device__ __forceinline__ __attribute__((always_inline)) void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + + myprec dpf = 0 ; + myprec duf = 0 ; + myprec dvf = 0 ; + myprec dwf = 0 ; + myprec drf = 0 ; + + myprec dpb = 0 ; + myprec dub = 0 ; + myprec dvb = 0 ; + myprec dwb = 0 ; + myprec drb = 0 ; + + + derShared1z_FD(&dpf, s_p, si) ; + derShared1z_FD(&duf, s_u, si) ; + derShared1z_FD(&dvf, s_v, si) ; + derShared1z_FD(&dwf, s_w, si) ; + derShared1z_FD(&drf, s_r, si) ; + + derShared1z_BD(&dpb, s_p, si) ; + derShared1z_BD(&dub, s_u, si) ; + derShared1z_BD(&dvb, s_v, si) ; + derShared1z_BD(&dwb, s_w, si) ; + derShared1z_BD(&drb, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_w[si] - sos; + ev[1] = s_w[si] ; + ev[2] = s_w[si] ; + ev[3] = s_w[si] ; + ev[4] = s_w[si] + sos; + + myprec L[5]; + + if (ev[0]<0){ + L[0] = ev[0]*(dpf - s_r[si]*sos*dwf); + } else { + L[0] = ev[0]*(dpb - s_r[si]*sos*dwb); + } + if (ev[1]<0){ + L[1] = ev[1]*(sos*sos*drf - dpf); + } else { + L[1] = ev[1]*(sos*sos*drb - dpb); + } + if (ev[2]<0){ + L[2] = ev[2]*(dvf); + } else { + L[2] = ev[2]*(dvb); + } + if (ev[3]<0){ + L[3] = ev[3]*(duf); + } else { + L[3] = ev[3]*(dub); + } + if (ev[4]<0){ + L[4] = ev[4]*(dpf + s_r[si]*sos*dwf); + } else { + L[4] = ev[4]*(dpb + s_r[si]*sos*dwb); + } + + myprec d[5]; + + d[0] = 1/(sos*sos)*( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rZtmp = - d[0]; + *uZtmp += - s_u[si]*d[0] - s_r[si]*d[4]; + *vZtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wZtmp += - s_w[si]*d[0] - s_r[si]*d[2]; + *eZtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2] ) ; +} + + +__device__ __forceinline__ __attribute__((always_inline)) void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + myprec dp = 0 ; + myprec du = 0 ; + myprec dv = 0 ; + myprec dw = 0 ; + myprec dr = 0 ; + + derShared1z_BD(&dp, s_p, si) ; + derShared1z_BD(&du, s_u, si) ; + derShared1z_BD(&dv, s_v, si) ; + derShared1z_BD(&dw, s_w, si) ; + derShared1z_BD(&dr, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_w[si] - sos; + ev[1] = s_w[si]; + ev[2] = s_w[si]; + ev[3] = s_w[si]; + ev[4] = s_w[si] + sos; + + myprec sig = 0.25; + myprec K = sig * (1-Ma*Ma) * sos / Lz; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model. + + myprec L[5]; + + if (ev[0]>0){ + L[0] = ev[0]*(dp - s_r[si]*sos*dw); + } else { + L[0] = K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(du); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*dw); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rZtmp = - d[0]; + *uZtmp += - s_u[si]*d[0] - s_r[si]*d[4]; + *vZtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wZtmp += - s_w[si]*d[0] - s_r[si]*d[2]; + *eZtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2] ) ; + +} + + +#endif diff --git a/src/temporary.cu b/src/temporary.cu new file mode 100644 index 0000000..acada91 --- /dev/null +++ b/src/temporary.cu @@ -0,0 +1,608 @@ +#include "globals.h" +#include "cuda_functions.h" +#include "cuda_math.h" +#include "sponge.h" +#include "boundary_condition_x.h" +#include "boundary_condition_y.h" +#include "boundary_condition_z.h" + +__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, + myprec *dil, myprec *dpdz, int iNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + + int si = id.tix + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rXtmp=0; + myprec uXtmp=0; + myprec vXtmp=0; + myprec wXtmp=0; + myprec eXtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + + __shared__ myprec s_u[sPencils][mx+stencilSize*2]; + __shared__ myprec s_v[sPencils][mx+stencilSize*2]; + __shared__ myprec s_w[sPencils][mx+stencilSize*2]; + __shared__ myprec s_t[sPencils][mx+stencilSize*2]; + __shared__ myprec s_p[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_t[sj][si] = t[id.g]; + s_p[sj][si] = p[id.g]; + s_prop1[sj][si] = mu[id.g]; + s_prop2[sj][si] = lam[id.g]; + __syncthreads(); + + // Boundary conditions in the x-direction are in boundary_condition_x.h + // these are the BCs for u,v,w,p,t,mu,lambda + BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk1,s_u[sj],si); + derDevSharedV1x(&wrk2,s_v[sj],si); + derDevSharedV1x(&wrk3,s_w[sj],si); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uXtmp = ( 2 * wrk1 - 2./3.*dil[id.g] ); + vXtmp = ( wrk2 + dudy[id.g] ); + wXtmp = ( wrk3 + dudz[id.g] ); + + //adding the viscous dissipation part duidx*mu*six + eXtmp = s_prop1[sj][si]*(uXtmp*wrk1 + vXtmp*wrk2 + wXtmp*wrk3); + + //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); + derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx + uXtmp *= wrk2; + vXtmp *= wrk2; + wXtmp *= wrk2; + + // viscous fluxes derivative mu*d^2ui dx^2 + derDevSharedV2x(&wrk1,s_u[sj],si); + uXtmp = uXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk2,s_v[sj],si); + vXtmp = vXtmp + wrk2*s_prop1[sj][si]; + derDevSharedV2x(&wrk3,s_w[sj],si); + wXtmp = wXtmp + wrk3*s_prop1[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) + eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; + + //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk2*wrk3; + + // pressure and dilation derivatives + s_prop2[sj][si] = dil[id.g]; + __syncthreads(); + + // these are the BCs for the dilatation + BCxNumber2(s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk2,s_prop2[sj],si); + derDevShared1x(&wrk1 ,s_p[sj],si); + uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; + eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + + //Adding here the terms - d (ru phi) dx; + s_prop1[sj][si] = r[id.g]; + s_prop2[sj][si] = h[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are the BCs for rho (prop1) and enthalpy (prop2) + BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); + rXtmp = wrk3; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk2; + __syncthreads(); + fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk3; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + __syncthreads(); + + rX[id.g] = rXtmp; + uX[id.g] = uXtmp; + vX[id.g] = vXtmp; + wX[id.g] = wXtmp; + eX[id.g] = eXtmp; +} + +__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz, + myprec *dil, myprec *dpdz) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int jNum = blockIdx.z; + id.mkidYFlx(jNum); + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + + myprec rYtmp=0; + myprec uYtmp=0; + myprec vYtmp=0; + myprec wYtmp=0; + myprec eYtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + __shared__ myprec s_u[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_v[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_w[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_prop[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_dil[lPencils][my/nDivY+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum); + __syncthreads(); + + derDevSharedV1y(&wrk1,s_u[sj],si); + derDevSharedV1y(&wrk2,s_v[sj],si); + derDevSharedV1y(&wrk3,s_w[sj],si); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uYtmp = ( wrk1 + dvdx[id.g] ); + vYtmp = (2 * wrk2 - 2./3.*s_dil[sj][si] ); + wYtmp = ( wrk3 + dvdz[id.g] ); + + //adding the viscous dissipation part duidy*mu*siy + eYtmp = s_prop[sj][si]*(uYtmp*wrk1 + vYtmp*wrk2 + wYtmp*wrk3); + + //Adding here the terms d (mu) dy * syj; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy + uYtmp *= wrk2; + vYtmp *= wrk2; + wYtmp *= wrk2; + + // viscous fluxes derivative + derDevSharedV2y(&wrk1,s_u[sj],si); + uYtmp = uYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_v[sj],si); + vYtmp = vYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_w[sj],si); + wYtmp = wYtmp + wrk1*s_prop[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy) + eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp; + + //dilation derivatives + derDevSharedV1y(&wrk2,s_dil[sj],si); + vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0; + eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si]; + + // pressure derivatives + s_dil[sj][si] = p[id.g]; + __syncthreads(); + + // Boundary condition for pressure + BCyNumber2(s_dil[sj],p,id,si,my, jNum); + __syncthreads(); + + derDevShared1y(&wrk1,s_dil[sj],si); + vYtmp = vYtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary condition for temperature and thermal conductivity + BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum); + __syncthreads(); + + derDevSharedV2y(&wrk1,s_dil[sj],si); + eYtmp = eYtmp + wrk1*s_prop[sj][si]; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy + derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy + eYtmp = eYtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dy; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary condition for density and enthalpy + BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum); + __syncthreads(); + fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si); + rYtmp = wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si); + uYtmp = uYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si); + vYtmp = vYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si); + wYtmp = wYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si); + eYtmp = eYtmp + wrk1; + __syncthreads(); + +#if useStreams + rY[id.g] = rYtmp; + uY[id.g] = uYtmp; + vY[id.g] = vYtmp; + wY[id.g] = wYtmp; + eY[id.g] = eYtmp; +#else + rY[id.g] += rYtmp; + uY[id.g] += uYtmp; + vY[id.g] += vYtmp; + wY[id.g] += wYtmp; + eY[id.g] += eYtmp; +#endif + __syncthreads(); +} + +__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, + myprec *dil, myprec *dpdz) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; + id.mkidZFlx(kNum); + + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + myprec rZtmp=0; + myprec uZtmp=0; + myprec vZtmp=0; + myprec wZtmp=0; + myprec eZtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + //myprec tmpreg1[stencilSize*2+1]; + //myprec tmpreg2[stencilSize*2+1]; + //myprec tmpreg3[stencilSize*2+1]; + + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + + + BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP +////BY COMPUTATION THREAD ARRANGEMENT + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + +// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT. + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dwdx[id.g]; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uZtmp = ( wrk1 + s_tmp[sj1][si1] ); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dwdy[id.g]; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + vZtmp = ( wrk2 + s_tmp[sj1][si1] ); + wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] ); + + //adding the viscous dissipation part duidz*mu*siz + eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3); + + //Adding here the terms d (mu) dz * szj; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz + uZtmp *= wrk2; + vZtmp *= wrk2; + wZtmp *= wrk2; + + //viscous fluxes derivative + derDevSharedV2z(&wrk1,s_u[sj1],si1); + uZtmp = uZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_v[sj1],si1); + vZtmp = vZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_w[sj1],si1); + wZtmp = wZtmp + wrk1*s_prop[sj1][si1]; + + //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz) + eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp; + + //dilation derivatives + derDevSharedV1z(&wrk2,s_dil[sj1],si1); + wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0; + eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM +//SHARED MEM LOADING OPERATION. - ID + + // pressure derivatives + __syncthreads(); + s_dil[sj][si] = p[id.g]; + __syncthreads(); + BCzNumber2(s_dil[sj],p,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevShared1z(&wrk1,s_dil[sj1],si1); + wZtmp = wZtmp - wrk1; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + // fourier terms + __syncthreads(); + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + // Boundary conditions for thermal conductivity and temperature + BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + derDevSharedV2z(&wrk1,s_dil[sj1],si1); + eZtmp = eZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //Adding here the terms - d (ru phi) dz; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary conditions for denisty and enthalpy + BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1); + rZtmp = wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1); + uZtmp = uZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1); + vZtmp = vZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1); + wZtmp = wZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1); + eZtmp = eZtmp + wrk1; + __syncthreads(); +//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. + s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] + *dpdz; + eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si]; + + +//#if useStreams +// rZ[id.g] = rZtmp; +// uZ[id.g] = uZtmp; +// vZ[id.g] = vZtmp; +// wZ[id.g] = wZtmp + *dpdz; +// eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; +//#else + + + + + +//#endif + __syncthreads(); +} + +//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, +// myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , +// myprec *t, myprec *p, myprec *mu, myprec *lam, +// myprec *dil, myprec *dpdz) { +// +// Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); +// id.mkidX(); +// +// int si = id.i + stencilSize; // local i for shared memory access + halo offset +// int sj = id.tiy; // local j for shared memory access +// +// myprec rXtmp=0; +// myprec uXtmp=0; +// myprec vXtmp=0; +// myprec wXtmp=0; +// myprec eXtmp=0; +// +// myprec wrk1=0; +// myprec wrk2=0; +// +// __shared__ myprec s_u[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_v[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_w[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_t[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_p[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; +//#if !periodicX +// __shared__ myprec s_s0[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s4[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s8[sPencils][mx+stencilSize*2]; +//#endif +// __shared__ myprec s_dil[sPencils][mx+stencilSize*2]; +// +// s_u[sj][si] = u[id.g]; +// s_v[sj][si] = v[id.g]; +// s_w[sj][si] = w[id.g]; +// s_t[sj][si] = t[id.g]; +// s_p[sj][si] = p[id.g]; +// s_prop1[sj][si] = mu[id.g]; +// s_prop2[sj][si] = lam[id.g]; +//#if !periodicX +// s_s0[sj][si]= gij[0][id.g]; +// s_s4[sj][si]= gij[4][id.g]; +// s_s8[sj][si]= gij[8][id.g]; +//#endif +// s_dil[sj][si] = dil[id.g]; +// __syncthreads(); +// +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si); +// perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si); +// perBCx(s_prop2[sj],si); +//#else +// wallBCxMir(s_p[sj],si); +// wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si); +// wallBCxExt(s_t[sj],si,TwallTop,TwallBot); +// mlBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +// wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si); wallBCxVel(s_s8[sj],si); +//#endif +// } +// +// __syncthreads(); +// +// //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms +// uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] ); +// vXtmp = ( gij[1][id.g] + gij[3][id.g] ); +// wXtmp = ( gij[2][id.g] + gij[6][id.g] ); +// +// //adding the viscous dissipation part duidx*mu*six +// eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]); +// +// //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); +// derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx +// uXtmp *= wrk2; +// vXtmp *= wrk2; +// wXtmp *= wrk2; +// +// // viscous fluxes derivative mu*d^2ui dx^2 +// derDevSharedV2x(&wrk1,s_u[sj],si); +// uXtmp = uXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_v[sj],si); +// vXtmp = vXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_w[sj],si); +// wXtmp = wXtmp + wrk1*s_prop1[sj][si]; +// +// //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) +// eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; +// +// //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) +// derDevSharedV2x(&wrk1,s_t[sj],si); +// eXtmp = eXtmp + wrk1*s_prop2[sj][si]; +// derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx +// derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx +// eXtmp = eXtmp + wrk1*wrk2; +// +// // pressure and dilation derivatives +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_dil[sj],si); +//#else +// wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si); +//#endif +// } +// __syncthreads(); +// +// derDevSharedV1x(&wrk2,s_dil[sj],si); +// derDevShared1x(&wrk1 ,s_p[sj],si); +// uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; +// eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; +// +// //Adding here the terms - d (ru phi) dx; +// s_prop1[sj][si] = r[id.g]; +// s_prop2[sj][si] = h[id.g]; +// __syncthreads(); +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si); +//#else +// rhBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +//#endif +// } +// +// fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); +// rXtmp = wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); +// uXtmp = uXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); +// vXtmp = vXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); +// wXtmp = wXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); +// eXtmp = eXtmp + wrk1; +// __syncthreads(); +// +// rX[id.g] = rXtmp; +// uX[id.g] = uXtmp; +// vX[id.g] = vXtmp; +// wX[id.g] = wXtmp; +// eX[id.g] = eXtmp; +//}