diff --git a/.gitignore b/.gitignore index 7f02fbe..4fd798e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.ncu-rep *.bin *.txt *.o @@ -6,3 +7,4 @@ logfile *.xmf *.mp4 ns +tmp diff --git a/Makefile b/Makefile index 6485a4b..f5a4d34 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ GEN = /usr #MPI = /usr/local/openmpi-4.1.1 -CUDA = /usr/local/cuda-11.2 +CUDA = /usr/local/cuda-11.4 GPU_ARCHITECTURE = 70 @@ -38,6 +38,7 @@ INC = -I$(CUDA)/include -I$(GEN)/include $(GLB) #INC += -I$(MPI)/include LIB = -L$(CUDA)/lib64 -L$(GEN)/lib -lc -lstdc++ -lcuda -lcudart -lcudadevrt #LIB += -L$(MPI)/lib +#NVCC = $(CUDA)/bin/nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # NVCC = nvcc $(DBG) $(CPPFLAGS) -lineinfo -rdc=true # MAXREG = # --maxrregcount=82 --ptxas-options=-v ifeq ($(DBG),) @@ -104,3 +105,6 @@ $(OBJ)cuda_link.o: $(OBJ_CUDA) clean: rm -rf $(TARGET) $(OBJ)*.o + + + diff --git a/src/.cproject b/src/.cproject new file mode 100644 index 0000000..c6790cb --- /dev/null +++ b/src/.cproject @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/.project b/src/.project new file mode 100644 index 0000000..65be2a9 --- /dev/null +++ b/src/.project @@ -0,0 +1,27 @@ + + + MPI_test foler + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + diff --git a/src/.settings/language.settings.xml b/src/.settings/language.settings.xml new file mode 100644 index 0000000..4053a67 --- /dev/null +++ b/src/.settings/language.settings.xml @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/backinit.cpp b/src/backinit.cpp new file mode 100644 index 0000000..da1fdca --- /dev/null +++ b/src/backinit.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include +#include +#include "comm.h" +#include "globals.h" +#include "main.h" + +void derivGrid(double *d2f, double *df, double *f, double dx); + +void initField(int timestep, Communicator rk) { + readFileMPI('r',timestep,r,rk); + readFileMPI('u',timestep,u,rk); + readFileMPI('v',timestep,v,rk); + readFileMPI('w',timestep,w,rk); + readFileMPI('e',timestep,e,rk); + if(rk.rank==0) { + printf("finished initializing field\n"); + } +} + +void writeField(int timestep, Communicator rk) { + saveFileMPI('r',timestep,r,rk); + saveFileMPI('u',timestep,u,rk); + saveFileMPI('v',timestep,v,rk); + saveFileMPI('w',timestep,w,rk); + saveFileMPI('e',timestep,e,rk); +} + +void initGrid(Communicator rk) { + + //constant grid in y and z and stretched in x + + if (boundaryLayer) { + myprec tmp = 1/( (mxwr-1.0) - 1.0); + for (int i=0; i1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); + } else{ + TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); + BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); + + if(pRow>1) { + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); + } else{ + BotBCyNumber2(s_u[sj],u,id,si,my); BotBCyNumber2(s_v[sj],v,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); + + } else { + TopBCyNumber2(s_u[sj],u,id,si,my); TopBCyNumber2(s_v[sj],v,id,si,my); + } + } else { + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); + + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); + } + } + } + __syncthreads(); +// COMP Tile sj1 and si1 + derDevSharedV1y(&wrk1,s_u[sj1],si1); + derDevSharedV1y(&wrk2,s_v[sj1],si1); + //derDevSharedV1y(&wrk3,s_w[sj1],si1); + __syncthreads(); + s_u[sj1][si1] = wrk1; + s_v[sj1][si1] = wrk2; + // s_w[sj1][si1] = wrk3; + __syncthreads(); +//MEM tile sj si + dudy[id.g] = s_u[sj][si]; + dvdy[id.g] = s_v[sj][si]; + //dwdy[id.g] = s_w[sj][si]; + __syncthreads(); + +/* derDevV1yL(dudy,u,id); derDevV1yL(dvdy,v,id); - derDevV1yL(dwdy,w,id); + //derDevV1yL(dwdy,w,id);*/ } -__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, int kNum) { - - Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); +__global__ void derVelZ(myprec *u, myprec *v, myprec *w, myprec *dudz, myprec *dvdz, myprec *dwdz, recycle rec, Communicator rk) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; + + id.mkidZFlx(kNum); + + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + __syncthreads(); + + + + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } else { // all internal blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } + } + + } else{ + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } else { // all other blocks + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + } + } + + } else{ + BotBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,inletbc, rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } else { // all other blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + } + } + } else{ + TopBCzderVel(s_u[sj],s_v[sj],s_w[sj],u,v,w,id,si,mx,outletbc); + } + } else { + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + } + } + } - derDevV1zL(dudz,u,uInit,id,kNum); + __syncthreads(); + +// COMP Tile sj1 and si1 + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); + __syncthreads(); + s_u[sj1][si1] = wrk1; + s_v[sj1][si1] = wrk2; + s_w[sj1][si1] = wrk3; + __syncthreads(); +//MEM tile sj si + dudz[id.g] = s_u[sj][si]; + dvdz[id.g] = s_v[sj][si]; + dwdz[id.g] = s_w[sj][si]; + __syncthreads(); + /*derDevV1zL(dudz,u,uInit,id,kNum); derDevV1zL(dvdz,v,vInit,id,kNum); - derDevV1zL(dwdz,w,wInit,id,kNum); + derDevV1zL(dwdz,w,wInit,id,kNum);*/ } __global__ void derVelYBC(myprec *u, myprec *v, myprec *w, myprec *dudy, myprec *dvdy, myprec *dwdy, int direction) { @@ -112,7 +272,7 @@ void calcPressureGrad(myprec *dpdz, myprec *r, myprec *w, Communicator rk) { deviceMul<<>>(workA,r,w); hostVolumeIntegral(dpdz,workA,rk); - deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk); + deviceCalcPress<<<1,1>>>(dpdz,dpdz_prev,rbulk); //I changed here from rbulk to workA. // i CHANGED TO 1 checkCuda( cudaFree(workA) ); checkCuda( cudaFree(dpdz_prev) ); @@ -145,29 +305,33 @@ __global__ void deviceCalcDt(myprec *wrkArray, myprec *r, myprec *u, myprec *v, myprec dtViscInv = 0.0; myprec ien = e[id.g]/r[id.g] - 0.5*(u[id.g]*u[id.g] + v[id.g]*v[id.g] + w[id.g]*w[id.g]); - myprec sos = pow(gam*(gam-1)*ien,0.5); + myprec sos = pow(abs(gam*(gam-1)*ien),0.5); myprec dx,d2x; dx = d_dxv[id.i]; d2x = dx*dx; + myprec nu = mu[id.g]/r[id.g]; + dtConvInv = MAX( (abs(u[id.g]) + sos)/dx, MAX( (abs(v[id.g]) + sos)*d_dy, (abs(w[id.g]) + sos)*d_dz) ); - dtViscInv = MAX( mu[id.g]/d2x, MAX( mu[id.g]*d_d2y, mu[id.g]*d_d2z) ); + dtViscInv = MAX( nu/d2x, MAX( nu*d_d2y, nu*d_d2z) ); wrkArray[id.g] = CFL/MAX(dtConvInv, dtViscInv); __syncthreads(); } -void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Communicator rk) { +void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, myprec *dtC, myprec *dpdz, int file, int istep, Communicator rk) { cudaSetDevice(rk.nodeRank); myprec *workA, *rbulk; myprec *hostWork = (myprec*)malloc(sizeof(myprec)); + dim3 gr0 = dim3(my / sPencils, mz, 1); dim3 bl0 = dim3(mx, sPencils, 1); + checkCuda( cudaMalloc((void**)&workA,mx*my*mz*sizeof(myprec)) ); checkCuda( cudaMalloc((void**)&rbulk , sizeof(myprec)) ); @@ -198,4 +362,8 @@ void calcBulk(myprec *par1, myprec *par2, myprec *r, myprec *u, myprec *v, mypre free(hostWork); checkCuda( cudaFree(workA) ); checkCuda( cudaFree(rbulk) ); + + if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file-1) + istep ,*dtC,*dpdz); + + } diff --git a/src/comm.cpp b/src/comm.cpp index 0d29ee8..a53037b 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -19,6 +19,8 @@ myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm; myprec *senYp5,*senYm5,*senZp5,*senZm5; myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5; +MPI_Comm comm_col; + void updateHaloTest(myprec *var, Communicator rk) { int ierr; @@ -133,6 +135,30 @@ void updateHaloFive(myprec *r, myprec *u, myprec *v, myprec *w, myprec *e, Commu fillBoundariesFive(rcvYm5,rcvYp5,rcvZm5,rcvZp5,r,u,v,w,e,1,rk); } +void TransferToInlet(myprec *recy_r, myprec *recy_u,myprec *recy_v, myprec *recy_w, myprec *recy_t, Communicator rk) { + + int ierr; + MPI_Status status; + + if ( (rk.kstart <= recstn) && ( rk.kend >= recstn) ) { //The processor row consisting of recycling station co-od = [y=0:pRow-1, z=Recyc] + ierr = MPI_Send(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 0, MPI_COMM_WORLD); // A processor with [5, Recyc] will send data to [5, 0] + ierr = MPI_Send(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 1, MPI_COMM_WORLD); + ierr = MPI_Send(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 2, MPI_COMM_WORLD); + ierr = MPI_Send(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 3, MPI_COMM_WORLD); + ierr = MPI_Send(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodrecv, 4, MPI_COMM_WORLD); + } + + if (rk.kstart == 0) { //co-od = [y=0:pRow-1, z=0] + ierr = MPI_Recv(recy_r, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 0, MPI_COMM_WORLD, &status);// A processor with [8, 0] will recieve data from [8, Recyc] + ierr = MPI_Recv(recy_u, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 1, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_v, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 2, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_w, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 3, MPI_COMM_WORLD, &status); + ierr = MPI_Recv(recy_t, stencilSize*mx_tot*my, MPI_myprec, rk.coodsend, 4, MPI_COMM_WORLD, &status); + } + + ierr = MPI_Barrier(MPI_COMM_WORLD); +} + long int nameToHash(char *name, int length) { long int hash = 0; for (int i=0; imyRank(myRank); - - const int dimens[] = {pRow,pCol}; - const int periods[] = {1,1}; - int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2]; - int ierr; - - MPI_Comm comm_cart; - - ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart); - ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord); - - cZPos[0] = coord[0]; - cZNeg[0] = coord[0]; - cYPos[1] = coord[1]; - cYNeg[1] = coord[1]; - - cZPos[1] = coord[1]+1; - if(cZPos[1]>pCol-1) - cZPos[1] = 0; - - cZNeg[1] = coord[1]-1; - if(cZNeg[1]<0) - cZNeg[1] = pCol-1; - - cYPos[0] = coord[0]+1; - if(cYPos[0]>pRow-1) - cYPos[0] = 0; - - cYNeg[0] = coord[0]-1; - if(cYNeg[0]<0) - cYNeg[0] = pRow-1; + rk->myRank(myRank); + + const int dimens[] = {pRow,pCol}; + const int periods[] = {1,1}; + int coord[2],cYPos[2],cYNeg[2],cZPos[2],cZNeg[2], cZSenRR[2], cZRecRR[2], color; + int ierr; + + MPI_Comm comm_cart; + + ierr = MPI_Cart_create(MPI_COMM_WORLD, 2, dimens, periods, 0, &comm_cart); + ierr = MPI_Cart_coords(comm_cart, rk->rank, 2, coord); + + cZPos[0] = coord[0]; + cZNeg[0] = coord[0]; + cYPos[1] = coord[1]; + cYNeg[1] = coord[1]; + + cZPos[1] = coord[1]+1; + if(cZPos[1]>pCol-1) + cZPos[1] = 0; + + cZNeg[1] = coord[1]-1; + if(cZNeg[1]<0) + cZNeg[1] = pCol-1; + + cYPos[0] = coord[0]+1; + if(cYPos[0]>pRow-1) + cYPos[0] = 0; + + cYNeg[0] = coord[0]-1; + if(cYNeg[0]<0) + cYNeg[0] = pRow-1; + + ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp); + ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km); + ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp); + ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm); + + rk->jstart = coord[0]*my; + rk->jend = rk->jstart + my; + rk->kstart = coord[1]*mz; + rk->kend = rk->kstart + mz; + + int kr1,kr2; + int RecyZRank = 0; + for(int i = 0; i= recstn)) { + RecyZRank = i; + } + + } + + cZSenRR[0] = coord[0]; + cZSenRR[1] = RecyZRank; + cZRecRR[0] = coord[0]; + cZRecRR[1] = 0; + + ierr = MPI_Cart_rank(comm_cart, cZSenRR, &rk->coodsend); + ierr = MPI_Cart_rank(comm_cart, cZRecRR, &rk->coodrecv); + + + long int hash; + MPI_Comm comm_node; + int procnamesize; + char procname[MPI_MAX_PROCESSOR_NAME]; + + ierr = MPI_Get_processor_name(procname, &procnamesize); + + hash = (rk->rank)/GPUperNode ; // nameToHash(procname, procnamesize); + + ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node); + ierr = MPI_Comm_rank(comm_node, &rk->nodeRank); + + // MPI_Comm comm_col; // All rows in a particular column. Used to communicate across Y direction. [0:pRow-1, z] + color = coord[1]; // All processors in a particular column will be bundled in the communicator called comm_col; + ierr = MPI_Comm_split(MPI_COMM_WORLD, color, rk->rank, &comm_col); // refer https://mpitutorial.com/tutorials/introduction-to-groups-and-communicators/ + int rankcol; + ierr = MPI_Comm_rank(comm_col, &rankcol); + + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank); + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("SendingRank %d, Recieving Rank %d\n",rk->coodsend,rk->coodrecv); + ierr = MPI_Barrier(MPI_COMM_WORLD); + printf("Neighbours %d\n",rk -> km); + ierr = MPI_Barrier(MPI_COMM_WORLD); - ierr = MPI_Cart_rank(comm_cart, cZPos, &rk->kp); - ierr = MPI_Cart_rank(comm_cart, cZNeg, &rk->km); - ierr = MPI_Cart_rank(comm_cart, cYPos, &rk->jp); - ierr = MPI_Cart_rank(comm_cart, cYNeg, &rk->jm); - - rk->jstart = coord[0]*my; - rk->jend = rk->jstart + my; - rk->kstart = coord[1]*mz; - rk->kend = rk->kstart + mz; - - long int hash; - MPI_Comm comm_node; - int procnamesize; - char procname[MPI_MAX_PROCESSOR_NAME]; - - ierr = MPI_Get_processor_name(procname, &procnamesize); - - hash = 1; // nameToHash(procname, procnamesize); - - ierr = MPI_Comm_split(MPI_COMM_WORLD, hash, rk->rank, &comm_node); - ierr = MPI_Comm_rank(comm_node , &rk->nodeRank); - ierr = MPI_Barrier(MPI_COMM_WORLD); - printf("rank number %d, gpu number %d\n",rk->rank,rk->nodeRank); - ierr = MPI_Barrier(MPI_COMM_WORLD); } void saveFileMPI(char filename, int timestep, double *var, Communicator rk) { @@ -277,6 +335,144 @@ void readFileMPI(char filename, int timestep, double *var, Communicator rk) { var[i + (j-rk.jstart)*mx + (k-rk.kstart)*mx*my] = var_tot[i + j * mx_tot + k * mx_tot * my_tot]; delete [] var_tot; } +void readFileMPIInit(char filename, int timestep, double *r, double *u, double *v, double *w, double *ret, Communicator rk){ + double *var_tot = new double[mx_tot*my_tot*mz_tot*5]; + int ierr; + int lSize = mx_tot*my_tot*mz_tot*5; + + if(rk.rank==0) { + char str[80]; + size_t result; + sprintf(str, "fields/%c.%07d.bin",filename,timestep); + FILE *fb = fopen(str,"rb"); + result = fread(var_tot , sizeof(myprec) , lSize , fb ); + fclose(fb); + } + ierr = MPI_Barrier(MPI_COMM_WORLD); + + ierr = MPI_Bcast(var_tot, lSize, MPI_DOUBLE, 0, MPI_COMM_WORLD); + + ierr = MPI_Barrier(MPI_COMM_WORLD); + + for (int m=0; m<5; m++){ + for (int k=rk.kstart; k 279 || my+2*stencilSize>600 || mz/nDivZ+2*stencilSize>600 @@ -64,11 +64,13 @@ const int lPencils = 32; #endif #endif + extern __constant__ myprec dcoeffF[stencilSize]; extern __constant__ myprec dcoeffS[stencilSize+1]; extern __constant__ myprec dcoeffVF[stencilVisc]; extern __constant__ myprec dcoeffVS[stencilVisc+1]; + #if mx<=546 //limit on the GPU constant memory usage (655356 bytes) extern __constant__ myprec dcoeffSx[mx*(2*stencilSize+1)]; extern __constant__ myprec dcoeffVSx[mx*(2*stencilVisc+1)]; @@ -82,8 +84,8 @@ extern __device__ myprec time_on_GPU; extern __device__ Communicator rkGPU; -extern dim3 d_block[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ; -extern dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ; +extern dim3 d_grid[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ, d_gridx, gridBCw; +extern dim3 d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw; extern cudaStream_t s[8+nDivZ]; @@ -138,4 +140,12 @@ extern myprec *rcvYp,*rcvYm,*rcvZp,*rcvZm; extern myprec *senYp5,*senYm5,*senZp5,*senZm5; extern myprec *rcvYp5,*rcvYm5,*rcvZp5,*rcvZm5; +extern myprec *rm, *um, *wm, *tm; +extern myprec *rm_in, *um_in, *wm_in, *tm_in; +extern myprec *a_inpl, *b_inpl; +extern int *idxm, *idxp; +extern myprec *delta_rec, *delta_in ; +extern myprec *recy_r, *recy_u, *recy_v, *recy_w, *recy_e, *recy_h, *recy_p, *recy_t, *recy_m, *recy_l; + + #endif diff --git a/src/cuda_main.cu b/src/cuda_main.cu index 5f413c1..f031113 100644 --- a/src/cuda_main.cu +++ b/src/cuda_main.cu @@ -6,53 +6,181 @@ #include "comm.h" #include "sponge.h" - -__device__ __constant__ myprec alpha[] = {0. , -17./60., -5./12.}; +__device__ __constant__ myprec alpha[] = {0., -17./60., -5./12.}; __device__ __constant__ myprec beta[] = {8./15., 5./12., 3./4. }; cudaStream_t s[8+nDivZ]; -inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk) { +inline void calcRHS(myprec *rhsr, myprec *rhsu, myprec *rhsv, myprec *rhsw, myprec *rhse, Communicator rk, recycle rec) { + if (topbc == 3) { + BCwallCenteredTop<<>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho). + // So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at + // boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC). + } + if (bottombc == 3) { + BCwallCenteredBot<<>>(d_r,d_u,d_v,d_w,d_e); + + } + if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) calcState<<>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,0); //here 0 means interior points - derVelX<<>>(d_u,d_v,d_w,gij[0],gij[1],gij[2]); - derVelY<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5]); - for (int kNum=0; kNum>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],kNum); + // if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) + deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],gij[4],gij[8],d_dil,dpdz); + if(multiGPU) { - updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); cudaDeviceSynchronize(); + updateHaloFive(d_r,d_u,d_v,d_w,d_e,rk); + cudaDeviceSynchronize(); calcState<<>>(d_r,d_u,d_v,d_w,d_e,d_h,d_t,d_p,d_m,d_l,1); //here 1 means halo points - derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0); //here 0 means lower boundary (0-index) - derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0); //here 0 means lower boundary (0-index) - derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1); //here 1 means upper boundary (my-index) - derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1); //here 1 means upper boundary (mz-index) + // derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],0); //here 0 means lower boundary (0-index) + // derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],0); //here 0 means lower boundary (0-index) + // derVelYBC<<>>(d_u,d_v,d_w,gij[3],gij[4],gij[5],1); //here 1 means upper boundary (my-index) + // derVelZBC<<>>(d_u,d_v,d_w,gij[6],gij[7],gij[8],1); //here 1 means upper boundary (mz-index) } - cudaDeviceSynchronize(); - calcDil<<>>(d_dil,gij[0],gij[4],gij[8]); - cudaDeviceSynchronize(); - if(multiGPU) deviceBlocker<<>>(); //in order to hide the halo update with deviceRHSX (on stream s[0]) - deviceRHSX<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[0],gij[1],gij[2],gij[3],gij[6],d_dil,dpdz,0); + if(multiGPU) updateHalo(d_dil,rk); cudaDeviceSynchronize(); - deviceRHSY<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz,0); - for (int kNum=0; kNum>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz,kNum); + deviceRHSY<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[1],gij[3],gij[4],gij[5],gij[7],d_dil,dpdz); + cudaDeviceSynchronize(); + deviceRHSZ<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_h,d_t,d_p,d_m,d_l,gij[2],gij[5],gij[6],gij[7],gij[8],d_dil,dpdz, rk,rec); cudaDeviceSynchronize(); - if(boundaryLayer) addSponge<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e); + //if(boundaryLayer) addSponge<<>>(rhsr,rhsu,rhsv,rhsw,rhse,d_r,d_u,d_v,d_w,d_e); cudaDeviceSynchronize(); } -void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { +void runSimulation(int file, myprec *par1, myprec *par2, myprec *time, Communicator rk) { cudaSetDevice(rk.nodeRank); - myprec h_dt,h_dpdz; for (int istep = 0; istep < nsteps; istep++) { + if (topbc == 3) { + BCwallCenteredTop<<>>(d_r,d_u,d_v,d_w,d_e); // The values from RK3 step for the boundary nodes will be not accurate as we already have BCspecified for them and we do not solvecharacteristic equations for them (except for rho). + // So u[0] may not be 0 after RK3. Thus, we ensure the boundary values are accurate using this step. rho will be accurate at + // boundary as we solve the NS eqn for rho accurately at the wall (reflecting BC). + } + if (bottombc == 3) { + BCwallCenteredBot<<>>(d_r,d_u,d_v,d_w,d_e); + + } + cudaDeviceSynchronize(); + if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); - if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); + if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1],dtC); if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,&h_dt,&h_dpdz,file, istep, rk); + + int notanum=0; + if (h_dt != h_dt) notanum = 1; + if (notanum>0){ + if(rk.rank==0) printf("Encountered a NAN in dt\n"); + exit(1); + } + + //Recycling-Rescaling + if (inletbc == 5) { + if (( rk.kstart <= recstn) && ( rk.kend >= recstn) ) { // this section will be executed only by the processors that includes recycling stn. + if (abs(recstn - rk.kstart) < stencilSize-1) { + printf("The Recycling station is too close to the processor left boundary."); + exit(1); + } + if(istep%checkMeanRec==0) { + calcMeanRec(d_r, d_u, d_w, d_t, rm, um, wm, tm, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk); + } + cudaDeviceSynchronize(); + Recycle_Rescale(d_r, d_u, d_v, d_w, d_t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r, + recy_u, recy_v, recy_w, recy_t, a_inpl, b_inpl, idxm, idxp, delta_rec, delta_in,rk); + cudaDeviceSynchronize(); + } + if (pCol > 1) { + TransferToInlet(recy_r, recy_u, recy_v, recy_w, recy_t, rk); + cudaDeviceSynchronize(); + } + + if (pRow==1 && spanshift) { + Spanwiseshift(recy_r, recy_u, recy_v, recy_w, recy_t, rk); + cudaDeviceSynchronize(); + } + + + + + dim3 gridcalcstateRR; + gridcalcstateRR = dim3(my, stencilSize, 1); + calcStateRR<<>>(recy_r, recy_u, recy_v, recy_w, recy_e, recy_h, recy_t, recy_p, recy_m, recy_l); + } + cudaDeviceSynchronize(); + + if ((istep%100==0)){ + /*myprec *h_wminl = (myprec*)malloc(mx*sizeof(myprec)); + myprec *wminl; + checkCuda( cudaMalloc( (void**)&wminl, mx_tot*sizeof(myprec) ) ); + int gridYAvg = mx_tot; + int blockYAvg = my; + YAvg<<>>(recy_w, wminl, stencilSize-1, 0); + cudaDeviceSynchronize(); + checkCuda( cudaMemcpy(h_wminl, wminl, mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(h_wminl,mx_tot);*/ + + myprec *h_delre = (myprec*)malloc(stencilSize*sizeof(myprec)); + checkCuda( cudaMemcpy(h_delre, delta_rec, stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) ); + + myprec *h_wm = (myprec*)malloc(stencilSize*mx*sizeof(myprec)); + checkCuda( cudaMemcpy(h_wm, wm, mx*stencilSize*sizeof(myprec), cudaMemcpyDeviceToHost) ); + + myprec deltastar=0; + int kn = stencilSize-1; + myprec Uinv = 1/h_wm[mx-1 + kn*mx]; + for (int i=0; i>>(d_uO,d_r,d_u); deviceMul<<>>(d_vO,d_r,d_v); @@ -63,128 +191,190 @@ void runSimulation(myprec *par1, myprec *par2, myprec *time, Communicator rk) { //Starting the Runge-Kutta Steps //runge kutta step 1 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - eulerSum<<>>(d_r,d_rO,d_rhsr1,dtC); - eulerSum<<>>(d_e,d_eO,d_rhse1,dtC); - cudaDeviceSynchronize(); - eulerSumR<<>>(d_u,d_uO,d_rhsu1,d_r,dtC); - eulerSumR<<>>(d_v,d_vO,d_rhsv1,d_r,dtC); - eulerSumR<<>>(d_w,d_wO,d_rhsw1,d_r,dtC); + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk,rec); + + //eulerSum<<>>(d_r,d_rO,d_rhsr1,dtC); + //eulerSum<<>>(d_e,d_eO,d_rhse1,dtC); + + eulerSumAll<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,dtC); + //cudaDeviceSynchronize(); + //eulerSumR<<>>(d_u,d_uO,d_rhsu1,d_r,dtC); + //eulerSumR<<>>(d_v,d_vO,d_rhsv1,d_r,dtC); + //eulerSumR<<>>(d_w,d_wO,d_rhsw1,d_r,dtC); cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } + /* if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + }*/ //runge kutta step 2 - calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); - eulerSum3<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC); - eulerSum3<<>>(d_e,d_eO,d_rhse1,d_rhse2,dtC); - cudaDeviceSynchronize(); - eulerSum3R<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC); - eulerSum3R<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC); - eulerSum3R<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC); + calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk,rec); + //eulerSum3<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,dtC); + //eulerSum3<<>>(d_e,d_eO,d_rhse1,d_rhse2,dtC); + eulerSumAll2<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, dtC); + //cudaDeviceSynchronize(); + //eulerSum3R<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_r,dtC); + //eulerSum3R<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_r,dtC); + //eulerSum3R<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_r,dtC); cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } + /* if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + }*/ //runge kutta step 3 - calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk); - rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); - rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); - cudaDeviceSynchronize(); - rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); - rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); - rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); + calcRHS(d_rhsr3,d_rhsu3,d_rhsv3,d_rhsw3,d_rhse3,rk,rec); + // rk3final<<>>(d_r,d_rO,d_rhsr1,d_rhsr2,d_rhsr3,dtC); + // rk3final<<>>(d_e,d_eO,d_rhse1,d_rhse2,d_rhse3,dtC); + eulerSumAll3<<>>(d_r, d_rO, d_u, d_uO, d_v, d_vO, d_w, d_wO, d_e, d_eO, + d_rhsr1, d_rhsu1, d_rhsv1, d_rhsw1, d_rhse1,d_rhsr2, d_rhsu2, d_rhsv2, d_rhsw2, d_rhse2, + d_rhsr3, d_rhsu3, d_rhsv3, d_rhsw3, d_rhse3,dtC); + + // cudaDeviceSynchronize(); + // rk3finalR<<>>(d_u,d_uO,d_rhsu1,d_rhsu2,d_rhsu3,d_r,dtC); + // rk3finalR<<>>(d_v,d_vO,d_rhsv1,d_rhsv2,d_rhsv3,d_r,dtC); + // rk3finalR<<>>(d_w,d_wO,d_rhsw1,d_rhsw2,d_rhsw3,d_r,dtC); cudaDeviceSynchronize(); } } void runSimulationLowStorage(myprec *par1, myprec *par2, myprec *time, Communicator rk) { - cudaSetDevice(rk.nodeRank); - - myprec h_dt,h_dpdz; - - for (int istep = 0; istep < nsteps; istep++) { - if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); - if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); - if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); - deviceAdvanceTime<<<1,1>>>(dtC); - if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); + /* cudaSetDevice(rk.nodeRank); + + myprec h_dt,h_dpdz; + + for (int istep = 0; istep < nsteps; istep++) { + if(istep%checkCFLcondition==0) calcTimeStepPressGrad(istep,dtC,dpdz,&h_dt,&h_dpdz,rk); + if(istep>0) deviceSumOne<<<1,1>>>(&time[istep],&time[istep-1] ,dtC); + if(istep==0) deviceSumOne<<<1,1>>>(&time[istep],&time[nsteps-1],dtC); + deviceAdvanceTime<<<1,1>>>(dtC); + if(istep%checkBulk==0) calcBulk(&par1[istep],&par2[istep],d_r,d_u,d_v,d_w,d_e,rk); + + //Starting the Runge-Kutta Steps + + //runge kutta step 1 + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr1, dtC, 0); + sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu1, dtC, 0); + sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv1, dtC, 0); + sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw1, dtC, 0); + sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse1, dtC, 0); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + + if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + } + + + //runge kutta step 2 + calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr2, dtC, 1); + sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu2, dtC, 1); + sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv2, dtC, 1); + sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw2, dtC, 1); + sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse2, dtC, 1); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + + if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer + deviceCpy<<>>(d_r,d_r); + deviceCpy<<>>(d_u,d_u); + deviceCpy<<>>(d_v,d_v); + deviceCpy<<>>(d_w,d_w); + } + + //runge kutta step 3 + calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); + deviceMul<<>>(d_u,d_u,d_r); + deviceMul<<>>(d_v,d_v,d_r); + deviceMul<<>>(d_w,d_w,d_r); + sumLowStorageRK3<<>>(d_r, d_rhsr2, d_rhsr1, dtC, 2); + sumLowStorageRK3<<>>(d_u, d_rhsu2, d_rhsu1, dtC, 2); + sumLowStorageRK3<<>>(d_v, d_rhsv2, d_rhsv1, dtC, 2); + sumLowStorageRK3<<>>(d_w, d_rhsw2, d_rhsw1, dtC, 2); + sumLowStorageRK3<<>>(d_e, d_rhse2, d_rhse1, dtC, 2); + cudaStreamSynchronize(s[0]); + deviceDiv<<>>(d_u,d_u,d_r); + deviceDiv<<>>(d_v,d_v,d_r); + deviceDiv<<>>(d_w,d_w,d_r); + cudaDeviceSynchronize(); + }*/ +} +__global__ void eulerSumAll(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1 + ,myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = (r0[id.g] + rhsr1[id.g]*(*dt)/2.0); + r[id.g] = tmp1; + u[id.g] = (u0[id.g] + rhsu1[id.g]*(*dt)/2.0)/tmp1; + v[id.g] = (v0[id.g] + rhsv1[id.g]*(*dt)/2.0)/tmp1; + w[id.g] = (w0[id.g] + rhsw1[id.g]*(*dt)/2.0)/tmp1; + e[id.g] = (e0[id.g] + rhse1[id.g]*(*dt)/2.0); - //Starting the Runge-Kutta Steps +} - //runge kutta step 1 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr1, dtC, 0); - sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu1, dtC, 0); - sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv1, dtC, 0); - sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw1, dtC, 0); - sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse1, dtC, 0); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } +__global__ void eulerSumAll2(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = ( r0[id.g] + (2*rhsr2[id.g] - rhsr1[id.g])*(*dt) ); + r[id.g] = tmp1; + u[id.g] = ( u0[id.g] + (2*rhsu2[id.g] - rhsu1[id.g])*(*dt) )/tmp1; + v[id.g] = ( v0[id.g] + (2*rhsv2[id.g] - rhsv1[id.g])*(*dt) )/tmp1; + w[id.g] = ( w0[id.g] + (2*rhsw2[id.g] - rhsw1[id.g])*(*dt) )/tmp1; + e[id.g] = ( e0[id.g] + (2*rhse2[id.g] - rhse1[id.g])*(*dt) ); +} - //runge kutta step 2 - calcRHS(d_rhsr2,d_rhsu2,d_rhsv2,d_rhsw2,d_rhse2,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr1, d_rhsr2, dtC, 1); - sumLowStorageRK3<<>>(d_u, d_rhsu1, d_rhsu2, dtC, 1); - sumLowStorageRK3<<>>(d_v, d_rhsv1, d_rhsv2, dtC, 1); - sumLowStorageRK3<<>>(d_w, d_rhsw1, d_rhsw2, dtC, 1); - sumLowStorageRK3<<>>(d_e, d_rhse1, d_rhse2, dtC, 1); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - if(multiGPU) { //To initiate slowly the routines so that we have time to initiate the memory transfer - deviceCpy<<>>(d_r,d_r); - deviceCpy<<>>(d_u,d_u); - deviceCpy<<>>(d_v,d_v); - deviceCpy<<>>(d_w,d_w); } +__global__ void eulerSumAll3(myprec *r, myprec *r0,myprec *u, myprec *u0, myprec *v, myprec *v0,myprec *w, + myprec *w0, myprec *e, myprec *e0, myprec *rhsr1, myprec *rhsu1, myprec *rhsv1, myprec *rhsw1, myprec *rhse1, + myprec *rhsr2, myprec *rhsu2, myprec *rhsv2, myprec *rhsw2, myprec *rhse2, + myprec *rhsr3, myprec *rhsu3, myprec *rhsv3, myprec *rhsw3, myprec *rhse3, myprec *dt) { + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + myprec tmp1 = 0; + tmp1 = r0[id.g] + (*dt)*( rhsr1[id.g] + 4*rhsr2[id.g] + rhsr3[id.g])/6.; + r[id.g] = tmp1; + u[id.g] = ( u0[id.g] + (*dt)*( rhsu1[id.g] + 4*rhsu2[id.g] + rhsu3[id.g])/6.)/tmp1; + v[id.g] = ( v0[id.g] + (*dt)*( rhsv1[id.g] + 4*rhsv2[id.g] + rhsv3[id.g])/6.)/tmp1; + w[id.g] = ( w0[id.g] + (*dt)*( rhsw1[id.g] + 4*rhsw2[id.g] + rhsw3[id.g])/6.)/tmp1; + e[id.g] = ( e0[id.g] + (*dt)*( rhse1[id.g] + 4*rhse2[id.g] + rhse3[id.g])/6.); - //runge kutta step 3 - calcRHS(d_rhsr1,d_rhsu1,d_rhsv1,d_rhsw1,d_rhse1,rk); - deviceMul<<>>(d_u,d_u,d_r); - deviceMul<<>>(d_v,d_v,d_r); - deviceMul<<>>(d_w,d_w,d_r); - sumLowStorageRK3<<>>(d_r, d_rhsr2, d_rhsr1, dtC, 2); - sumLowStorageRK3<<>>(d_u, d_rhsu2, d_rhsu1, dtC, 2); - sumLowStorageRK3<<>>(d_v, d_rhsv2, d_rhsv1, dtC, 2); - sumLowStorageRK3<<>>(d_w, d_rhsw2, d_rhsw1, dtC, 2); - sumLowStorageRK3<<>>(d_e, d_rhse2, d_rhse1, dtC, 2); - cudaStreamSynchronize(s[0]); - deviceDiv<<>>(d_u,d_u,d_r); - deviceDiv<<>>(d_v,d_v,d_r); - deviceDiv<<>>(d_w,d_w,d_r); - cudaDeviceSynchronize(); - } } + __global__ void eulerSum(myprec *a, myprec *b, myprec *c, myprec *dt) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); a[id.g] = (b[id.g] + c[id.g]*(*dt)/2.0); @@ -217,30 +407,131 @@ __global__ void rk3finalR(myprec *a1, myprec *a2, myprec *b, myprec *c, myprec * __global__ void calcState(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam, int bc) { - int threadsPerBlock = blockDim.x * blockDim.y; - int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; - int blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; - int gl = blockNumInGrid * threadsPerBlock + threadNumInBlock; + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + int gl = id.g; + if(bc==1) { + int threadsPerBlock = blockDim.x * blockDim.y; + int threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; + int blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; + gl = blockNumInGrid * threadsPerBlock + threadNumInBlock; + gl += mx*my*mz; + } + myprec cvInv = (gam - 1.0)/Rgas; + myprec r = rho[gl]; + myprec u = uvel[gl]; + myprec v = vvel[gl]; + myprec w = wvel[gl]; + myprec etot = ret[gl]; + + myprec invrho = 1.0/r; + myprec en = etot*invrho - 0.5*(u*u + v*v + w*w); + + myprec t = cvInv*en; + myprec p = r*Rgas*t; + tem[gl] = t; + pre[gl] = p; + + ht[gl] = (etot + p)*invrho; + + myprec suth = pow(t,viscexp); + mu[gl] = suth/Re; + lam[gl] = suth/Re/Pr/Ec; + + __syncthreads(); + +} + +__global__ void calcStateRR(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret, myprec *ht, myprec *tem, myprec *pre, myprec *mu, myprec *lam) { + + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCRR(blockIdx.y); + int gl = id.g; + + myprec cv = Rgas/(gam - 1.0) ; + myprec r = rho[gl]; + myprec u = uvel[gl]; + myprec v = vvel[gl]; + myprec w = wvel[gl]; + myprec t = tem[gl]; + + myprec en = cv*t ; + + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ) ; + myprec p = r*Rgas*t; + ret[gl] = etot; + pre[gl] = p; + + ht[gl] = (etot + p)/r; + + myprec suth = pow(t,viscexp); + mu[gl] = suth/Re; + lam[gl] = suth/Re/Pr/Ec; + + __syncthreads(); + +} + +__global__ void BCwallCenteredTop(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) { + + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCwallTop(); + + + myprec u = 0.0; + myprec v = 0.0; + myprec w = 0.0; + myprec t = TwallTop; + + uvel[id.g] = u; + vvel[id.g] = v; + wvel[id.g] = w; + + myprec cv = Rgas/(gam - 1.0); + myprec r = rho[id.g]; + + myprec en = cv*t; + + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ); + + ret[id.g] = etot; - if(bc==1) gl += mx*my*mz; + __syncthreads(); - myprec cvInv = (gam - 1.0)/Rgas; +} +__global__ void BCwallCenteredBot(myprec *rho, myprec *uvel, myprec *vvel, myprec *wvel, myprec *ret) { + + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCwallBot(); + + + myprec u = 0.0; + myprec v = 0.0; + myprec w = 0.0; + myprec t = TwallBot; - myprec invrho = 1.0/rho[gl]; + uvel[id.g] = u; + vvel[id.g] = v; + wvel[id.g] = w; - myprec en = ret[gl]*invrho - 0.5*(uvel[gl]*uvel[gl] + vvel[gl]*vvel[gl] + wvel[gl]*wvel[gl]); - tem[gl] = cvInv*en; - pre[gl] = rho[gl]*Rgas*tem[gl]; - ht[gl] = (ret[gl] + pre[gl])*invrho; + myprec cv = Rgas/(gam - 1.0); + myprec r = rho[id.g]; - myprec suth = pow(tem[gl],viscexp); - mu[gl] = suth/Re; - lam[gl] = suth/Re/Pr/Ec; - __syncthreads(); + myprec en = cv*t; + + myprec etot = r*( en + 0.5*(u*u + v*v + w*w) ); + + ret[id.g] = etot; + + __syncthreads(); } + __global__ void sumLowStorageRK3(myprec *var, myprec *rhs1, myprec *rhs2, myprec *dt, int step) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); var[id.g] = var[id.g] + (*dt)*(alpha[step]*rhs1[id.g] + beta[step]*rhs2[id.g]); @@ -250,77 +541,371 @@ void calcTimeStepPressGrad(int istep, myprec *dtC, myprec *dpdz, myprec *h_dt, m cudaSetDevice(rk.nodeRank); calcTimeStep(dtC,d_r,d_u,d_v,d_w,d_e,d_m,rk); - cudaMemcpy(h_dt , dtC , sizeof(myprec), cudaMemcpyDeviceToHost); + cudaMemcpy(h_dt, dtC, sizeof(myprec), cudaMemcpyDeviceToHost); allReduceToMin(h_dt,1); mpiBarrier(); - cudaMemcpy(dtC , h_dt , sizeof(myprec), cudaMemcpyHostToDevice); + cudaMemcpy(dtC, h_dt, sizeof(myprec), cudaMemcpyHostToDevice); if(forcing) { - calcPressureGrad(dpdz,d_r,d_w,rk); + calcPressureGrad(dpdz,d_r,d_w,rk); //Changed here dpdz_new = dpdz_old - 0.5*(rw_bulk - 1 ) cudaMemcpy(h_dpdz, dpdz, sizeof(myprec), cudaMemcpyDeviceToHost); allReduceArray(h_dpdz,1); mpiBarrier(); cudaMemcpy(dpdz, h_dpdz, sizeof(myprec), cudaMemcpyHostToDevice); } - if(rk.rank==0) printf("step number %d with %le %le\n",istep,*h_dt,*h_dpdz); + //if(rk.rank==0) printf("step number %d with %le %le\n",nsteps*(file) + istep ,*h_dt,*h_dpdz); +} + +void calcMeanRec(myprec *r, myprec *u,myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in,Communicator rk) { + + myprec *hostYmean = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + + int gridYAvg = mx_tot; + int blockYAvg = my; + + for (int k = 0; k < stencilSize; k++){ + int krec = recstn - rk.kstart - (stencilSize-1) + k; // stencilSize-1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn + YAvg<<>>(r, rm, krec, k);// *2 because the next power of 2 shall be <= 2*my. Eg: 139.. next power = 256 < 278 + YAvg<<>>(u, um, krec, k); //wall normal velocity + YAvg<<>>(w, wm, krec, k); //streamwise velocity + YAvg<<>>(t, tm, krec, k); + cudaDeviceSynchronize(); + } + + checkCuda( cudaMemcpy(hostYmean, rm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); // check if mpi_barrier is needed with comm_col communicator? + checkCuda( cudaMemcpy(rm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, um, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(um, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, wm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(wm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + checkCuda( cudaMemcpy(hostYmean, tm, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyDeviceToHost) ); + allReduceSumYavg(hostYmean,stencilSize*mx_tot); + checkCuda( cudaMemcpy(tm, hostYmean, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + + deltaRR<<<1, stencilSize,0,s[0]>>>(wm, delta_rec); + interpRR<<>>(a, b, idxm, idxp, delta_rec, delta_in) ; + + free(hostYmean); } +void Recycle_Rescale(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, + myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u, + myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) { + + dim3 gridRR; + gridRR = dim3(my,stencilSize,1); + int blockRR = mx_tot; + + Recy_Resc<<>>(r, u, v, w, t, rm, um, wm, tm, rm_in, um_in, wm_in, tm_in, recy_r, recy_u, recy_v, recy_w, recy_t, + a, b, idxm, idxp, delta_rec, delta_in, rk); + + +} + + +void InletMeanUpdate(myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *delta_in, Communicator rk) { + + myprec *um_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *wm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *tm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *rm_inRR = (myprec*)malloc(stencilSize*mx_tot*sizeof(myprec)); + myprec *delta99_inRR = (myprec*)malloc(stencilSize*sizeof(myprec)); + + readFileInitBL_inRR('r',rm_inRR,rk); + readFileInitBL_inRR('u',um_inRR,rk); + readFileInitBL_inRR('w',wm_inRR,rk); + readFileInitBL_inRR('t',tm_inRR,rk); + readFileInitBL1D_inRR('d',delta99_inRR,rk); + + checkCuda( cudaMemcpy(rm_in, rm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(um_in, um_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(wm_in, wm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(tm_in, tm_inRR, stencilSize*mx_tot*sizeof(myprec), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(delta_in, delta99_inRR, stencilSize*sizeof(myprec), cudaMemcpyHostToDevice) ); + + free(um_inRR); + free(wm_inRR); + free(tm_inRR); + free(rm_inRR); + free(delta99_inRR); +} + +void Spanwiseshift(myprec *recy_r,myprec *recy_u,myprec *recy_v,myprec *recy_w,myprec *recy_t, Communicator rk){ + + dim3 grid; + grid = dim3(mx_tot, stencilSize, 1); + + spanshifting<<>>(recy_r); + spanshifting<<>>(recy_u); + spanshifting<<>>(recy_v); + spanshifting<<>>(recy_w); + spanshifting<<>>(recy_t); + +} + +__global__ void spanshifting(myprec *var){ + + int k = blockIdx.y; + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidSpanShift(k,0); + __shared__ myprec s_var[my]; + + s_var[id.tix] = var[id.g]; + __syncthreads(); + int shift = my/2; + Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id1.mkidSpanShift(k,shift); + var[id1.g] = s_var[id.tix] ; + __syncthreads(); + +} + + +__global__ void Recy_Resc(myprec *r, myprec *u, myprec *v, myprec *w, myprec *t, myprec *rm, myprec *um, myprec *wm, myprec *tm, + myprec *rm_in, myprec *um_in, myprec *wm_in, myprec *tm_in, myprec *recy_r, myprec *recy_u, + myprec *recy_v, myprec *recy_w, myprec *recy_t,myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in, Communicator rk) { + + int k = blockIdx.y; + int krec = (recstn-stencilSize+1) - rk.kstart + k; // stencilSize+1 is there becase we want k = 0 coreesponding to plane recystn-2, k=1 correspond to plane recycstn-1 and k=2 to recycstn + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCRR(krec); + + Indices id1(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id1.mkidBCRR(k); + + int id_mean = id.i + mx_tot*k ; // + + int idm = idxm[id_mean]; + int idp = idxp[id_mean]; + + myprec delratio = delta_rec[k]/delta_in[k] ; + myprec beta_rec = pow(abs(delratio),0.1); // (delta_rec / delta_in)^(1/10) -- delta_in = 1; + + __shared__ myprec s_fluc[mx_tot]; + __shared__ myprec s_mean[mx_tot]; + +/* s_mean[id.i] = um[id_mean] ; + __syncthreads(); + um_in[id_mean] = 1*( a[id_mean] * s_mean[idm] + (1.0-a[id_mean])* s_mean[idp] ) ;// not using b as I dont want um_in=0 which will happen when a=0&b=0. +*/ + + __syncthreads(); + s_fluc[id.i] = r[id.g] - rm[id_mean] ; + __syncthreads(); + myprec flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_r[id1.g] = rm_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = u[id.g] - um[id_mean] ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_u[id1.g] = um_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = v[id.g] - 0 ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_v[id1.g] = 0 + flucinlet; + + __syncthreads(); + s_fluc[id.i] = w[id.g] - wm[id_mean] ; + __syncthreads(); + flucinlet = beta_rec*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_w[id1.g] = wm_in[id_mean] + flucinlet; + + __syncthreads(); + s_fluc[id.i] = t[id.g] - tm[id_mean] ; + __syncthreads(); + flucinlet = 1*( a[id_mean] * s_fluc[idm] + b[id_mean] * s_fluc[idp] ) ; + recy_t[id1.g] = tm_in[id_mean] + flucinlet; + +} + + +__global__ void YAvg(myprec *f, myprec *fm, int krec, int kRR) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidBCRecycAvg(krec); + + unsigned int n = id.bdx; + n = findNextPowerOf2(n); // to perform the reduction fast we need an array that is of the power of 2 + + extern __shared__ myprec s_avg[] ; + + for(int i=id.tix; iid.bdx to be a power of 2. + } + } + __syncthreads(); + + for (int size = n/2; size>0; size/=2) { + if (id.tix 1) { + k = (khi+klo)/2; + if(wm[k + id.tix*mx_tot] > winf) { + khi=k; + } else { + klo=k; + } + } + + + myprec a = (wm[khi+id.tix*mx_tot] - winf)/(wm[khi+id.tix*mx_tot] - wm[klo+id.tix*mx_tot]); + + delta[id.tix] = d_x[khi] - a*(d_x[khi] - d_x[klo]) ; +} + +__global__ void interpRR(myprec *a, myprec *b, int *idxm, int *idxp, myprec *delta_rec, myprec *delta_in) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int idii = id.tix + mx_tot*id.bix ; + + __shared__ myprec s_x[mx_tot]; + s_x[id.tix] = d_x[id.tix]/delta_rec[id.bix]; + __syncthreads(); + + int khi = mx_tot-1; + int klo = 0; + int k; + + myprec xi = d_x[id.tix]/delta_in[id.bix]; //inlet x/delta + + // In the following lines of code we will find that what indices on the recycling station corresponds to the x/delta of the inlet. + // This will help in interpolating the data at the recycling station so as to be recycled to the inlet. + // Suppose we have x/delta_in = 0.35. But at the recycling end we have x/delta_re = 0.32 and 0.38. Thus, first using the while loop + // we compute the khi and klo correcponding to x=0.32 and x=0.38. Then we compute the weights as b=(0.38-0.35)/(0.38-0.32) & a=1-b; + // a and b are the weights which will be used for interpolation. + while (khi-klo > 1) { + k = (khi+klo)/2; + if( s_x[k] > xi) { // compare with the recycling station x/delta; + khi=k; + } else { + klo=k; + } + } + if (xi <= s_x[id.bdx-1]) { //id.bdx implies last value + idxp[idii] = khi ; + idxm[idii] = klo ; + + a[idii] = (s_x[khi] - xi)/(s_x[khi] - s_x[klo]) ; + b[idii] = 1.0 - a[idii]; + } else { + idxp[idii] = khi ; + idxm[idii] = klo ; + + a[idii] = 0.0 ; + b[idii] = 0.0; // This implies that for all the cells at the inlet that are beyond the x/delta_re|max, + //we do the interpolation such that u[x/deltain> x/delta_remax] = beta_rec * (b * u[idp]) where idp is khi. + //The while loop should ensure that the khi is maintained to its set value of mx_tot-1. + } +} + + + void solverWrapper(Communicator rk) { cudaSetDevice(rk.nodeRank); int start; - myprec *dpar1, *dpar2, *dtime; - myprec *hpar1 = new myprec[nsteps]; - myprec *hpar2 = new myprec[nsteps]; - myprec *htime = new myprec[nsteps]; + myprec *dpar1, *dpar2, *dtime; + myprec *hpar1 = new myprec[nsteps]; + myprec *hpar2 = new myprec[nsteps]; + myprec *htime = new myprec[nsteps]; - checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) ); - checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) ); - checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) ); + checkCuda( cudaMalloc((void**)&dpar1, nsteps*sizeof(myprec)) ); + checkCuda( cudaMalloc((void**)&dpar2, nsteps*sizeof(myprec)) ); + checkCuda( cudaMalloc((void**)&dtime, nsteps*sizeof(myprec)) ); - FILE *fp; + FILE *fp; - //check the memory usage of the GPU - checkGpuMem(rk); + //check the memory usage of the GPU + checkGpuMem(rk); + if (inletbc == 5) { + InletMeanUpdate(rm_in, um_in, wm_in, tm_in, delta_in, rk); // The means at the inflow bc (ghost cells of inflow) will remain constant for the entire simulation and hence it is set once outside the loop + } - if(restartFile<0) { - start=0; - } else { - start=restartFile; - } + if(rk.rank==5){ + FILE *fdel = fopen("delstar.txt","w+"); + fclose(fdel);} - if(rk.rank==0) fp = fopen("solution.txt","w+"); - for(int file = start+1; file>= 1; + count += 1; + } + + n = 1 << count; + return n; +} + unsigned int hostFindPreviousPowerOf2(unsigned int n) { while (n & n - 1) { @@ -305,6 +318,20 @@ unsigned int hostFindPreviousPowerOf2(unsigned int n) return n; } +unsigned int hostfindNextPowerOf2(unsigned int n) +{ + unsigned count = 0; + while(n != 0) + { + n >>= 1; + count += 1; + } + + n = 1 << count; + return n; +} + + void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk) { cudaSetDevice(rk.nodeRank); @@ -341,13 +368,12 @@ void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk) { int tot = mx*my*mz; int gr = my * mz; - int bl = mx ; - - checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) ); - cudaDeviceSynchronize(); + int bl = mx ; bl = hostFindPreviousPowerOf2(bl); + checkCuda( cudaMalloc((void**)&dwrkM ,gr*sizeof(myprec)) ); + cudaDeviceSynchronize(); integrateThreads<<>>(dwrkM, var , tot); cudaDeviceSynchronize(); reduceThreads<<< 1 , bl, bl*sizeof(myprec)>>>(dwrkM, dwrkM, gr); diff --git a/src/cuda_math.h b/src/cuda_math.h index e1aab99..c962bae 100644 --- a/src/cuda_math.h +++ b/src/cuda_math.h @@ -29,8 +29,11 @@ __global__ void reduceThreads (myprec *gOut, myprec *gArr, int arraySize); __global__ void minOfThreads (myprec *gOut, myprec *gArr, int arraySize); __global__ void maxOfThreads (myprec *gOut, myprec *gArr, int arraySize); __device__ unsigned int findPreviousPowerOf2(unsigned int n); +__device__ unsigned int findNextPowerOf2(unsigned int n); void hostReduceToMin(myprec *gOut, myprec *var, Communicator rk); void hostVolumeIntegral(myprec *gOut, myprec *var, Communicator rk); void hostVolumeAverage(myprec *gOut, myprec *var, Communicator rk); +unsigned int hostfindNextPowerOf2(unsigned int n); +unsigned int hostFindPreviousPowerOf2(unsigned int n); #endif /* CUDA_MATH_H_ */ diff --git a/src/cuda_rhs.cu b/src/cuda_rhs.cu index 0e1a015..c24122d 100644 --- a/src/cuda_rhs.cu +++ b/src/cuda_rhs.cu @@ -5,19 +5,30 @@ #include "boundary_condition_x.h" #include "boundary_condition_y.h" #include "boundary_condition_z.h" +#include "boundary.h" +#include "nrbcX.h" +#include "nrbcZ.h" + __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , myprec *t, myprec *p, myprec *mu, myprec *lam, myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, - myprec *dil, myprec *dpdz, int iNum) { + myprec *dvdy, myprec *dwdz, myprec *dil, myprec *dpdz) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidX(); + int iNum = blockIdx.z ; + id.mkidXFlx(iNum); - int si = id.i + stencilSize; // local i for shared memory access + halo offset + int si = id.tix + stencilSize; // local i for shared memory access + halo offset int sj = id.tiy; // local j for shared memory access + bool boolIsothnrbc_top = ((topbc==3) && (id.i == mx_tot-1)); + bool boolIsothnrbc_bot = ((bottombc==3) && (id.i == 0)); + bool boolFreestreamnrbc_top = ((topbc==4) && (id.i == mx_tot-1)); + bool boolAdiabnrbc_top = ((topbc==6) && (id.i == mx_tot-1)); + bool boolAdiabnrbc_bot = ((bottombc==6) && (id.i == 0)); + myprec rXtmp=0; myprec uXtmp=0; myprec vXtmp=0; @@ -26,14 +37,15 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre myprec wrk1=0; myprec wrk2=0; + myprec wrk3=0; - __shared__ myprec s_u[sPencils][mx+stencilSize*2]; - __shared__ myprec s_v[sPencils][mx+stencilSize*2]; - __shared__ myprec s_w[sPencils][mx+stencilSize*2]; - __shared__ myprec s_t[sPencils][mx+stencilSize*2]; - __shared__ myprec s_p[sPencils][mx+stencilSize*2]; - __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; - __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; + __shared__ myprec s_u[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_v[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_w[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_t[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_p[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_prop1[sPencils/2][mx/nX+stencilSize*2]; + __shared__ myprec s_prop2[sPencils/2][mx/nX+stencilSize*2]; s_u[sj][si] = u[id.g]; s_v[sj][si] = v[id.g]; @@ -46,54 +58,119 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre // Boundary conditions in the x-direction are in boundary_condition_x.h // these are the BCs for u,v,w,p,t,mu,lambda - BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc); + BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id); + TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id); + + BotBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id); + BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id); + + TopBCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],u,v,w,p,t,mu,lam,id,si,mx, topbc); + } else { + TopBCxCpy(s_u[sj],u,si,id);TopBCxCpy(s_v[sj],v,si,id);TopBCxCpy(s_w[sj],w,si,id);TopBCxCpy(s_p[sj],p,si,id); + TopBCxCpy(s_t[sj],t,si,id);TopBCxCpy(s_prop1[sj],mu,si,id);TopBCxCpy(s_prop2[sj],lam,si,id); + + BotBCxCpy(s_u[sj],u,si,id);BotBCxCpy(s_v[sj],v,si,id);BotBCxCpy(s_w[sj],w,si,id);BotBCxCpy(s_p[sj],p,si,id); + BotBCxCpy(s_t[sj],t,si,id);BotBCxCpy(s_prop1[sj],mu,si,id);BotBCxCpy(s_prop2[sj],lam,si,id); + } + } + } + __syncthreads(); + derDevSharedV1x(&wrk1,s_u[sj],si); + derDevSharedV1x(&wrk2,s_v[sj],si); + derDevSharedV1x(&wrk3,s_w[sj],si); + + dudx[id.g] = wrk1; + dvdx[id.g] = wrk2; + dwdx[id.g] = wrk3; + + rXtmp = wrk1 + dvdy[id.g] + dwdz[id.g]; // rXtmp is for rho but here it just acts acts as a temporary variable. + dil[id.g] = rXtmp; //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms - uXtmp = ( 2 * dudx[id.g] - 2./3.*dil[id.g] ); - vXtmp = ( dvdx[id.g] + dudy[id.g] ); - wXtmp = ( dwdx[id.g] + dudz[id.g] ); + uXtmp = ( 2 * wrk1 - 2./3.*rXtmp ); //rXtmp stores dilatation + vXtmp = ( wrk2 + dudy[id.g] ); + wXtmp = ( wrk3 + dudz[id.g] ); //adding the viscous dissipation part duidx*mu*six - eXtmp = s_prop1[sj][si]*(uXtmp*dudx[id.g] + vXtmp*dvdx[id.g] + wXtmp*dwdx[id.g]); + eXtmp = s_prop1[sj][si]*(uXtmp*wrk1 + vXtmp*wrk2 + wXtmp*wrk3); //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx - uXtmp *= wrk2; + uXtmp *= wrk2; vXtmp *= wrk2; wXtmp *= wrk2; // viscous fluxes derivative mu*d^2ui dx^2 derDevSharedV2x(&wrk1,s_u[sj],si); uXtmp = uXtmp + wrk1*s_prop1[sj][si]; - derDevSharedV2x(&wrk1,s_v[sj],si); - vXtmp = vXtmp + wrk1*s_prop1[sj][si]; - derDevSharedV2x(&wrk1,s_w[sj],si); - wXtmp = wXtmp + wrk1*s_prop1[sj][si]; - + derDevSharedV2x(&wrk2,s_v[sj],si); + vXtmp = vXtmp + wrk2*s_prop1[sj][si]; + derDevSharedV2x(&wrk3,s_w[sj],si); + wXtmp = wXtmp + wrk3*s_prop1[sj][si]; + + if ( boolFreestreamnrbc_top ) {// This is what the NSCBC says (to put derivative of tangential stress ==0. BUT NOTE THAT WE ARE not setting THE CROSS DERIVATIVE TERMS COMING OUT OF TAU12 AND TAU 13 I.E. mu d2u/dxdy and mu d2u/dxdz equal to zero. This is because to do it we need to edit the Y and Z kernel which will make the code complicated.; + vXtmp = 0; + wXtmp = 0; + } //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) - derDevSharedV2x(&wrk1,s_t[sj],si); - eXtmp = eXtmp + wrk1*s_prop2[sj][si]; - derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx - derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx - eXtmp = eXtmp + wrk1*wrk2; + if ( boolFreestreamnrbc_top ) { // dqdx = 0; + eXtmp = eXtmp + 0; + } else { + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk2*wrk3; + } // pressure and dilation derivatives - s_prop2[sj][si] = dil[id.g]; + __syncthreads(); + s_prop2[sj][si] = rXtmp; // rXtmp stores dilatation temporarily __syncthreads(); // these are the BCs for the dilatation - BCxNumber2(s_prop2[sj],id,si,mx); + + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc); + BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_prop2[sj],dil,si,id); + BotBCxNumber2(s_prop2[sj],dil,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_prop2[sj],dil,si,id); + TopBCxNumber2(s_prop2[sj],dil,id,si,mx, topbc); + } else { + TopBCxCpy(s_prop2[sj],dil,si,id); + BotBCxCpy(s_prop2[sj],dil,si,id); + } + } + } __syncthreads(); derDevSharedV1x(&wrk2,s_prop2[sj],si); derDevShared1x(&wrk1 ,s_p[sj],si); + + if (boolFreestreamnrbc_top){ // pressure is taken care in the nrbc + wrk1 = 0; + } uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + //Adding here the terms - d (ru phi) dx; s_prop1[sj][si] = r[id.g]; s_prop2[sj][si] = h[id.g]; @@ -101,43 +178,85 @@ __global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, mypre // fill in periodic images in shared memory array // these are the BCs for rho (prop1) and enthalpy (prop2) - BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); - __syncthreads(); - fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); - rXtmp = wrk1; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); - uXtmp = uXtmp + wrk1; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); - vXtmp = vXtmp + wrk1; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); - wXtmp = wXtmp + wrk1; - __syncthreads(); - fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); - eXtmp = eXtmp + wrk1; + + if (id.tix < stencilSize){ + if (nX ==1){ + TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc); + BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc); + } else { + if (iNum ==0){ + TopBCxCpy(s_prop1[sj],r,si,id);TopBCxCpy(s_prop2[sj],h,si,id); + BotBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, bottombc); + } else if (iNum == nX-1){ + BotBCxCpy(s_prop1[sj],r,si,id);BotBCxCpy(s_prop2[sj],h,si,id); + TopBCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],r,h,id,si,mx, topbc); + } else { + TopBCxCpy(s_prop1[sj],r,si,id); TopBCxCpy(s_prop2[sj],h,si,id); + BotBCxCpy(s_prop1[sj],r,si,id); BotBCxCpy(s_prop2[sj],h,si,id); + } + } + } __syncthreads(); + + if ( boolIsothnrbc_top){ + IsothnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolIsothnrbc_bot){ + IsothnrbcX_bot(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolFreestreamnrbc_top){ + FreestreamnrbcX_top(&rXtmp,&uXtmp,&vXtmp,&wXtmp,&eXtmp, s_prop1[sj],s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_prop2[sj], si, id); + } else if (boolAdiabnrbc_top){ + + } else if (boolAdiabnrbc_bot){ + + } else { + + fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); // DONT FORGET TO REMOVE SYNCTHREADS FROM THESE FUNCTIONS AS IT IS PLACED IN A DIVERGENT PATH. + rXtmp = wrk3; + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk2; + fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk3; + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + } + + + rX[id.g] = rXtmp; uX[id.g] = uXtmp; vX[id.g] = vXtmp; wX[id.g] = wXtmp; eX[id.g] = eXtmp; + + __syncthreads(); + } __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , myprec *t, myprec *p, myprec *mu, myprec *lam, myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz, - myprec *dil, myprec *dpdz, int jNum) { + myprec *dil, myprec *dpdz) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); - id.mkidYFlx(); + int jNum = blockIdx.z; + id.mkidYFlx(jNum); - int si = id.j + stencilSize; // local i for shared memory access + halo offset - int sj = id.tiy; // local j for shared memory access + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + + + // Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + // id1.mkidYFlx(jNum); myprec rYtmp=0; myprec uYtmp=0; @@ -147,13 +266,16 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre myprec wrk1=0; myprec wrk2=0; + myprec wrk3=0; - __shared__ myprec s_u[sPencils][my+stencilSize*2]; - __shared__ myprec s_v[sPencils][my+stencilSize*2]; - __shared__ myprec s_w[sPencils][my+stencilSize*2]; - __shared__ myprec s_prop[sPencils][my+stencilSize*2]; - __shared__ myprec s_dil[sPencils][my+stencilSize*2]; + __shared__ myprec s_u[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_prop[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_dil[mx/nDivX][my/nDivY+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][my/nDivY+stencilSize*2]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM s_u[sj][si] = u[id.g]; s_v[sj][si] = v[id.g]; s_w[sj][si] = w[id.g]; @@ -163,87 +285,275 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre // fill in periodic images in shared memory array // these are boundary conditions for u,v,w,mu and dilatation - BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id); + haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id); + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id); + haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id); + } else{ + TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id); + TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id); + + if(pRow>1) { + + haloBCyBot(s_u[sj],u,si,id); haloBCyBot(s_v[sj],v,si,id); haloBCyBot(s_w[sj],w,si,id); + haloBCyBot(s_prop[sj],mu,si,id); haloBCyBot(s_dil[sj],dil,si,id); + } else{ + BotBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id); + BotBCyCpy(s_prop[sj],mu,si,id); BotBCyCpy(s_dil[sj],dil,si,id); + if(pRow>1) { + haloBCyTop(s_u[sj],u,si,id); haloBCyTop(s_v[sj],v,si,id); haloBCyTop(s_w[sj],w,si,id); + haloBCyTop(s_prop[sj],mu,si,id); haloBCyTop(s_dil[sj],dil,si,id); + + } else { + TopBCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + } + } else { + TopBCyCpy(s_u[sj],u,si,id); TopBCyCpy(s_v[sj],v,si,id); TopBCyCpy(s_w[sj],w,si,id); + TopBCyCpy(s_prop[sj],mu,si,id); TopBCyCpy(s_dil[sj],dil,si,id); + + BotBCyCpy(s_u[sj],u,si,id); BotBCyCpy(s_v[sj],v,si,id); BotBCyCpy(s_w[sj],w,si,id); + BotBCyCpy(s_prop[sj],mu,si,id);BotBCyCpy(s_dil[sj],dil,si,id); + } + } + } + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevSharedV1y(&wrk1,s_u[sj1],si1); + derDevSharedV1y(&wrk2,s_v[sj1],si1); + derDevSharedV1y(&wrk3,s_w[sj1],si1); //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms - uYtmp = ( dudy[id.g] + dvdx[id.g] ); - vYtmp = (2 * dvdy[id.g] - 2./3.*s_dil[sj][si] ); - wYtmp = ( dwdy[id.g] + dvdz[id.g] ); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dvdx[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uYtmp = ( wrk1 + s_tmp[sj1][si1] ); + __syncthreads(); + s_tmp[sj1][si1] = wrk1; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dudy[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + vYtmp = (2 * wrk2 - 2./3.*s_dil[sj1][si1] ); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dvdz[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + wYtmp = ( wrk3 + s_tmp[sj1][si1] ); + + __syncthreads(); + s_tmp[sj1][si1] = wrk2; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dvdy[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + s_tmp[sj1][si1] = wrk3; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dwdy[id.g] = s_tmp[sj][si]; + __syncthreads(); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + //adding the viscous dissipation part duidy*mu*siy - eYtmp = s_prop[sj][si]*(uYtmp*dudy[id.g] + vYtmp*dvdy[id.g] + wYtmp*dvdz[id.g]); + eYtmp = s_prop[sj1][si1]*(uYtmp*wrk1 + vYtmp*wrk2 + wYtmp*wrk3); //Adding here the terms d (mu) dy * syj; - derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy - uYtmp *= wrk2; + derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dy + uYtmp *= wrk2; vYtmp *= wrk2; wYtmp *= wrk2; // viscous fluxes derivative - derDevSharedV2y(&wrk1,s_u[sj],si); - uYtmp = uYtmp + wrk1*s_prop[sj][si]; - derDevSharedV2y(&wrk1,s_v[sj],si); - vYtmp = vYtmp + wrk1*s_prop[sj][si]; - derDevSharedV2y(&wrk1,s_w[sj],si); - wYtmp = wYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_u[sj1],si1); + uYtmp = uYtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2y(&wrk1,s_v[sj1],si1); + vYtmp = vYtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2y(&wrk1,s_w[sj1],si1); + wYtmp = wYtmp + wrk1*s_prop[sj1][si1]; //adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy) - eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp; + eYtmp = eYtmp + s_u[sj1][si1]*uYtmp + s_v[sj1][si1]*vYtmp + s_w[sj1][si1]*wYtmp; //dilation derivatives - derDevSharedV1y(&wrk2,s_dil[sj],si); - vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0; - eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si]; + derDevSharedV1y(&wrk2,s_dil[sj1],si1); + vYtmp = vYtmp + s_prop[sj1][si1]*wrk2/3.0; + eYtmp = eYtmp + s_prop[sj1][si1]*wrk2/3.0*s_v[sj1][si1]; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM // pressure derivatives + __syncthreads(); s_dil[sj][si] = p[id.g]; __syncthreads(); // Boundary condition for pressure - BCyNumber2(s_dil[sj],p,id,si,my); + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_dil[sj],p,si,id); + haloBCyBot(s_dil[sj],p,si,id); + } else { + TopBCyNumber2(s_dil[sj],p,id,si,my); + BotBCyNumber2(s_dil[sj],p,id,si,my); + } + } else { + if (jNum ==0){ + TopBCyCpy(s_dil[sj],p,si,id); + if(pRow>1) { + haloBCyBot(s_dil[sj],p,si,id); + } else { + BotBCyNumber2(s_dil[sj],p,id,si,my); + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_dil[sj],p,si,id); + if(pRow>1) { + haloBCyTop(s_dil[sj],p,si,id); + } else { + TopBCyNumber2(s_dil[sj],p,id,si,my); + } + } else { + TopBCyCpy(s_dil[sj],p,si,id); + BotBCyCpy(s_dil[sj],p,si,id); + } + } + } __syncthreads(); - derDevShared1y(&wrk1,s_dil[sj],si); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevShared1y(&wrk1,s_dil[sj1],si1); vYtmp = vYtmp - wrk1; + // fourier terms + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); s_prop[sj][si] = lam[id.g]; s_dil[sj][si] = t[id.g]; __syncthreads(); // Boundary condition for temperature and thermal conductivity - BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my); + + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id); + + haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else { + if (jNum ==0){ + TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id); + if(pRow>1) { + haloBCyBot(s_prop[sj],lam,si,id); haloBCyBot(s_dil[sj],t,si,id); + } else { + BotBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id); + if(pRow>1) { + haloBCyTop(s_prop[sj],lam,si,id); haloBCyTop(s_dil[sj],t,si,id); + } else { + TopBCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my) ; + } + } else { + TopBCyCpy(s_prop[sj],lam,si,id); TopBCyCpy(s_dil[sj],t,si,id); + BotBCyCpy(s_prop[sj],lam,si,id); BotBCyCpy(s_dil[sj],t,si,id); + } + } + } __syncthreads(); - derDevSharedV2y(&wrk1,s_dil[sj],si); - eYtmp = eYtmp + wrk1*s_prop[sj][si]; - derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy - derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + derDevSharedV2y(&wrk1,s_dil[sj1],si1); + eYtmp = eYtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV1y(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dy + derDevSharedV1y(&wrk1,s_dil[sj1] ,si1); //wrk1 = d (t) dy eYtmp = eYtmp + wrk1*wrk2; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM //Adding here the terms - d (ru phi) dy; + __syncthreads(); s_prop[sj][si] = r[id.g]; s_dil[sj][si] = h[id.g]; __syncthreads(); // Boundary condition for density and enthalpy - BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my); + if (id.tiy < stencilSize){ + if (nDivY ==1){ + if(pRow>1) { + haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id); + haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else { + if (jNum ==0){ + TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id); + if(pRow>1) { + haloBCyBot(s_prop[sj],r,si,id); haloBCyBot(s_dil[sj],h,si,id); + } else{ + BotBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else if (jNum == nDivY-1){ + BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id); + if(pRow>1) { + haloBCyTop(s_prop[sj],r,si,id); haloBCyTop(s_dil[sj],h,si,id); + } else{ + TopBCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my) ; + } + } else { + TopBCyCpy(s_prop[sj],r,si,id); TopBCyCpy(s_dil[sj],h,si,id); + BotBCyCpy(s_prop[sj],r,si,id); BotBCyCpy(s_dil[sj],h,si,id); + } + } + } __syncthreads(); - fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + fluxQuadSharedy(&wrk1,s_prop[sj1],s_v[sj1],si1); rYtmp = wrk1; __syncthreads(); - fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si); + fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_u[sj1],si1); uYtmp = uYtmp + wrk1; __syncthreads(); - fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si); + fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_v[sj1],si1); vYtmp = vYtmp + wrk1; __syncthreads(); - fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si); + fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_w[sj1],si1); wYtmp = wYtmp + wrk1; __syncthreads(); - fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si); + fluxCubeSharedy(&wrk1,s_prop[sj1],s_v[sj1],s_dil[sj1],si1); eYtmp = eYtmp + wrk1; __syncthreads(); + //USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. + s_prop[sj1][si1] = rYtmp ; s_u[sj1][si1] = uYtmp ; s_v[sj1][si1] = vYtmp ; s_w[sj1][si1] = wYtmp ; s_dil[sj1][si1] = eYtmp ; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM #if useStreams rY[id.g] = rYtmp; @@ -252,11 +562,11 @@ __global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, mypre wY[id.g] = wYtmp; eY[id.g] = eYtmp; #else - rY[id.g] += rYtmp; - uY[id.g] += uYtmp; - vY[id.g] += vYtmp; - wY[id.g] += wYtmp; - eY[id.g] += eYtmp; + rY[id.g] += s_prop[sj][si]; + uY[id.g] += s_u[sj][si]; + vY[id.g] += s_v[sj][si]; + wY[id.g] += s_w[sj][si]; + eY[id.g] += s_dil[sj][si]; #endif __syncthreads(); } @@ -265,13 +575,27 @@ __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, mypre myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , myprec *t, myprec *p, myprec *mu, myprec *lam, myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, - myprec *dil, myprec *dpdz, int kNum) { + myprec *dil, myprec *dpdz , Communicator rk, recycle rec) { Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; id.mkidZFlx(kNum); - int si = id.tix + stencilSize; // local i for shared memory access + halo offset - int sj = id.tiy; // local j for shared memory access + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + bool CellAtOutlet = ( (rk.kp == mz_tot - 1) && (id1.k == mz-1) ); + bool CellAtInlet = ( (rk.km == 0) && (id1.k == 0) ); + bool boolOutflownrbc_top = ( (outletbc==4) && CellAtOutlet ); + bool boolInflownrbc_bot = ( (inletbc ==5) && CellAtInlet ); myprec rZtmp=0; myprec uZtmp=0; @@ -281,13 +605,20 @@ __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, mypre myprec wrk1=0; myprec wrk2=0; + myprec wrk3=0; + + //myprec tmpreg1[stencilSize*2+1]; + //myprec tmpreg2[stencilSize*2+1]; + //myprec tmpreg3[stencilSize*2+1]; - __shared__ myprec s_u[sPencils][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_v[sPencils][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_w[sPencils][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_prop[sPencils][mz/nDivZ+stencilSize*2]; - __shared__ myprec s_dil[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM s_u[sj][si] = u[id.g]; s_v[sj][si] = v[id.g]; s_w[sj][si] = w[id.g]; @@ -297,104 +628,447 @@ __global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, mypre // fill in periodic images in shared memory array // these are boundary conditions for u,v,w,mu and dilatation - BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } + } + + } else{ + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_u[sj],u,si,id); haloBCzBot(s_v[sj],v,si,id); haloBCzBot(s_w[sj],w,si,id); + haloBCzBot(s_prop[sj],mu,si,id); haloBCzBot(s_dil[sj],dil,si,id); + } + } + + } else{ + BotBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_u[sj],u,si,id); haloBCzTop(s_v[sj],v,si,id); haloBCzTop(s_w[sj],w,si,id); + haloBCzTop(s_prop[sj],mu,si,id); haloBCzTop(s_dil[sj],dil,si,id); + } + } + } else{ + TopBCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_u[sj],u,si,id); TopBCzCpy(s_v[sj],v,si,id); TopBCzCpy(s_w[sj],w,si,id); + TopBCzCpy(s_prop[sj],mu,si,id); TopBCzCpy(s_dil[sj],dil,si,id); + + BotBCzCpy(s_u[sj],u,si,id); BotBCzCpy(s_v[sj],v,si,id); BotBCzCpy(s_w[sj],w,si,id); + BotBCzCpy(s_prop[sj],mu,si,id); BotBCzCpy(s_dil[sj],dil,si,id); + } + } + } __syncthreads(); - + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + ////BY COMPUTATION THREAD ARRANGEMENT + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms - uZtmp = ( dudz[id.g] + dwdx[id.g] ); - vZtmp = ( dvdz[id.g] + dwdy[id.g] ); - wZtmp = (2 * dwdz[id.g] - 2./3.*s_dil[sj][si] ); + + // WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT. + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dwdx[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uZtmp = ( wrk1 + s_tmp[sj1][si1] ); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dwdy[id.g]; + __syncthreads(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + vZtmp = ( wrk2 + s_tmp[sj1][si1] ); + wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] ); + + __syncthreads(); + s_tmp[sj1][si1] = wrk1; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dudz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + s_tmp[sj1][si1] = wrk2; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dvdz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + __syncthreads(); + s_tmp[sj1][si1] = wrk3; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + dwdz[id.g] = s_tmp[sj][si]; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + __syncthreads(); + //adding the viscous dissipation part duidz*mu*siz - eZtmp = s_prop[sj][si]*(uZtmp*dudz[id.g] + vZtmp*dvdz[id.g] + wZtmp*dwdz[id.g]); + eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3); //Adding here the terms d (mu) dz * szj; - derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dz - uZtmp *= wrk2; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz + uZtmp *= wrk2; vZtmp *= wrk2; wZtmp *= wrk2; //viscous fluxes derivative - derDevSharedV2z(&wrk1,s_u[sj],si); - uZtmp = uZtmp + wrk1*s_prop[sj][si]; - derDevSharedV2z(&wrk1,s_v[sj],si); - vZtmp = vZtmp + wrk1*s_prop[sj][si]; - derDevSharedV2z(&wrk1,s_w[sj],si); - wZtmp = wZtmp + wrk1*s_prop[sj][si]; - + derDevSharedV2z(&wrk1,s_u[sj1],si1); + uZtmp = uZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_v[sj1],si1); + vZtmp = vZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_w[sj1],si1); + wZtmp = wZtmp + wrk1*s_prop[sj1][si1]; + + if ( boolOutflownrbc_top ) { + uZtmp = 0; // derivative of tangential stress = 0 as per NSCBC + vZtmp = 0; + } //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz) - eZtmp = eZtmp + s_u[sj][si]*uZtmp + s_v[sj][si]*vZtmp + s_w[sj][si]*wZtmp; + eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp; //dilation derivatives - derDevSharedV1z(&wrk2,s_dil[sj],si); - wZtmp = wZtmp + s_prop[sj][si]*wrk2/3.0; - eZtmp = eZtmp + s_prop[sj][si]*wrk2/3.0*s_w[sj][si]; + derDevSharedV1z(&wrk2,s_dil[sj1],si1); + wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0; + eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1]; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //SHARED MEM LOADING OPERATION. - ID // pressure derivatives - s_dil[sj][si] = p[id.g]; __syncthreads(); - BCzNumber2(s_dil[sj],p,id,si,mz,kNum); + s_tmp[sj][si] = p[id.g]; + __syncthreads(); + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_tmp[sj],p,si,id); + haloBCzBot(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_tmp[sj],p,si,id); + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_tmp[sj],p,si,id); + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_tmp[sj],p,si,id); + haloBCzBot(s_tmp[sj],p,si,id); + } + } + } else{ + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_tmp[sj],p,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_tmp[sj],p,si,id); + } + } + } else{ + BotBCzNumber2(s_tmp[sj],p,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_tmp[sj],p,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_tmp[sj],p,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_tmp[sj],p,si,id); + } + } + } else{ + TopBCzNumber2(s_tmp[sj],p,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_tmp[sj],p,si,id); + BotBCzCpy(s_tmp[sj],p,si,id); + + } + } + } __syncthreads(); - derDevShared1z(&wrk1,s_dil[sj],si); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevShared1z(&wrk1,s_tmp[sj1],si1); + if ( boolOutflownrbc_top || boolInflownrbc_bot) { + wrk1 = 0; // pressure derivative taken care in the nrbc computation. + } wZtmp = wZtmp - wrk1; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM // fourier terms + __syncthreads(); s_prop[sj][si] = lam[id.g]; s_dil[sj][si] = t[id.g]; __syncthreads(); - // Boundary conditions for thermal conductivity and temperature - BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); + + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } + } + } else{ + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_prop[sj],lam,si,id); haloBCzBot(s_dil[sj],t,si,id); + } + } + } else{ + BotBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_prop[sj],lam,si,id); haloBCzTop(s_dil[sj],t,si,id); + } + } + } else{ + TopBCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_prop[sj],lam,si,id); TopBCzCpy(s_dil[sj],t,si,id); + + BotBCzCpy(s_prop[sj],lam,si,id); BotBCzCpy(s_dil[sj],t,si,id); + } + } + } + __syncthreads(); - derDevSharedV2z(&wrk1,s_dil[sj],si); - eZtmp = eZtmp + wrk1*s_prop[sj][si]; - derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dz - derDevSharedV1z(&wrk1,s_dil[sj],si); //wrk1 = d (t) dz - eZtmp = eZtmp + wrk1*wrk2; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + if ( boolOutflownrbc_top ) { + eZtmp = eZtmp + 0.0; // dqdz = 0 at the outflow as per NSCBC + } else { + derDevSharedV2z(&wrk1,s_dil[sj1],si1); + eZtmp = eZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM //Adding here the terms - d (ru phi) dz; + __syncthreads(); s_prop[sj][si] = r[id.g]; s_dil[sj][si] = h[id.g]; __syncthreads(); - // Boundary conditions for denisty and enthalpy - BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); - __syncthreads(); - fluxQuadSharedz(&wrk1,s_prop[sj],s_w[sj],si); - rZtmp = wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_u[sj],si); - uZtmp = uZtmp + wrk1; - __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_v[sj],si); - vZtmp = vZtmp + wrk1; + if (id.tiy < stencilSize){ + if (nDivZ ==1){ + if(pCol > 1) { // because even if its multi gpu but pcol == 1 then we can use directly TopBCzNumber 1 and Bot.. + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol-2){ // last block + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } else if (rk.kp == 1) {// first block + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } else { // all internal blocks + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } + } + } else{ + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } + } else { + if (kNum ==0){ + TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.kp == 1){ // first block + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } else { // all other blocks + haloBCzBot(s_prop[sj],r,si,id); haloBCzBot(s_dil[sj],h,si,id); + } + } + } else{ + BotBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,inletbc,rec); + } + } else if (kNum == nDivZ-1){ + BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id); + if(pCol > 1) { + if (inletbc == 1 || outletbc ==1 ){ // if periodic + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + } else { // bc is something else then periodic. Because periodic is taken care of by the halo exchange(neighbour of last is first) + if (rk.km == pCol -2 ){ // last block + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } else { // all other blocks + haloBCzTop(s_prop[sj],r,si,id); haloBCzTop(s_dil[sj],h,si,id); + } + } + } else{ + TopBCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,outletbc); + } + } else { + TopBCzCpy(s_prop[sj],r,si,id); TopBCzCpy(s_dil[sj],h,si,id); + BotBCzCpy(s_prop[sj],r,si,id); BotBCzCpy(s_dil[sj],h,si,id); + } + } + } + __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_w[sj],si); - wZtmp = wZtmp + wrk1; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + if ( boolOutflownrbc_top){ + OutflownrbcZ_top(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1); + } else if (boolInflownrbc_bot){ + InflownrbcZ_bot(&rZtmp,&uZtmp,&vZtmp,&wZtmp,&eZtmp, s_prop[sj1],s_u[sj1],s_v[sj1],s_w[sj1],s_tmp[sj1],s_dil[sj1], si1, id1); + } else { + fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1); + rZtmp = wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1); + uZtmp = uZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1); + vZtmp = vZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1); + wZtmp = wZtmp + wrk1; + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1); + eZtmp = eZtmp + wrk1; + } + + //USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. __syncthreads(); - fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_dil[sj],si); - eZtmp = eZtmp + wrk1; + s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ; __syncthreads(); - -#if useStreams - rZ[id.g] = rZtmp; - uZ[id.g] = uZtmp; - vZ[id.g] = vZtmp; - wZ[id.g] = wZtmp + *dpdz; - eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; -#else - rZ[id.g] += rZtmp; - uZ[id.g] += uZtmp; - vZ[id.g] += vZtmp; - wZ[id.g] += wZtmp + *dpdz; - eZ[id.g] += eZtmp + *dpdz*s_w[sj][si] ; -#endif + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + + if (forcing) { + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] + *dpdz; + eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si]; + } else { + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] ; + eZ[id.g] += s_dil[sj][si] ; + } + //#if useStreams + // rZ[id.g] = rZtmp; + // uZ[id.g] = uZtmp; + // vZ[id.g] = vZtmp; + // wZ[id.g] = wZtmp + *dpdz; + // eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; + //#else + + + + + + //#endif __syncthreads(); } + + + + + //__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, // myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , // myprec *t, myprec *p, myprec *mu, myprec *lam, diff --git a/src/cuda_rhs_backup_0511.cu b/src/cuda_rhs_backup_0511.cu new file mode 100644 index 0000000..7d3d2e7 --- /dev/null +++ b/src/cuda_rhs_backup_0511.cu @@ -0,0 +1,554 @@ +#include "globals.h" +#include "cuda_functions.h" +#include "cuda_math.h" +#include "sponge.h" +#include "boundary_condition_x.h" +#include "boundary_condition_y.h" +#include "boundary_condition_z.h" + +__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, + myprec *dil, myprec *dpdz, int iNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + + int si = id.i + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rXtmp=0; + myprec uXtmp=0; + myprec vXtmp=0; + myprec wXtmp=0; + myprec eXtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + __shared__ myprec s_u[sPencils][mx+stencilSize*2]; + __shared__ myprec s_v[sPencils][mx+stencilSize*2]; + __shared__ myprec s_w[sPencils][mx+stencilSize*2]; + __shared__ myprec s_t[sPencils][mx+stencilSize*2]; + __shared__ myprec s_p[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_t[sj][si] = t[id.g]; + s_p[sj][si] = p[id.g]; + s_prop1[sj][si] = mu[id.g]; + s_prop2[sj][si] = lam[id.g]; + __syncthreads(); + + // Boundary conditions in the x-direction are in boundary_condition_x.h + // these are the BCs for u,v,w,p,t,mu,lambda + BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uXtmp = ( 2 * dudx[id.g] - 2./3.*dil[id.g] ); + vXtmp = ( dvdx[id.g] + dudy[id.g] ); + wXtmp = ( dwdx[id.g] + dudz[id.g] ); + + //adding the viscous dissipation part duidx*mu*six + eXtmp = s_prop1[sj][si]*(uXtmp*dudx[id.g] + vXtmp*dvdx[id.g] + wXtmp*dwdx[id.g]); + + //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); + derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx + uXtmp *= wrk2; + vXtmp *= wrk2; + wXtmp *= wrk2; + + // viscous fluxes derivative mu*d^2ui dx^2 + derDevSharedV2x(&wrk1,s_u[sj],si); + uXtmp = uXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk1,s_v[sj],si); + vXtmp = vXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk1,s_w[sj],si); + wXtmp = wXtmp + wrk1*s_prop1[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) + eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; + + //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk1*wrk2; + + // pressure and dilation derivatives + s_prop2[sj][si] = dil[id.g]; + __syncthreads(); + + // these are the BCs for the dilatation + BCxNumber2(s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk2,s_prop2[sj],si); + derDevShared1x(&wrk1 ,s_p[sj],si); + uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; + eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + + //Adding here the terms - d (ru phi) dx; + s_prop1[sj][si] = r[id.g]; + s_prop2[sj][si] = h[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are the BCs for rho (prop1) and enthalpy (prop2) + BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); + rXtmp = wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + __syncthreads(); + + rX[id.g] = rXtmp; + uX[id.g] = uXtmp; + vX[id.g] = vXtmp; + wX[id.g] = wXtmp; + eX[id.g] = eXtmp; +} + +__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz, + myprec *dil, myprec *dpdz, int jNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidYFlx(); + + int si = id.j + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rYtmp=0; + myprec uYtmp=0; + myprec vYtmp=0; + myprec wYtmp=0; + myprec eYtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + __shared__ myprec s_u[sPencils][my+stencilSize*2]; + __shared__ myprec s_v[sPencils][my+stencilSize*2]; + __shared__ myprec s_w[sPencils][my+stencilSize*2]; + __shared__ myprec s_prop[sPencils][my+stencilSize*2]; + __shared__ myprec s_dil[sPencils][my+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my); + __syncthreads(); + + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uYtmp = ( dudy[id.g] + dvdx[id.g] ); + vYtmp = (2 * dvdy[id.g] - 2./3.*s_dil[sj][si] ); + wYtmp = ( dwdy[id.g] + dvdz[id.g] ); + + //adding the viscous dissipation part duidy*mu*siy + eYtmp = s_prop[sj][si]*(uYtmp*dudy[id.g] + vYtmp*dvdy[id.g] + wYtmp*dvdz[id.g]); + + //Adding here the terms d (mu) dy * syj; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy + uYtmp *= wrk2; + vYtmp *= wrk2; + wYtmp *= wrk2; + + // viscous fluxes derivative + derDevSharedV2y(&wrk1,s_u[sj],si); + uYtmp = uYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_v[sj],si); + vYtmp = vYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_w[sj],si); + wYtmp = wYtmp + wrk1*s_prop[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy) + eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp; + + //dilation derivatives + derDevSharedV1y(&wrk2,s_dil[sj],si); + vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0; + eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si]; + + // pressure derivatives + s_dil[sj][si] = p[id.g]; + __syncthreads(); + + // Boundary condition for pressure + BCyNumber2(s_dil[sj],p,id,si,my); + __syncthreads(); + + derDevShared1y(&wrk1,s_dil[sj],si); + vYtmp = vYtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary condition for temperature and thermal conductivity + BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my); + __syncthreads(); + + derDevSharedV2y(&wrk1,s_dil[sj],si); + eYtmp = eYtmp + wrk1*s_prop[sj][si]; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy + derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy + eYtmp = eYtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dy; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary condition for density and enthalpy + BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my); + __syncthreads(); + fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si); + rYtmp = wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si); + uYtmp = uYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si); + vYtmp = vYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si); + wYtmp = wYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si); + eYtmp = eYtmp + wrk1; + __syncthreads(); + +#if useStreams + rY[id.g] = rYtmp; + uY[id.g] = uYtmp; + vY[id.g] = vYtmp; + wY[id.g] = wYtmp; + eY[id.g] = eYtmp; +#else + rY[id.g] += rYtmp; + uY[id.g] += uYtmp; + vY[id.g] += vYtmp; + wY[id.g] += wYtmp; + eY[id.g] += eYtmp; +#endif + __syncthreads(); +} + +__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, + myprec *dil, myprec *dpdz, int kNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + + id.mkidZFlx(kNum); + + int si = id.tix + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rZtmp=0; + myprec uZtmp=0; + myprec vZtmp=0; + myprec wZtmp=0; + myprec eZtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + + myprec tmpreg1=0; + myprec tmpreg2=0; + myprec tmpreg3=0; + myprec tmpreg4=0; + myprec tmpreg5=0; + + __shared__ myprec s_u[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[sPencils][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[sPencils][mz/nDivZ+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + tmpreg1 = dudz[id.g]; + tmpreg2 = dwdx[id.g]; + tmpreg3 = dvdz[id.g]; + tmpreg4 = dwdy[id.g]; + tmpreg5 = dwdz[id.g]; + + uZtmp = ( tmpreg1 + tmpreg2 ); + vZtmp = ( tmpreg3 + tmpreg4 ); + //__syncthreads(); // for dil - not needed as s_dil is the one updated by the same thread and hence RAW dependent + wZtmp = (2 * tmpreg5 - 2./3.*s_dil[sj][si] ); + eZtmp = s_prop[sj][si]*(uZtmp*tmpreg1 + vZtmp*tmpreg3 + wZtmp*tmpreg5); + + __syncthreads(); + BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); + __syncthreads(); + + derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dz + derDevSharedV2z(&tmpreg1,s_u[sj],si); //& will come or not? + derDevSharedV2z(&tmpreg2,s_v[sj],si); + derDevSharedV2z(&tmpreg3,s_w[sj],si); + derDevSharedV1z(&tmpreg5,s_dil[sj],si); + + uZtmp *= wrk2; + vZtmp *= wrk2; + wZtmp *= wrk2; + + tmpreg4 = s_prop[sj][si]; + + uZtmp = uZtmp + tmpreg1*tmpreg4; + vZtmp = vZtmp + tmpreg2*tmpreg4; + wZtmp = wZtmp + tmpreg3*tmpreg4; + eZtmp = eZtmp + s_u[sj][si]*uZtmp + s_v[sj][si]*vZtmp + s_w[sj][si]*wZtmp; + + wZtmp = wZtmp + tmpreg4*tmpreg5/3.0; + eZtmp = eZtmp + tmpreg4*tmpreg5/3.0*s_w[sj][si]; + + + + + + s_dil[sj][si] = p[id.g]; + __syncthreads(); + BCzNumber2(s_dil[sj],p,id,si,mz,kNum); + + __syncthreads(); + derDevShared1z(&wrk1,s_dil[sj],si); + wZtmp = wZtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary conditions for thermal conductivity and temperature + BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); + __syncthreads(); + + derDevSharedV2z(&wrk1,s_dil[sj],si); + eZtmp = eZtmp + wrk1*s_prop[sj][si]; + derDevSharedV1z(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj],si); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dz; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + + // Boundary conditions for denisty and enthalpy + BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); + __syncthreads(); + + fluxQuadSharedz(&wrk1,s_prop[sj],s_w[sj],si); + rZtmp = wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_u[sj],si); + uZtmp = uZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_v[sj],si); + vZtmp = vZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_w[sj],si); + wZtmp = wZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj],s_w[sj],s_dil[sj],si); + eZtmp = eZtmp + wrk1; + __syncthreads(); + +#if useStreams + rZ[id.g] = rZtmp; + uZ[id.g] = uZtmp; + vZ[id.g] = vZtmp; + wZ[id.g] = wZtmp + *dpdz; + eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; +#else + rZ[id.g] += rZtmp; + uZ[id.g] += uZtmp; + vZ[id.g] += vZtmp; + wZ[id.g] += wZtmp + *dpdz; + eZ[id.g] += eZtmp + *dpdz*s_w[sj][si] ; +#endif + __syncthreads(); +} + +//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, +// myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , +// myprec *t, myprec *p, myprec *mu, myprec *lam, +// myprec *dil, myprec *dpdz) { +// +// Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); +// id.mkidX(); +// +// int si = id.i + stencilSize; // local i for shared memory access + halo offset +// int sj = id.tiy; // local j for shared memory access +// +// myprec rXtmp=0; +// myprec uXtmp=0; +// myprec vXtmp=0; +// myprec wXtmp=0; +// myprec eXtmp=0; +// +// myprec wrk1=0; +// myprec wrk2=0; +// +// __shared__ myprec s_u[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_v[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_w[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_t[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_p[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; +//#if !periodicX +// __shared__ myprec s_s0[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s4[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s8[sPencils][mx+stencilSize*2]; +//#endif +// __shared__ myprec s_dil[sPencils][mx+stencilSize*2]; +// +// s_u[sj][si] = u[id.g]; +// s_v[sj][si] = v[id.g]; +// s_w[sj][si] = w[id.g]; +// s_t[sj][si] = t[id.g]; +// s_p[sj][si] = p[id.g]; +// s_prop1[sj][si] = mu[id.g]; +// s_prop2[sj][si] = lam[id.g]; +//#if !periodicX +// s_s0[sj][si]= gij[0][id.g]; +// s_s4[sj][si]= gij[4][id.g]; +// s_s8[sj][si]= gij[8][id.g]; +//#endif +// s_dil[sj][si] = dil[id.g]; +// __syncthreads(); +// +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si); +// perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si); +// perBCx(s_prop2[sj],si); +//#else +// wallBCxMir(s_p[sj],si); +// wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si); +// wallBCxExt(s_t[sj],si,TwallTop,TwallBot); +// mlBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +// wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si); wallBCxVel(s_s8[sj],si); +//#endif +// } +// +// __syncthreads(); +// +// //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms +// uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] ); +// vXtmp = ( gij[1][id.g] + gij[3][id.g] ); +// wXtmp = ( gij[2][id.g] + gij[6][id.g] ); +// +// //adding the viscous dissipation part duidx*mu*six +// eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]); +// +// //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); +// derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx +// uXtmp *= wrk2; +// vXtmp *= wrk2; +// wXtmp *= wrk2; +// +// // viscous fluxes derivative mu*d^2ui dx^2 +// derDevSharedV2x(&wrk1,s_u[sj],si); +// uXtmp = uXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_v[sj],si); +// vXtmp = vXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_w[sj],si); +// wXtmp = wXtmp + wrk1*s_prop1[sj][si]; +// +// //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) +// eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; +// +// //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) +// derDevSharedV2x(&wrk1,s_t[sj],si); +// eXtmp = eXtmp + wrk1*s_prop2[sj][si]; +// derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx +// derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx +// eXtmp = eXtmp + wrk1*wrk2; +// +// // pressure and dilation derivatives +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_dil[sj],si); +//#else +// wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si); +//#endif +// } +// __syncthreads(); +// +// derDevSharedV1x(&wrk2,s_dil[sj],si); +// derDevShared1x(&wrk1 ,s_p[sj],si); +// uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; +// eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; +// +// //Adding here the terms - d (ru phi) dx; +// s_prop1[sj][si] = r[id.g]; +// s_prop2[sj][si] = h[id.g]; +// __syncthreads(); +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si); +//#else +// rhBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +//#endif +// } +// +// fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); +// rXtmp = wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); +// uXtmp = uXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); +// vXtmp = vXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); +// wXtmp = wXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); +// eXtmp = eXtmp + wrk1; +// __syncthreads(); +// +// rX[id.g] = rXtmp; +// uX[id.g] = uXtmp; +// vX[id.g] = vXtmp; +// wX[id.g] = wXtmp; +// eX[id.g] = eXtmp; +//} diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu index d612cd5..d34c620 100644 --- a/src/cuda_utils.cu +++ b/src/cuda_utils.cu @@ -34,8 +34,8 @@ __global__ void cpyCoefficients(myprec *tmpSx, myprec *tmpVSx) { __constant__ myprec d_dx, d_dy, d_dz, d_d2x, d_d2y, d_d2z, d_x[mx], d_xp[mx], d_dxv[mx]; -dim3 d_block[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ; -dim3 d_grid[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ; +dim3 d_grid[5], grid0, gridBC, gridHalo, gridHaloY, gridHaloZ, d_gridx, gridBCw; +dim3 d_block[5], block0, blockBC, blockHalo, blockHaloY, blockHaloZ, d_blockx, blockBCw; void copyThreadGridsToDevice(Communicator rk); void sanityCheckThreadGrids(Communicator rk); @@ -65,6 +65,8 @@ void setGPUParameters(Communicator rk) myprec *h_xp = new myprec[mx]; myprec *h_dxv = new myprec[mx]; + + myprec h_dpdz; if(forcing) h_dpdz = 0.00372; if(!forcing) h_dpdz = 0.0; @@ -79,11 +81,16 @@ void setGPUParameters(Communicator rk) h_x[i] = x[i]; h_xp[i] = xp[i]; } - h_dxv[0] = (x[1]+x[0])/2.0; +/* h_dxv[0] = (x[1]+x[0])/2.0; for (int i=1; i mx mod sPencils!=0\n"); } @@ -293,7 +308,7 @@ void sanityCheckThreadGrids(Communicator rk) { } mpiBarrier(); exit(1); - } + }*/ } void copyField(int direction, Communicator rk) { @@ -516,6 +531,35 @@ void initSolver(Communicator rk) { checkCuda( cudaMallocHost(&rcvZm5, 5*bytesZ) ); checkCuda( cudaMallocHost(&rcvZp5, 5*bytesZ) ); + + checkCuda( cudaMalloc( (void**)&recy_r, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_u, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_v, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_w, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_e, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_h, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_p, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_t, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_m, stencilSize*mx_tot*my*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&recy_l, stencilSize*mx_tot*my*sizeof(myprec) ) ); + + checkCuda( cudaMalloc( (void**)&rm, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&um, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&wm, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&tm, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&rm_in, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&um_in, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&wm_in, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&tm_in, stencilSize*mx_tot*sizeof(myprec) ) ); + + checkCuda( cudaMalloc( (void**)&a_inpl, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&b_inpl, stencilSize*mx_tot*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&idxm, stencilSize*mx_tot*sizeof(int) ) ); + checkCuda( cudaMalloc( (void**)&idxp, stencilSize*mx_tot*sizeof(int) ) ); + + checkCuda( cudaMalloc( (void**)&delta_rec, stencilSize*sizeof(myprec) ) ); + checkCuda( cudaMalloc( (void**)&delta_in, stencilSize*sizeof(myprec) ) ); + } void clearSolver(Communicator rk) { @@ -559,6 +603,35 @@ void clearSolver(Communicator rk) { checkCuda( cudaFree(d_m) ); checkCuda( cudaFree(d_l) ); + + checkCuda( cudaFree( recy_r)); + checkCuda( cudaFree( recy_u)); + checkCuda( cudaFree( recy_v)); + checkCuda( cudaFree( recy_w)); + checkCuda( cudaFree( recy_e)); + checkCuda( cudaFree( recy_h)); + checkCuda( cudaFree( recy_p)); + checkCuda( cudaFree( recy_t)); + checkCuda( cudaFree( recy_m)); + checkCuda( cudaFree( recy_l)); + + checkCuda( cudaFree( rm )); + checkCuda( cudaFree( um )); + checkCuda( cudaFree( wm )); + checkCuda( cudaFree( tm )); + checkCuda( cudaFree( rm_in)); + checkCuda( cudaFree( um_in)); + checkCuda( cudaFree( wm_in)); + checkCuda( cudaFree( tm_in)); + + checkCuda( cudaFree( a_inpl)); + checkCuda( cudaFree( b_inpl)); + checkCuda( cudaFree( idxm )); + checkCuda( cudaFree( idxp )); + + checkCuda( cudaFree( delta_rec )); + checkCuda( cudaFree( delta_in )); + if(!lowStorage) { checkCuda( cudaFree(d_rO) ); checkCuda( cudaFree(d_eO) ); diff --git a/src/globals.h b/src/globals.h index b374a59..2246b79 100644 --- a/src/globals.h +++ b/src/globals.h @@ -14,48 +14,98 @@ #define pRow 1 #define pCol 1 +#define GPUperNode 4 + //Remember : viscous stencil should ALWAYS be smaller than the advective stencil!!! (otherwise errors in how you load global into shared memory) #define stencilSize 3 // the order is double the stencilSize (advective fluxes stencil) #define stencilVisc 2 // the order is double the stencilVisc (viscous fluxes stencil) -#define Lx (20.0) -#define Ly (7.0) -#define Lz (500.0) +#define boundaryLayer (true) +#define channel (false) +#define misc (false) + +#define Lx (12.0) +#define Ly (10.0) +#define Lz (52.0) #define mx_tot 240 -#define my_tot 64 +#define my_tot 60 // 246 #define mz_tot 2048 -#define nsteps 1001 -#define nfiles 100 +#define nsteps 10 +#define nfiles 1 #define CFL 0.75f -#define restartFile -1 +#define restartFile -1 +#define spanshift (true) -#define lowStorage (true) -#define boundaryLayer (true) -#define perturbed (true) -#define forcing (false) -#define periodicX (false) -#define nonUniformX (true) +#define recstn 1500 + +#define lowStorage (false) +#define perturbed (false) #define checkCFLcondition 100 -#define checkBulk 100 +#define checkBulk 100 +#define checkMeanRec 1 -#define Re 1500.0 -#define Pr 0.75 -#define Ma 0.35 -#define viscexp 1.5 +#define Re 16609 +#define Pr 0.7 +#define Ma 1.9 +#define viscexp 0.75 #define gam 1.4 #define Ec ((gam - 1.f)*Ma*Ma) #define Rgas (1.f/(gam*Ma*Ma)) +#define pinf (Rgas) +#define rfac 0.89 +#define Trat 1 +#define Tr (1 + 0.5*(gam-1)*rfac*Ma*Ma) + +#if boundaryLayer + +#define forcing (false) +#define periodicX (false) +#define nonUniformX (true) + +//1 = periodic ; 2 = wall_staggered; 3 = wall_centered; 4= 0th order extrapolation 5 = Inflow +#define inletbc (5) +#define outletbc (4) +#define bottombc (3) +#define topbc (4) + +#define gridtype (0) // 0 = BL; 1 = channel_centered; 2 = channel staggered, no wall ,etc. + +const myprec TwallTop = 0.0; //Holds no relevance for BL +const myprec TwallBot = Trat*Tr; + +#elif channel + +#define forcing (true) +#define periodicX (false) +#define nonUniformX (true) + +//1 = periodic ; 2 = wall_staggered; 3 = wall_centered; 4= 0th order extrapolation 5 = Inflow +#define inletbc (1) +#define outletbc (1) +#define bottombc (2) +#define topbc (2) + +#define gridtype (2) // 0 = BL; 1 = channel_centered; 2 = channel staggered, no wall ,etc. -const double stretch = 5.0; const myprec TwallTop = 1.0; const myprec TwallBot = 1.0; +#elif misc + +#endif + +const double stretch = 5.0; + #define mx (mx_tot) #define my (my_tot/pRow) #define mz (mz_tot/pCol) -#define nDivZ (8) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nX (1) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) + +#define nDivX (15) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nDivY (16) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) +#define nDivZ (64) // never put larger than 8 as there will be no streams to accomodate the kernels (max 8) (actually check the machine limitations) #define idx(i,j,k) \ ({ ( k )*mx*my +( j )*mx + ( i ); }) diff --git a/src/init.cpp b/src/init.cpp index ec239b2..c64d556 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -16,9 +16,9 @@ void initField(int timestep, Communicator rk) { readFileMPI('v',timestep,v,rk); readFileMPI('w',timestep,w,rk); readFileMPI('e',timestep,e,rk); - if(rk.rank==0) { - printf("finished initializing field\n"); - } + if(rk.rank==0) { + printf("finished initializing field\n"); + } } void writeField(int timestep, Communicator rk) { @@ -33,40 +33,100 @@ void initGrid(Communicator rk) { //constant grid in y and z and stretched in x - dx = Lx*(1.0)/(mx_tot); + if (gridtype ==0) { + + dx = Lx*(1.0)/(mx_tot-1);// because we have points at the domain boundaries.mx_tot is the total points and mx_tot-1 is the number of parts in which domain is divided + //dy = Ly*(1.0)/(my_tot);// Periodic direction + //dz = Lz*(1.0)/(mz_tot-1);// because we have points at the domain boundaries. + + // y mesh - uniform + for(int j=0;j0){ + L[0] = ev[0]*(dp - s_r[si]*sos*du); + } else { + L[0] = L[4] ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(dw); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*du); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rXtmp = - d[0]; + fluxCubeSharedx(&wrk1,s_r,s_u,s_u,si); + *uXtmp = *uXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_v,si); + *vXtmp = *vXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_w,si); + *wXtmp = *wXtmp + wrk1; + fluxCubeSharedx(&wrk1,s_r,s_u,s_h,si); + *eXtmp = *eXtmp + wrk1; + + +} + + +__device__ __forceinline__ __attribute__((always_inline)) void FreestreamnrbcX_top(myprec *rXtmp, myprec *uXtmp, myprec *vXtmp, myprec *wXtmp, myprec *eXtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + myprec dp = 0 ; + myprec du = 0 ; + myprec dv = 0 ; + myprec dw = 0 ; + myprec dr = 0 ; + + derShared1x_BD(&dp, s_p, si) ; + derShared1x_BD(&du, s_u, si) ; + derShared1x_BD(&dv, s_v, si) ; + derShared1x_BD(&dw, s_w, si) ; + derShared1x_BD(&dr, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_u[si] - sos; + ev[1] = s_u[si]; + ev[2] = s_u[si]; + ev[3] = s_u[si]; + ev[4] = s_u[si] + sos; + + myprec sig = 0.25; + myprec K = sig * (1-Ma*Ma) * sos / Lx; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model. + + myprec L[5]; + + if (ev[0]>0){ + L[0] = ev[0]*(dp - s_r[si]*sos*du); + } else { + L[0] = K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(dw); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*du); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rXtmp = - d[0]; + *uXtmp += - s_u[si]*d[0] - s_r[si]*d[2]; + *vXtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wXtmp += - s_w[si]*d[0] - s_r[si]*d[4]; + *eXtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[2] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[4] ) ; + +} + + +#endif diff --git a/src/nrbcZ.h b/src/nrbcZ.h new file mode 100644 index 0000000..a1d7481 --- /dev/null +++ b/src/nrbcZ.h @@ -0,0 +1,163 @@ + +#ifndef NRBCZ_H_ +#define NRBCZ_H_ + +extern __device__ __forceinline__ void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); +extern __device__ __forceinline__ void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id); + + +__device__ __forceinline__ __attribute__((always_inline)) void InflownrbcZ_bot(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + + myprec dpf = 0 ; + myprec duf = 0 ; + myprec dvf = 0 ; + myprec dwf = 0 ; + myprec drf = 0 ; + + myprec dpb = 0 ; + myprec dub = 0 ; + myprec dvb = 0 ; + myprec dwb = 0 ; + myprec drb = 0 ; + + + derShared1z_FD(&dpf, s_p, si) ; + derShared1z_FD(&duf, s_u, si) ; + derShared1z_FD(&dvf, s_v, si) ; + derShared1z_FD(&dwf, s_w, si) ; + derShared1z_FD(&drf, s_r, si) ; + + derShared1z_BD(&dpb, s_p, si) ; + derShared1z_BD(&dub, s_u, si) ; + derShared1z_BD(&dvb, s_v, si) ; + derShared1z_BD(&dwb, s_w, si) ; + derShared1z_BD(&drb, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_w[si] - sos; + ev[1] = s_w[si] ; + ev[2] = s_w[si] ; + ev[3] = s_w[si] ; + ev[4] = s_w[si] + sos; + + myprec L[5]; + + if (ev[0]<0){ + L[0] = ev[0]*(dpf - s_r[si]*sos*dwf); + } else { + L[0] = ev[0]*(dpb - s_r[si]*sos*dwb); + } + if (ev[1]<0){ + L[1] = ev[1]*(sos*sos*drf - dpf); + } else { + L[1] = ev[1]*(sos*sos*drb - dpb); + } + if (ev[2]<0){ + L[2] = ev[2]*(dvf); + } else { + L[2] = ev[2]*(dvb); + } + if (ev[3]<0){ + L[3] = ev[3]*(duf); + } else { + L[3] = ev[3]*(dub); + } + if (ev[4]<0){ + L[4] = ev[4]*(dpf + s_r[si]*sos*dwf); + } else { + L[4] = ev[4]*(dpb + s_r[si]*sos*dwb); + } + + myprec d[5]; + + d[0] = 1/(sos*sos)*( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rZtmp = - d[0]; + *uZtmp += - s_u[si]*d[0] - s_r[si]*d[4]; + *vZtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wZtmp += - s_w[si]*d[0] - s_r[si]*d[2]; + *eZtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2] ) ; +} + + +__device__ __forceinline__ __attribute__((always_inline)) void OutflownrbcZ_top(myprec *rZtmp, myprec *uZtmp, myprec *vZtmp, myprec *wZtmp, myprec *eZtmp, myprec *s_r, myprec *s_u, myprec *s_v, myprec *s_w, myprec *s_p, myprec *s_h, int si, Indices id) { + + myprec dp = 0 ; + myprec du = 0 ; + myprec dv = 0 ; + myprec dw = 0 ; + myprec dr = 0 ; + + derShared1z_BD(&dp, s_p, si) ; + derShared1z_BD(&du, s_u, si) ; + derShared1z_BD(&dv, s_v, si) ; + derShared1z_BD(&dw, s_w, si) ; + derShared1z_BD(&dr, s_r, si) ; + + myprec sos = pow(gam*s_p[si]/s_r[si],0.5); + + myprec ev[5]; + + ev[0] = s_w[si] - sos; + ev[1] = s_w[si]; + ev[2] = s_w[si]; + ev[3] = s_w[si]; + ev[4] = s_w[si] + sos; + + myprec sig = 0.25; + myprec K = sig * (1-Ma*Ma) * sos / Lz; // Ma is the free stream Mach number that we specify. Poinsot and Lele say that maximum Mach number in the flow should be used. So should there be a comparison kernel that computes maximum Mach number? I dont think its needed as the free stream Mach number and Ma in eqn would be very similar in order and equation 40 is not something accurate but a mere model. + + myprec L[5]; + + if (ev[0]>0){ + L[0] = ev[0]*(dp - s_r[si]*sos*dw); + } else { + L[0] = K*(s_p[si] - pinf) ; // either 0 (perfectly non-reflecting) or use LODI relations ( Eq. 24-28 in Poinsot and Lele (1992) ) + } + if (ev[1]>0){ + L[1] = ev[1]*(sos*sos*dr - dp); + } else { + L[1] = 0; + } + if (ev[2]>0){ + L[2] = ev[2]*(dv); + } else { + L[2] = 0; + } + if (ev[3]>0){ + L[3] = ev[3]*(du); + } else { + L[3] = 0; + } + if (ev[4]>0){ + L[4] = ev[4]*(dp + s_r[si]*sos*dw); + } else { + L[4] = 0; + } + + myprec d[5]; + + d[0] = 1/(sos*sos) * ( L[1] + 0.5*(L[4] + L[0]) ) ; + d[1] = 0.5*( L[4] + L[0] ) ; + d[2] = 1/(2*s_r[si]*sos) * ( L[4] - L[0] ) ; + d[3] = L[2] ; + d[4] = L[3] ; + + *rZtmp = - d[0]; + *uZtmp += - s_u[si]*d[0] - s_r[si]*d[4]; + *vZtmp += - s_v[si]*d[0] - s_r[si]*d[3]; + *wZtmp += - s_w[si]*d[0] - s_r[si]*d[2]; + *eZtmp += - ( 0.5*( s_u[si]*s_u[si] + s_v[si]*s_v[si] + s_w[si]*s_w[si] )*d[0] + d[1]/(gam - 1) + s_r[si]*s_u[si] * d[4] + s_r[si]*s_v[si] * d[3] + s_r[si]*s_w[si] * d[2] ) ; + +} + + +#endif diff --git a/src/temporary.cu b/src/temporary.cu new file mode 100644 index 0000000..acada91 --- /dev/null +++ b/src/temporary.cu @@ -0,0 +1,608 @@ +#include "globals.h" +#include "cuda_functions.h" +#include "cuda_math.h" +#include "sponge.h" +#include "boundary_condition_x.h" +#include "boundary_condition_y.h" +#include "boundary_condition_z.h" + +__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dudx, myprec *dvdx, myprec *dwdx, myprec *dudy, myprec *dudz, + myprec *dil, myprec *dpdz, int iNum) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + id.mkidX(); + + int si = id.tix + stencilSize; // local i for shared memory access + halo offset + int sj = id.tiy; // local j for shared memory access + + myprec rXtmp=0; + myprec uXtmp=0; + myprec vXtmp=0; + myprec wXtmp=0; + myprec eXtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + + __shared__ myprec s_u[sPencils][mx+stencilSize*2]; + __shared__ myprec s_v[sPencils][mx+stencilSize*2]; + __shared__ myprec s_w[sPencils][mx+stencilSize*2]; + __shared__ myprec s_t[sPencils][mx+stencilSize*2]; + __shared__ myprec s_p[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; + __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_t[sj][si] = t[id.g]; + s_p[sj][si] = p[id.g]; + s_prop1[sj][si] = mu[id.g]; + s_prop2[sj][si] = lam[id.g]; + __syncthreads(); + + // Boundary conditions in the x-direction are in boundary_condition_x.h + // these are the BCs for u,v,w,p,t,mu,lambda + BCxNumber1(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk1,s_u[sj],si); + derDevSharedV1x(&wrk2,s_v[sj],si); + derDevSharedV1x(&wrk3,s_w[sj],si); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uXtmp = ( 2 * wrk1 - 2./3.*dil[id.g] ); + vXtmp = ( wrk2 + dudy[id.g] ); + wXtmp = ( wrk3 + dudz[id.g] ); + + //adding the viscous dissipation part duidx*mu*six + eXtmp = s_prop1[sj][si]*(uXtmp*wrk1 + vXtmp*wrk2 + wXtmp*wrk3); + + //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); + derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx + uXtmp *= wrk2; + vXtmp *= wrk2; + wXtmp *= wrk2; + + // viscous fluxes derivative mu*d^2ui dx^2 + derDevSharedV2x(&wrk1,s_u[sj],si); + uXtmp = uXtmp + wrk1*s_prop1[sj][si]; + derDevSharedV2x(&wrk2,s_v[sj],si); + vXtmp = vXtmp + wrk2*s_prop1[sj][si]; + derDevSharedV2x(&wrk3,s_w[sj],si); + wXtmp = wXtmp + wrk3*s_prop1[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) + eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; + + //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) + derDevSharedV2x(&wrk1,s_t[sj],si); + eXtmp = eXtmp + wrk1*s_prop2[sj][si]; + derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx + derDevSharedV1x(&wrk3,s_t[sj],si); //wrk1 = d (t) dx + eXtmp = eXtmp + wrk2*wrk3; + + // pressure and dilation derivatives + s_prop2[sj][si] = dil[id.g]; + __syncthreads(); + + // these are the BCs for the dilatation + BCxNumber2(s_prop2[sj],id,si,mx); + __syncthreads(); + + derDevSharedV1x(&wrk2,s_prop2[sj],si); + derDevShared1x(&wrk1 ,s_p[sj],si); + uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; + eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; + + //Adding here the terms - d (ru phi) dx; + s_prop1[sj][si] = r[id.g]; + s_prop2[sj][si] = h[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are the BCs for rho (prop1) and enthalpy (prop2) + BCxNumber3(s_u[sj],s_v[sj],s_w[sj],s_p[sj],s_t[sj],s_prop1[sj],s_prop2[sj],id,si,mx); + __syncthreads(); + + fluxQuadSharedx(&wrk3,s_prop1[sj],s_u[sj],si); + rXtmp = wrk3; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); + uXtmp = uXtmp + wrk1; + __syncthreads(); + fluxCubeSharedx(&wrk2,s_prop1[sj],s_u[sj],s_v[sj],si); + vXtmp = vXtmp + wrk2; + __syncthreads(); + fluxCubeSharedx(&wrk3,s_prop1[sj],s_u[sj],s_w[sj],si); + wXtmp = wXtmp + wrk3; + __syncthreads(); + fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); + eXtmp = eXtmp + wrk1; + __syncthreads(); + + rX[id.g] = rXtmp; + uX[id.g] = uXtmp; + vX[id.g] = vXtmp; + wX[id.g] = wXtmp; + eX[id.g] = eXtmp; +} + +__global__ void deviceRHSY(myprec *rY, myprec *uY, myprec *vY, myprec *wY, myprec *eY, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dvdx, myprec *dudy, myprec *dvdy, myprec *dwdy, myprec *dvdz, + myprec *dil, myprec *dpdz) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int jNum = blockIdx.z; + id.mkidYFlx(jNum); + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + + myprec rYtmp=0; + myprec uYtmp=0; + myprec vYtmp=0; + myprec wYtmp=0; + myprec eYtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + __shared__ myprec s_u[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_v[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_w[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_prop[lPencils][my/nDivY+stencilSize*2]; + __shared__ myprec s_dil[lPencils][my/nDivY+stencilSize*2]; + + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + BCyNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,my,jNum); + __syncthreads(); + + derDevSharedV1y(&wrk1,s_u[sj],si); + derDevSharedV1y(&wrk2,s_v[sj],si); + derDevSharedV1y(&wrk3,s_w[sj],si); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + uYtmp = ( wrk1 + dvdx[id.g] ); + vYtmp = (2 * wrk2 - 2./3.*s_dil[sj][si] ); + wYtmp = ( wrk3 + dvdz[id.g] ); + + //adding the viscous dissipation part duidy*mu*siy + eYtmp = s_prop[sj][si]*(uYtmp*wrk1 + vYtmp*wrk2 + wYtmp*wrk3); + + //Adding here the terms d (mu) dy * syj; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (mu) dy + uYtmp *= wrk2; + vYtmp *= wrk2; + wYtmp *= wrk2; + + // viscous fluxes derivative + derDevSharedV2y(&wrk1,s_u[sj],si); + uYtmp = uYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_v[sj],si); + vYtmp = vYtmp + wrk1*s_prop[sj][si]; + derDevSharedV2y(&wrk1,s_w[sj],si); + wYtmp = wYtmp + wrk1*s_prop[sj][si]; + + //adding the viscous dissipation part ui*(mu * d2duidy2 + dmudy * siy) + eYtmp = eYtmp + s_u[sj][si]*uYtmp + s_v[sj][si]*vYtmp + s_w[sj][si]*wYtmp; + + //dilation derivatives + derDevSharedV1y(&wrk2,s_dil[sj],si); + vYtmp = vYtmp + s_prop[sj][si]*wrk2/3.0; + eYtmp = eYtmp + s_prop[sj][si]*wrk2/3.0*s_v[sj][si]; + + // pressure derivatives + s_dil[sj][si] = p[id.g]; + __syncthreads(); + + // Boundary condition for pressure + BCyNumber2(s_dil[sj],p,id,si,my, jNum); + __syncthreads(); + + derDevShared1y(&wrk1,s_dil[sj],si); + vYtmp = vYtmp - wrk1; + + // fourier terms + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + + // Boundary condition for temperature and thermal conductivity + BCyNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,my,jNum); + __syncthreads(); + + derDevSharedV2y(&wrk1,s_dil[sj],si); + eYtmp = eYtmp + wrk1*s_prop[sj][si]; + derDevSharedV1y(&wrk2,s_prop[sj],si); //wrk2 = d (lam) dy + derDevSharedV1y(&wrk1,s_dil[sj] ,si); //wrk1 = d (t) dy + eYtmp = eYtmp + wrk1*wrk2; + + //Adding here the terms - d (ru phi) dy; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary condition for density and enthalpy + BCyNumber3(s_prop[sj],s_dil[sj],r,h,id,si,my,jNum); + __syncthreads(); + fluxQuadSharedy(&wrk1,s_prop[sj],s_v[sj],si); + rYtmp = wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_u[sj],si); + uYtmp = uYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_v[sj],si); + vYtmp = vYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_w[sj],si); + wYtmp = wYtmp + wrk1; + __syncthreads(); + fluxCubeSharedy(&wrk1,s_prop[sj],s_v[sj],s_dil[sj],si); + eYtmp = eYtmp + wrk1; + __syncthreads(); + +#if useStreams + rY[id.g] = rYtmp; + uY[id.g] = uYtmp; + vY[id.g] = vYtmp; + wY[id.g] = wYtmp; + eY[id.g] = eYtmp; +#else + rY[id.g] += rYtmp; + uY[id.g] += uYtmp; + vY[id.g] += vYtmp; + wY[id.g] += wYtmp; + eY[id.g] += eYtmp; +#endif + __syncthreads(); +} + +__global__ void deviceRHSZ(myprec *rZ, myprec *uZ, myprec *vZ, myprec *wZ, myprec *eZ, + myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , + myprec *t, myprec *p, myprec *mu, myprec *lam, + myprec *dwdx, myprec *dwdy, myprec *dudz, myprec *dvdz, myprec *dwdz, + myprec *dil, myprec *dpdz) { + + Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); + int kNum = blockIdx.z; + id.mkidZFlx(kNum); + + int tid = id.tix + id.bdx*id.tiy; + + int si = id.tiy + stencilSize; // local i for shared memory access + halo offset + int sj = id.tix; // local j for shared memory access + int si1 = tid%id.bdy + stencilSize; // local i for shared memory access + halo offset + int sj1 = tid/id.bdy; + + Indices id1(sj1,si1-stencilSize, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y); + id1.mkidZFlx(kNum); + + myprec rZtmp=0; + myprec uZtmp=0; + myprec vZtmp=0; + myprec wZtmp=0; + myprec eZtmp=0; + + myprec wrk1=0; + myprec wrk2=0; + myprec wrk3=0; + + //myprec tmpreg1[stencilSize*2+1]; + //myprec tmpreg2[stencilSize*2+1]; + //myprec tmpreg3[stencilSize*2+1]; + + __shared__ myprec s_u[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_v[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_w[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_prop[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_dil[mx/nDivX][mz/nDivZ+stencilSize*2]; + __shared__ myprec s_tmp[mx/nDivX][mz/nDivZ+stencilSize*2]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_u[sj][si] = u[id.g]; + s_v[sj][si] = v[id.g]; + s_w[sj][si] = w[id.g]; + s_prop[sj][si] = mu[id.g]; + s_dil[sj][si] = dil[id.g]; + __syncthreads(); + + // fill in periodic images in shared memory array + // these are boundary conditions for u,v,w,mu and dilatation + + + BCzNumber1(s_u[sj],s_v[sj],s_w[sj],s_prop[sj],s_dil[sj],u,v,w,mu,dil,id,si,mz,kNum); + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP +////BY COMPUTATION THREAD ARRANGEMENT + derDevSharedV1z(&wrk1,s_u[sj1],si1); + derDevSharedV1z(&wrk2,s_v[sj1],si1); + derDevSharedV1z(&wrk3,s_w[sj1],si1); + //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms + +// WRK1 CORRESPONDS TO THE NEW THREAD ARRANGEMENT. + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + s_tmp[sj][si] = dwdx[id.g]; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + uZtmp = ( wrk1 + s_tmp[sj1][si1] ); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + __syncthreads(); + s_tmp[sj][si] = dwdy[id.g]; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + vZtmp = ( wrk2 + s_tmp[sj1][si1] ); + wZtmp = (2 * wrk3 - 2./3.*s_dil[sj1][si1] ); + + //adding the viscous dissipation part duidz*mu*siz + eZtmp = s_prop[sj1][si1]*(uZtmp*wrk1 + vZtmp*wrk2 + wZtmp*wrk3); + + //Adding here the terms d (mu) dz * szj; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (mu) dz + uZtmp *= wrk2; + vZtmp *= wrk2; + wZtmp *= wrk2; + + //viscous fluxes derivative + derDevSharedV2z(&wrk1,s_u[sj1],si1); + uZtmp = uZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_v[sj1],si1); + vZtmp = vZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV2z(&wrk1,s_w[sj1],si1); + wZtmp = wZtmp + wrk1*s_prop[sj1][si1]; + + //adding the viscous dissipation part ui*(mu * d2duidz2 + dmudz * siz) + eZtmp = eZtmp + s_u[sj1][si1]*uZtmp + s_v[sj1][si1]*vZtmp + s_w[sj1][si1]*wZtmp; + + //dilation derivatives + derDevSharedV1z(&wrk2,s_dil[sj1],si1); + wZtmp = wZtmp + s_prop[sj1][si1]*wrk2/3.0; + eZtmp = eZtmp + s_prop[sj1][si1]*wrk2/3.0*s_w[sj1][si1]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM +//SHARED MEM LOADING OPERATION. - ID + + // pressure derivatives + __syncthreads(); + s_dil[sj][si] = p[id.g]; + __syncthreads(); + BCzNumber2(s_dil[sj],p,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + derDevShared1z(&wrk1,s_dil[sj1],si1); + wZtmp = wZtmp - wrk1; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + // fourier terms + __syncthreads(); + s_prop[sj][si] = lam[id.g]; + s_dil[sj][si] = t[id.g]; + __syncthreads(); + // Boundary conditions for thermal conductivity and temperature + BCzNumber3(s_prop[sj],s_dil[sj],lam,t,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + + derDevSharedV2z(&wrk1,s_dil[sj1],si1); + eZtmp = eZtmp + wrk1*s_prop[sj1][si1]; + derDevSharedV1z(&wrk2,s_prop[sj1],si1); //wrk2 = d (lam) dz + derDevSharedV1z(&wrk1,s_dil[sj1],si1); //wrk1 = d (t) dz + eZtmp = eZtmp + wrk1*wrk2; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + //Adding here the terms - d (ru phi) dz; + s_prop[sj][si] = r[id.g]; + s_dil[sj][si] = h[id.g]; + __syncthreads(); + // Boundary conditions for denisty and enthalpy + BCzNumber4(s_prop[sj],s_dil[sj],r,h,id,si,mz,kNum); + __syncthreads(); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////COMP + fluxQuadSharedz(&wrk1,s_prop[sj1],s_w[sj1],si1); + rZtmp = wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_u[sj1],si1); + uZtmp = uZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_v[sj1],si1); + vZtmp = vZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_w[sj1],si1); + wZtmp = wZtmp + wrk1; + __syncthreads(); + fluxCubeSharedz(&wrk1,s_prop[sj1],s_w[sj1],s_dil[sj1],si1); + eZtmp = eZtmp + wrk1; + __syncthreads(); +//USE SHARED ARRAYS TO STORE OUTPUT AND THEN WRITE USING MEMORY TILE. + s_prop[sj1][si1] = rZtmp ; s_u[sj1][si1] = uZtmp ; s_v[sj1][si1] = vZtmp ; s_tmp[sj1][si1] = wZtmp ; s_dil[sj1][si1] = eZtmp ; + __syncthreads(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////MEM + + rZ[id.g] += s_prop[sj][si]; + uZ[id.g] += s_u[sj][si]; + vZ[id.g] += s_v[sj][si]; + wZ[id.g] += s_tmp[sj][si] + *dpdz; + eZ[id.g] += s_dil[sj][si] + *dpdz*s_w[sj][si]; + + +//#if useStreams +// rZ[id.g] = rZtmp; +// uZ[id.g] = uZtmp; +// vZ[id.g] = vZtmp; +// wZ[id.g] = wZtmp + *dpdz; +// eZ[id.g] = eZtmp + *dpdz*s_w[sj][si] ; +//#else + + + + + +//#endif + __syncthreads(); +} + +//__global__ void deviceRHSX(myprec *rX, myprec *uX, myprec *vX, myprec *wX, myprec *eX, +// myprec *r, myprec *u, myprec *v, myprec *w, myprec *h , +// myprec *t, myprec *p, myprec *mu, myprec *lam, +// myprec *dil, myprec *dpdz) { +// +// Indices id(threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y); +// id.mkidX(); +// +// int si = id.i + stencilSize; // local i for shared memory access + halo offset +// int sj = id.tiy; // local j for shared memory access +// +// myprec rXtmp=0; +// myprec uXtmp=0; +// myprec vXtmp=0; +// myprec wXtmp=0; +// myprec eXtmp=0; +// +// myprec wrk1=0; +// myprec wrk2=0; +// +// __shared__ myprec s_u[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_v[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_w[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_t[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_p[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop1[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_prop2[sPencils][mx+stencilSize*2]; +//#if !periodicX +// __shared__ myprec s_s0[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s4[sPencils][mx+stencilSize*2]; +// __shared__ myprec s_s8[sPencils][mx+stencilSize*2]; +//#endif +// __shared__ myprec s_dil[sPencils][mx+stencilSize*2]; +// +// s_u[sj][si] = u[id.g]; +// s_v[sj][si] = v[id.g]; +// s_w[sj][si] = w[id.g]; +// s_t[sj][si] = t[id.g]; +// s_p[sj][si] = p[id.g]; +// s_prop1[sj][si] = mu[id.g]; +// s_prop2[sj][si] = lam[id.g]; +//#if !periodicX +// s_s0[sj][si]= gij[0][id.g]; +// s_s4[sj][si]= gij[4][id.g]; +// s_s8[sj][si]= gij[8][id.g]; +//#endif +// s_dil[sj][si] = dil[id.g]; +// __syncthreads(); +// +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_u[sj],si); perBCx(s_v[sj],si); perBCx(s_w[sj],si); +// perBCx(s_t[sj],si); perBCx(s_p[sj],si); perBCx(s_prop1[sj],si); +// perBCx(s_prop2[sj],si); +//#else +// wallBCxMir(s_p[sj],si); +// wallBCxVel(s_u[sj],si); wallBCxVel(s_v[sj],si); wallBCxVel(s_w[sj],si); +// wallBCxExt(s_t[sj],si,TwallTop,TwallBot); +// mlBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +// wallBCxMir(s_s0[sj],si); wallBCxVel(s_s4[sj],si); wallBCxVel(s_s8[sj],si); +//#endif +// } +// +// __syncthreads(); +// +// //initialize momentum RHS with stresses so that they can be added for both viscous terms and viscous heating without having to load additional terms +// uXtmp = ( 2 * gij[0][id.g] - 2./3.*s_dil[sj][si] ); +// vXtmp = ( gij[1][id.g] + gij[3][id.g] ); +// wXtmp = ( gij[2][id.g] + gij[6][id.g] ); +// +// //adding the viscous dissipation part duidx*mu*six +// eXtmp = s_prop1[sj][si]*(uXtmp*gij[0][id.g] + vXtmp*gij[1][id.g] + wXtmp*gij[2][id.g]); +// +// //Adding here the terms d (mu) dx * sxj; (lambda in case of h in rhse); +// derDevSharedV1x(&wrk2,s_prop1[sj],si); //wrk2 = d (mu) dx +// uXtmp *= wrk2; +// vXtmp *= wrk2; +// wXtmp *= wrk2; +// +// // viscous fluxes derivative mu*d^2ui dx^2 +// derDevSharedV2x(&wrk1,s_u[sj],si); +// uXtmp = uXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_v[sj],si); +// vXtmp = vXtmp + wrk1*s_prop1[sj][si]; +// derDevSharedV2x(&wrk1,s_w[sj],si); +// wXtmp = wXtmp + wrk1*s_prop1[sj][si]; +// +// //adding the viscous dissipation part ui*(mu * d2duidx2 + dmudx * six) +// eXtmp = eXtmp + s_u[sj][si]*uXtmp + s_v[sj][si]*vXtmp + s_w[sj][si]*wXtmp; +// +// //adding the molecular conduction part (d2 temp dx2*lambda + dlambda dx * d temp dx) +// derDevSharedV2x(&wrk1,s_t[sj],si); +// eXtmp = eXtmp + wrk1*s_prop2[sj][si]; +// derDevSharedV1x(&wrk2,s_prop2[sj],si); //wrk2 = d (lam) dx +// derDevSharedV1x(&wrk1,s_t[sj],si); //wrk1 = d (t) dx +// eXtmp = eXtmp + wrk1*wrk2; +// +// // pressure and dilation derivatives +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_dil[sj],si); +//#else +// wallBCxDil(s_dil[sj],s_s0[sj],s_s4[sj],s_s8[sj],si); +//#endif +// } +// __syncthreads(); +// +// derDevSharedV1x(&wrk2,s_dil[sj],si); +// derDevShared1x(&wrk1 ,s_p[sj],si); +// uXtmp = uXtmp + s_prop1[sj][si]*wrk2/3.0 - wrk1 ; +// eXtmp = eXtmp + s_prop1[sj][si]*wrk2/3.0*s_u[sj][si]; +// +// //Adding here the terms - d (ru phi) dx; +// s_prop1[sj][si] = r[id.g]; +// s_prop2[sj][si] = h[id.g]; +// __syncthreads(); +// // fill in periodic images in shared memory array +// if (id.i < stencilSize) { +//#if periodicX +// perBCx(s_prop1[sj],si); perBCx(s_prop2[sj],si); +//#else +// rhBoundPT(s_prop1[sj], s_prop2[sj], s_p[sj], s_t[sj], s_u[sj], s_v[sj], s_w[sj], si); +//#endif +// } +// +// fluxQuadSharedx(&wrk1,s_prop1[sj],s_u[sj],si); +// rXtmp = wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_u[sj],si); +// uXtmp = uXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_v[sj],si); +// vXtmp = vXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_w[sj],si); +// wXtmp = wXtmp + wrk1; +// __syncthreads(); +// fluxCubeSharedx(&wrk1,s_prop1[sj],s_u[sj],s_prop2[sj],si); +// eXtmp = eXtmp + wrk1; +// __syncthreads(); +// +// rX[id.g] = rXtmp; +// uX[id.g] = uXtmp; +// vX[id.g] = vXtmp; +// wX[id.g] = wXtmp; +// eX[id.g] = eXtmp; +//}