QUDA v0.4.0
A library for QCD on GPUs
quda/lib/face_mpi.cpp
#include <quda_internal.h>
#include <face_quda.h>
#include <comm_quda.h>
#include <cstdio>
#include <cstdlib>
#include <quda.h>
#include <string.h>
#include <sys/time.h>
#include <mpi.h>
#include <cuda.h>

#include <fat_force_quda.h>

using namespace std;

#ifdef DSLASH_PROFILING
  void printDslashProfile();
#define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
#else
#define CUDA_EVENT_RECORD(a,b)
#define DSLASH_TIME_PROFILE()
#endif

cudaStream_t *stream;

bool globalReduce = true;

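// Constructor: per-dimension setup for the spinor halo exchange.  Allocates the
// MPI request handles and pinned (cudaMallocHost) send/receive buffers for the
// forward and backward faces; when GPU_DIRECT is not defined, separate pageable
// host buffers are also allocated for staging the MPI transfers.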
FaceBuffer::FaceBuffer(const int *X, const int nDim, const int Ninternal, 
                       const int nFace, const QudaPrecision precision) : 
  Ninternal(Ninternal), precision(precision), nDim(nDim), nFace(nFace)
{
  setupDims(X);

  // set these both to 0 for no overlap of QMP and cudaMemcpyAsync
  // sendBackStrmIdx = 0, and sendFwdStrmIdx = 1 for overlap
  sendBackStrmIdx = 0;
  sendFwdStrmIdx = 1;
  recFwdStrmIdx = sendBackStrmIdx;
  recBackStrmIdx = sendFwdStrmIdx;


  for(int i=0;i < QUDA_MAX_DIM; i++){
    recv_request1[i] = malloc(sizeof(MPI_Request));
    recv_request2[i] = malloc(sizeof(MPI_Request));
    send_request1[i] = malloc(sizeof(MPI_Request));
    send_request2[i] = malloc(sizeof(MPI_Request));
    if( recv_request1[i] == 0 || recv_request2[i] == 0
        || send_request1[i] == 0 || send_request2[i] == 0){
      errorQuda("ERROR: malloc failed for recv/send request handles\n");
    }
  }

  for(int dir =0 ; dir < 4;dir++){
    nbytes[dir] = nFace*faceVolumeCB[dir]*Ninternal*precision;
    if (precision == QUDA_HALF_PRECISION) nbytes[dir] += nFace*faceVolumeCB[dir]*sizeof(float);
    
    cudaMallocHost((void**)&fwd_nbr_spinor_sendbuf[dir], nbytes[dir]); 
    cudaMallocHost((void**)&back_nbr_spinor_sendbuf[dir], nbytes[dir]);
    
    if (fwd_nbr_spinor_sendbuf[dir] == NULL || back_nbr_spinor_sendbuf[dir] == NULL)
      errorQuda("dir =%d, malloc failed for fwd_nbr_spinor_sendbuf/back_nbr_spinor_sendbuf", dir); 
    
    cudaMallocHost((void**)&fwd_nbr_spinor[dir], nbytes[dir]); 
    cudaMallocHost((void**)&back_nbr_spinor[dir], nbytes[dir]); 
    
    if (fwd_nbr_spinor[dir] == NULL || back_nbr_spinor[dir] == NULL)
      errorQuda("malloc failed for fwd_nbr_spinor/back_nbr_spinor"); 

#ifdef GPU_DIRECT
    pageable_fwd_nbr_spinor_sendbuf[dir] = fwd_nbr_spinor_sendbuf[dir];
    pageable_back_nbr_spinor_sendbuf[dir] = back_nbr_spinor_sendbuf[dir];
    pageable_fwd_nbr_spinor[dir] = fwd_nbr_spinor[dir];
    pageable_back_nbr_spinor[dir] = back_nbr_spinor[dir];
#else
    pageable_fwd_nbr_spinor_sendbuf[dir] = malloc(nbytes[dir]);
    pageable_back_nbr_spinor_sendbuf[dir] = malloc(nbytes[dir]);
    
    if (pageable_fwd_nbr_spinor_sendbuf[dir] == NULL || pageable_back_nbr_spinor_sendbuf[dir] == NULL)
      errorQuda("malloc failed for pageable_fwd_nbr_spinor_sendbuf/pageable_back_nbr_spinor_sendbuf");
    
    pageable_fwd_nbr_spinor[dir]=malloc(nbytes[dir]);
    pageable_back_nbr_spinor[dir]=malloc(nbytes[dir]);
    
    if (pageable_fwd_nbr_spinor[dir] == NULL || pageable_back_nbr_spinor[dir] == NULL)
      errorQuda("malloc failed for pageable_fwd_nbr_spinor/pageable_back_nbr_spinor"); 
#endif
    
  }
  
  return;
}

FaceBuffer::FaceBuffer(const FaceBuffer &face) {
  errorQuda("FaceBuffer copy constructor not implemented");
}

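// Compute the local volume, the checkerboarded (even/odd) volume, and for each
// dimension the face volume (the product of the extents of the other
// dimensions) together with its checkerboarded half.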
// X here is a checkerboarded volume
void FaceBuffer::setupDims(const int* X)
{
  Volume = 1;
  for (int d=0; d<nDim; d++) {
    this->X[d] = X[d];
    Volume *= this->X[d];    
  }
  VolumeCB = Volume/2;

  for (int i=0; i<nDim; i++) {
    faceVolume[i] = 1;
    for (int j=0; j<nDim; j++) {
      if (i==j) continue;
      faceVolume[i] *= this->X[j];
    }
    faceVolumeCB[i] = faceVolume[i]/2;
  }
}

FaceBuffer::~FaceBuffer()
{
  
  for(int i=0;i < QUDA_MAX_DIM; i++){
    free((void*)recv_request1[i]);
    free((void*)recv_request2[i]);
    free((void*)send_request1[i]);
    free((void*)send_request2[i]);
  }

  for(int dir =0; dir < 4; dir++){
    if(fwd_nbr_spinor_sendbuf[dir]) {
      cudaFreeHost(fwd_nbr_spinor_sendbuf[dir]);
      fwd_nbr_spinor_sendbuf[dir] = NULL;
    }
    if(back_nbr_spinor_sendbuf[dir]) {
      cudaFreeHost(back_nbr_spinor_sendbuf[dir]);
      back_nbr_spinor_sendbuf[dir] = NULL;
    }
    if(fwd_nbr_spinor[dir]) {
      cudaFreeHost(fwd_nbr_spinor[dir]);
      fwd_nbr_spinor[dir] = NULL;
    }
    if(back_nbr_spinor[dir]) {
      cudaFreeHost(back_nbr_spinor[dir]);
      back_nbr_spinor[dir] = NULL;
    }    

#ifdef GPU_DIRECT
    pageable_fwd_nbr_spinor_sendbuf[dir] = NULL;
    pageable_back_nbr_spinor_sendbuf[dir]=NULL;
    pageable_fwd_nbr_spinor[dir]=NULL;
    pageable_back_nbr_spinor[dir]=NULL;
#else
    if(pageable_fwd_nbr_spinor_sendbuf[dir]){
      free(pageable_fwd_nbr_spinor_sendbuf[dir]);
      pageable_fwd_nbr_spinor_sendbuf[dir] = NULL;
    }

    if(pageable_back_nbr_spinor_sendbuf[dir]){
      free(pageable_back_nbr_spinor_sendbuf[dir]);
      pageable_back_nbr_spinor_sendbuf[dir]=NULL;
    }
    
    if(pageable_fwd_nbr_spinor[dir]){
      free(pageable_fwd_nbr_spinor[dir]);
      pageable_fwd_nbr_spinor[dir]=NULL;
    }
    
    if(pageable_back_nbr_spinor[dir]){
      free(pageable_back_nbr_spinor[dir]);
      pageable_back_nbr_spinor[dir]=NULL;
    }
#endif

    
  }
}

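// The spinor halo exchange is split into five stages:
//   pack()       - pack the ghost faces on the GPU
//   gather()     - copy the packed faces into the pinned host send buffers
//   commsStart() - post the MPI receive and send for one direction
//   commsQuery() - poll for completion of the MPI transfers
//   scatter()    - copy the received faces back to the GPU and unpack them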
void FaceBuffer::pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream_p)
{
  if(!commDimPartitioned(dim)) return;

  in.allocateGhostBuffer();   // allocate the ghost buffer if not yet allocated  
  stream = stream_p;

  in.packGhost(dim, (QudaParity)parity, dagger, &stream[Nstream-1]);
}

void FaceBuffer::gather(cudaColorSpinorField &in, int dagger, int dir)
{
  int dim = dir/2;
  if(!commDimPartitioned(dim)) return;

  if (dir%2==0){ // backwards send
    in.sendGhost(back_nbr_spinor_sendbuf[dim], dim, QUDA_BACKWARDS, dagger, &stream[2*dim + sendBackStrmIdx]);
  } else { // forwards send
    in.sendGhost(fwd_nbr_spinor_sendbuf[dim], dim, QUDA_FORWARDS, dagger, &stream[2*dim + sendFwdStrmIdx]); 
  }
}

void FaceBuffer::commsStart(int dir) { 
  int dim = dir / 2;
  if(!commDimPartitioned(dim)) return;

  int back_nbr[4] = {X_BACK_NBR,Y_BACK_NBR,Z_BACK_NBR,T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR,Y_FWD_NBR,Z_FWD_NBR,T_FWD_NBR};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  int uptags[4] = {XUP, YUP, ZUP, TUP};

  if (dir %2 == 0) {
    // Prepost all receives

    comm_recv_with_tag(pageable_fwd_nbr_spinor[dim], nbytes[dim], fwd_nbr[dim], downtags[dim], recv_request1[dim]);
#ifndef GPU_DIRECT
    memcpy(pageable_back_nbr_spinor_sendbuf[dim], 
           back_nbr_spinor_sendbuf[dim], nbytes[dim]);
#endif
    comm_send_with_tag(pageable_back_nbr_spinor_sendbuf[dim], nbytes[dim], back_nbr[dim], downtags[dim], send_request1[dim]);
  } else {

    comm_recv_with_tag(pageable_back_nbr_spinor[dim], nbytes[dim], back_nbr[dim], uptags[dim], recv_request2[dim]);
#ifndef GPU_DIRECT
    memcpy(pageable_fwd_nbr_spinor_sendbuf[dim], 
           fwd_nbr_spinor_sendbuf[dim], nbytes[dim]);
#endif
    comm_send_with_tag(pageable_fwd_nbr_spinor_sendbuf[dim], nbytes[dim], fwd_nbr[dim], uptags[dim], send_request2[dim]);
  }
}


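// Poll the outstanding receive and send for the given direction; returns 1 once
// both have completed (copying the received data out of the pageable staging
// buffer into the pinned buffer when GPU_DIRECT is not enabled), 0 otherwise.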
int FaceBuffer::commsQuery(int dir) {
  int dim = dir / 2;
  if(!commDimPartitioned(dim)) return 0;

  if(dir%2==0) {
    if (comm_query(recv_request1[dim]) && 
        comm_query(send_request1[dim])) {
#ifndef GPU_DIRECT
      memcpy(fwd_nbr_spinor[dim], pageable_fwd_nbr_spinor[dim], nbytes[dim]);
#endif
      return 1;
    }
  } else {
    if (comm_query(recv_request2[dim]) &&
        comm_query(send_request2[dim])) {
#ifndef GPU_DIRECT
      memcpy(back_nbr_spinor[dim], pageable_back_nbr_spinor[dim], nbytes[dim]);
#endif
      return 1;
    }
  }

  return 0;
}

void FaceBuffer::scatter(cudaColorSpinorField &out, int dagger, int dir) {
  int dim = dir / 2;
  if(!commDimPartitioned(dim)) return;

  if (dir%2 == 0) {
    out.unpackGhost(fwd_nbr_spinor[dim], dim, QUDA_FORWARDS,  dagger, &stream[2*dim + recFwdStrmIdx]); 
  } else {
    out.unpackGhost(back_nbr_spinor[dim], dim, QUDA_BACKWARDS,  dagger, &stream[2*dim + recBackStrmIdx]);
  }
}

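// Blocking ghost exchange for a CPU spinor field: pack both faces of every
// dimension, post all sends and receives, and wait for them to complete.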
void FaceBuffer::exchangeCpuSpinor(cpuColorSpinorField &spinor, int oddBit, int dagger)
{

  //for all dimensions
  int len[4] = {
    nFace*faceVolumeCB[0]*Ninternal*precision,
    nFace*faceVolumeCB[1]*Ninternal*precision,
    nFace*faceVolumeCB[2]*Ninternal*precision,
    nFace*faceVolumeCB[3]*Ninternal*precision
  };

  // allocate the ghost buffer if not yet allocated
  spinor.allocateGhostBuffer();

  for(int i=0;i < 4; i++){
    spinor.packGhost(spinor.backGhostFaceSendBuffer[i], i, QUDA_BACKWARDS, (QudaParity)oddBit, dagger);
    spinor.packGhost(spinor.fwdGhostFaceSendBuffer[i], i, QUDA_FORWARDS, (QudaParity)oddBit, dagger);
  }

  int back_nbr[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR,T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR,T_FWD_NBR};
  int uptags[4] = {XUP, YUP, ZUP, TUP};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  
  for(int i= 0;i < 4; i++){
    comm_recv_with_tag(spinor.backGhostFaceBuffer[i], len[i], back_nbr[i], uptags[i], recv_request1[i]);
    comm_recv_with_tag(spinor.fwdGhostFaceBuffer[i], len[i], fwd_nbr[i], downtags[i], recv_request2[i]);    
    comm_send_with_tag(spinor.fwdGhostFaceSendBuffer[i], len[i], fwd_nbr[i], uptags[i], send_request1[i]);
    comm_send_with_tag(spinor.backGhostFaceSendBuffer[i], len[i], back_nbr[i], downtags[i], send_request2[i]);
  }

  for(int i=0;i < 4;i++){
    comm_wait(recv_request1[i]);
    comm_wait(recv_request2[i]);
    comm_wait(send_request1[i]);
    comm_wait(send_request2[i]);
  }

}


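// Blocking exchange of CPU gauge-link faces: for each dimension, receive the
// ghost links from the backward neighbor and send this node's boundary links
// to the forward neighbor.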
void FaceBuffer::exchangeCpuLink(void** ghost_link, void** link_sendbuf) {
  int uptags[4] = {XUP, YUP, ZUP,TUP};
  int fwd_nbrs[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_nbrs[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};

  for(int dir =0; dir < 4; dir++)
    {
      int len = 2*nFace*faceVolumeCB[dir]*Ninternal;
      MPI_Request recv_request;
      comm_recv_with_tag(ghost_link[dir], len*precision, back_nbrs[dir], uptags[dir], &recv_request);
      MPI_Request send_request;
      comm_send_with_tag(link_sendbuf[dir], len*precision, fwd_nbrs[dir], uptags[dir], &send_request);
      comm_wait(&recv_request);
      comm_wait(&send_request);
    }
}

void reduceMaxDouble(double &max) {

#ifdef MPI_COMMS
  comm_allreduce_max(&max);
#endif

}
void reduceDouble(double &sum) {

#ifdef MPI_COMMS
  if (globalReduce) comm_allreduce(&sum);
#endif

}

void reduceDoubleArray(double *sum, const int len) {

#ifdef MPI_COMMS
  if (globalReduce) comm_allreduce_array(sum, len);
#endif

}

int commDim(int dir) { return comm_dim(dir); }

int commCoords(int dir) { return comm_coords(dir); }

int commDimPartitioned(int dir){ return comm_dim_partitioned(dir);}

void commDimPartitionedSet(int dir) { comm_dim_partitioned_set(dir);}

void commBarrier() { comm_barrier(); }



/**************************************************************
 * Staple exchange routine
 * used in fat link computation
 ***************************************************************/
#if defined(GPU_FATLINK)||defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE)

#define gaugeSiteSize 18

#ifndef GPU_DIRECT
static void* fwd_nbr_staple_cpu[4];
static void* back_nbr_staple_cpu[4];
static void* fwd_nbr_staple_sendbuf_cpu[4];
static void* back_nbr_staple_sendbuf_cpu[4];
#endif

static void* fwd_nbr_staple_gpu[4];
static void* back_nbr_staple_gpu[4];

static void* fwd_nbr_staple[4];
static void* back_nbr_staple[4];
static void* fwd_nbr_staple_sendbuf[4];
static void* back_nbr_staple_sendbuf[4];

static int dims[4];
static int X1,X2,X3,X4;
static int V;
static int Vh;
static int Vs[4], Vsh[4];
static int Vs_x, Vs_y, Vs_z, Vs_t;
static int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
static MPI_Request llfat_send_request1[4];
static MPI_Request llfat_recv_request1[4];
static MPI_Request llfat_send_request2[4];
static MPI_Request llfat_recv_request2[4];

#include "gauge_field.h"
extern void setup_dims_in_gauge(int *XX);

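// Cache the local lattice dimensions and derive the full and checkerboarded
// volumes (V, Vh) and the surface volumes Vs[]/Vsh[] used by the staple and
// site-link exchanges below.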
static void
setup_dims(int* X)
{
  V = 1;
  for (int d=0; d< 4; d++) {
    V *= X[d];
    dims[d] = X[d];
  }
  Vh = V/2;
  
  X1=X[0];
  X2=X[1];
  X3=X[2];
  X4=X[3];


  Vs[0] = Vs_x = X[1]*X[2]*X[3];
  Vs[1] = Vs_y = X[0]*X[2]*X[3];
  Vs[2] = Vs_z = X[0]*X[1]*X[3];
  Vs[3] = Vs_t = X[0]*X[1]*X[2];

  Vsh[0] = Vsh_x = Vs_x/2;
  Vsh[1] = Vsh_y = Vs_y/2;
  Vsh[2] = Vsh_z = Vs_z/2;
  Vsh[3] = Vsh_t = Vs_t/2;

}

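// One-time allocation of the device, pinned-host, and pageable staging buffers
// used by the GPU staple exchange routines below.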
void 
exchange_llfat_init(QudaPrecision prec)
{
  static int initialized = 0;
  if (initialized){
    return;
  }
  initialized = 1;
  

  for(int i=0;i < 4; i++){
    if(cudaMalloc((void**)&fwd_nbr_staple_gpu[i], Vs[i]*gaugeSiteSize*prec) != cudaSuccess){
      errorQuda("cudaMalloc() failed for fwd_nbr_staple_gpu\n");
    }
    if(cudaMalloc((void**)&back_nbr_staple_gpu[i], Vs[i]*gaugeSiteSize*prec) != cudaSuccess){
      errorQuda("cudaMalloc() failed for back_nbr_staple_gpu\n");
    }

    cudaMallocHost((void**)&fwd_nbr_staple[i], Vs[i]*gaugeSiteSize*prec);
    cudaMallocHost((void**)&back_nbr_staple[i], Vs[i]*gaugeSiteSize*prec);

    cudaMallocHost((void**)&fwd_nbr_staple_sendbuf[i], Vs[i]*gaugeSiteSize*prec);
    cudaMallocHost((void**)&back_nbr_staple_sendbuf[i], Vs[i]*gaugeSiteSize*prec);
  }

  

#ifndef GPU_DIRECT
  for(int i=0;i < 4; i++){
    fwd_nbr_staple_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    back_nbr_staple_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    if (fwd_nbr_staple_cpu[i] == NULL||back_nbr_staple_cpu[i] == NULL){
      printf("ERROR: malloc failed for fwd_nbr_staple_cpu/back_nbr_staple_cpu\n");
      comm_exit(1);
    }

  }

  for(int i=0;i < 4; i++){
    fwd_nbr_staple_sendbuf_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    back_nbr_staple_sendbuf_cpu[i] = malloc(Vs[i]*gaugeSiteSize*prec);
    if (fwd_nbr_staple_sendbuf_cpu[i] == NULL || back_nbr_staple_sendbuf_cpu[i] == NULL){
      printf("ERROR: malloc failed for fwd_nbr_staple_sendbuf/back_nbr_staple_sendbuf\n");
      comm_exit(1);
    }
  }
#endif
  
  return;
}

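// Exchange the "diagonal" ghost site links needed by the fat-link kernels: for
// every ordered pair (nu,mu) the data comes from the neighbor displaced by
// dx[nu]=-1, dx[mu]=+1 (see the comment inside the function body).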
template<typename Float>
void
exchange_sitelink_diag(int* X, Float** sitelink,  Float** ghost_sitelink_diag, int optflag)
{
  /*
    nu |          |
       |__________|
           mu 

  * There are in total 12 different combinations of (nu,mu),
  * since nu,mu = X,Y,Z,T and nu != mu.
  * For each combination we need to communicate with the corresponding
  * neighbor and get the diagonal ghost data.
  * The neighbor we get data from is at dx[nu]=-1, dx[mu]=+1,
  * and we send our data to the neighbor at dx[nu]=+1, dx[mu]=-1.
  */
  
  for(int nu = XUP; nu <=TUP; nu++){
    for(int mu = XUP; mu <= TUP; mu++){
      if(nu == mu){
        continue;
      }
      if(optflag && (!commDimPartitioned(mu) || !commDimPartitioned(nu))){
        continue;
      }

      int dir1, dir2; //other two dimensions
      for(dir1=0; dir1 < 4; dir1 ++){
        if(dir1 != nu && dir1 != mu){
          break;
        }
      }
      for(dir2=0; dir2 < 4; dir2 ++){
        if(dir2 != nu && dir2 != mu && dir2 != dir1){
          break;
        }
      }  
      
      if(dir1 == 4 || dir2 == 4){
        errorQuda("Invalid dir1/dir2");
      }
      int len = X[dir1]*X[dir2]*gaugeSiteSize*sizeof(Float);
      void* sendbuf = malloc(len);
      if(sendbuf == NULL){
        errorQuda("Malloc failed for diag sendbuf\n");
      }
      
      pack_gauge_diag(sendbuf, X, (void**)sitelink, nu, mu, dir1, dir2, (QudaPrecision)sizeof(Float));
  
      //post recv
      int dx[4]={0,0,0,0};
      dx[nu]=-1;
      dx[mu]=+1;
      int src_rank = comm_get_neighbor_rank(dx[0], dx[1], dx[2], dx[3]);
      MPI_Request recv_request;
      comm_recv_from_rank(ghost_sitelink_diag[nu*4+mu], len, src_rank, &recv_request);
      //do send
      dx[nu]=+1;
      dx[mu]=-1;
      int dst_rank = comm_get_neighbor_rank(dx[0], dx[1], dx[2], dx[3]);
      MPI_Request send_request;
      comm_send_to_rank(sendbuf, len, dst_rank, &send_request);
      
      comm_wait(&recv_request);
      comm_wait(&send_request);
            
      free(sendbuf);
    }//mu
  }//nu
}


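// Pack this node's boundary site links into the send buffers (all four link
// directions, one face deep), exchange them with the forward and backward
// neighbors in every partitioned dimension, and finally exchange the diagonal
// ghost links.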
template<typename Float>
void
exchange_sitelink(int*X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag, 
                  Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf, int optflag)
{


#if 0
  int i;
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  for(i=0;i < 4;i++){
    Float* even_sitelink_back_src = sitelink[i];
    Float* odd_sitelink_back_src = sitelink[i] + Vh*gaugeSiteSize;
    Float* sitelink_back_dst = sitelink_back_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;

    if(dims[3] % 2 == 0){    
      memcpy(sitelink_back_dst, even_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, odd_sitelink_back_src, len);
    }else{
      //switching odd and even ghost sitelink
      memcpy(sitelink_back_dst, odd_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, even_sitelink_back_src, len);
    }
  }

  for(i=0;i < 4;i++){
    Float* even_sitelink_fwd_src = sitelink[i] + (Vh - Vsh_t)*gaugeSiteSize;
    Float* odd_sitelink_fwd_src = sitelink[i] + Vh*gaugeSiteSize + (Vh - Vsh_t)*gaugeSiteSize;
    Float* sitelink_fwd_dst = sitelink_fwd_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;
    if(dims[3] % 2 == 0){    
      memcpy(sitelink_fwd_dst, even_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, odd_sitelink_fwd_src, len);
    }else{
      //switching odd and even ghost sitelink
      memcpy(sitelink_fwd_dst, odd_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, even_sitelink_fwd_src, len);
    }
    
  }
#else
  int nFace =1;
  for(int dir=0; dir < 4; dir++){
    if(optflag && !commDimPartitioned(dir)) continue;
    pack_ghost_all_links((void**)sitelink, (void**)sitelink_back_sendbuf, (void**)sitelink_fwd_sendbuf, dir, nFace, (QudaPrecision)(sizeof(Float)), X);
  }
#endif


  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  for(int dir  =0; dir < 4; dir++){
    if(optflag && !commDimPartitioned(dir)) continue;
    int len = Vsh[dir]*gaugeSiteSize*sizeof(Float);
    Float* ghost_sitelink_back = ghost_sitelink[dir];
    Float* ghost_sitelink_fwd = ghost_sitelink[dir] + 8*Vsh[dir]*gaugeSiteSize;
    
    MPI_Request recv_request1;
    MPI_Request recv_request2;
    comm_recv_with_tag(ghost_sitelink_back, 8*len, back_neighbors[dir], up_tags[dir], &recv_request1);
    comm_recv_with_tag(ghost_sitelink_fwd, 8*len, fwd_neighbors[dir], down_tags[dir], &recv_request2);
    MPI_Request send_request1; 
    MPI_Request send_request2;
    comm_send_with_tag(sitelink_fwd_sendbuf[dir], 8*len, fwd_neighbors[dir], up_tags[dir], &send_request1);
    comm_send_with_tag(sitelink_back_sendbuf[dir], 8*len, back_neighbors[dir], down_tags[dir], &send_request2);
    comm_wait(&recv_request1);
    comm_wait(&recv_request2);
    comm_wait(&send_request1);
    comm_wait(&send_request2);
  }

  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
}

//this function is used for the link fattening computation
//@optflag: if this flag is set, we only communicate in directions that are partitioned;
//          if not set, we communicate in all directions regardless of partitioning
void exchange_cpu_sitelink(int* X,
                           void** sitelink, void** ghost_sitelink,
                           void** ghost_sitelink_diag,
                           QudaPrecision gPrecision, QudaGaugeParam* param, int optflag)
{  
  setup_dims(X);
  static void*  sitelink_fwd_sendbuf[4];
  static void*  sitelink_back_sendbuf[4];
  static int allocated = 0;

  if(!allocated){
    for(int i=0;i < 4;i++){
      sitelink_fwd_sendbuf[i] = malloc(4*Vs[i]*gaugeSiteSize*gPrecision);
      sitelink_back_sendbuf[i] = malloc(4*Vs[i]*gaugeSiteSize*gPrecision);
      if (sitelink_fwd_sendbuf[i] == NULL|| sitelink_back_sendbuf[i] == NULL){
        errorQuda("ERROR: malloc failed for sitelink_fwd_sendbuf/sitelink_back_sendbuf\n");
      }  
      memset(sitelink_fwd_sendbuf[i], 0, 4*Vs[i]*gaugeSiteSize*gPrecision);
      memset(sitelink_back_sendbuf[i], 0, 4*Vs[i]*gaugeSiteSize*gPrecision);
    }
    allocated = 1;
  }
  
  if (gPrecision == QUDA_DOUBLE_PRECISION){
    exchange_sitelink(X, (double**)sitelink, (double**)(ghost_sitelink), (double**)ghost_sitelink_diag, 
                      (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
  }else{ //single
    exchange_sitelink(X, (float**)sitelink, (float**)(ghost_sitelink), (float**)ghost_sitelink_diag, 
                      (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
  }
  
  if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)){
    for(int i=0;i < 4;i++){
      free(sitelink_fwd_sendbuf[i]);
      free(sitelink_back_sendbuf[i]);
    }
    allocated = 0;
  }
}



/* This function exchanges the site links and stores them in the corresponding portion of 
 * the extended sitelink memory region
 * @sitelink: this is stored according to dimension size (X1+4) (X2+4) (X3+4) (X4+4)
 */

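// The extended local volume has dimensions (X[d]+4), i.e. a two-site-deep halo
// on each side: the interior data occupies coordinates 2..X[d]+1 in each
// dimension and the ghost slices are written into the outer two layers.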
void exchange_cpu_sitelink_ex(int* X, void** sitelink, QudaGaugeFieldOrder cpu_order,
                              QudaPrecision gPrecision, int optflag)
{
  int X1,X2,X3,X4;
  int E1,E2,E3,E4;  
  X1 = X[0]; X2 = X[1]; X3 = X[2]; X4 = X[3]; 
  E1 = X[0]+4; E2 = X[1]+4; E3 = X[2]+4; E4 = X[3]+4; 
  int E3E2E1=E3*E2*E1;
  int E2E1=E2*E1;
  int E4E3E2=E4*E3*E2;
  int E3E2=E3*E2;
  int E4E3E1=E4*E3*E1;
  int E3E1=E3*E1;
  int E4E2E1=E4*E2*E1;
  int Vh_ex = E4*E3*E2*E1/2;
  
  //...............x.........y.....z......t
  int starta[] = {2,      2,       2,     0};
  int enda[]   = {X4+2,   X4+2,    X4+2,  X3+4};
  
  int startb[] = {2,      2,       0,     0};
  int endb[]   = {X3+2,   X3+2,    X2+4,  X2+4};
  
  int startc[] = {2,      0,       0,     0};
  int endc[]   = {X2+2,   X1+4,    X1+4,  X1+4};
  
  int f_main[4][4] = {
    {E3E2E1,    E2E1, E1,     1},
    {E3E2E1,    E2E1,    1,  E1},
    {E3E2E1,  E1,    1,    E2E1},
    {E2E1,  E1,    1,   E3E2E1}
  };  
  
  int f_bound[4][4]={
    {E3E2, E2, 1, E4E3E2},
    {E3E1, E1, 1, E4E3E1}, 
    {E2E1, E1, 1, E4E2E1},
    {E2E1, E1, 1, E3E2E1}
  };
  
  int nslices = 2;
  int slice_3d[] = { E4E3E2, E4E3E1, E4E2E1, E3E2E1};  
  int len[4];
  for(int i=0;i < 4;i++){
    len[i] = slice_3d[i] * nslices* 4*gaugeSiteSize*gPrecision; //2 slices, 4 directions' links
  }
  void* ghost_sitelink_fwd_sendbuf[4];
  void* ghost_sitelink_back_sendbuf[4];
  void* ghost_sitelink_fwd[4];
  void* ghost_sitelink_back[4];  
  for(int i=0;i < 4;i++){    
    if(!commDimPartitioned(i)) continue;
    ghost_sitelink_fwd_sendbuf[i] = malloc(len[i]);
    ghost_sitelink_back_sendbuf[i] = malloc(len[i]);
    if(ghost_sitelink_fwd_sendbuf[i] == NULL || ghost_sitelink_back_sendbuf[i] == NULL){
      errorQuda("Error: malloc for ghost sitelink send buffer failed\n");
    } 
    
    ghost_sitelink_fwd[i] = malloc(len[i]);
    ghost_sitelink_back[i] = malloc(len[i]);
    if(ghost_sitelink_fwd[i] == NULL || ghost_sitelink_back[i] == NULL){
      errorQuda("Error: malloc for ghost sitelink failed\n");
    } 
    
  }

  int back_nbr[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR,T_BACK_NBR};
  int fwd_nbr[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR,T_FWD_NBR};
  int uptags[4] = {XUP, YUP, ZUP, TUP};
  int downtags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  MPI_Request recv_request1[4], recv_request2[4];
  MPI_Request send_request1[4], send_request2[4];
  
  int gaugebytes = gaugeSiteSize*gPrecision;
  int a, b, c,d;
  for(int dir =0;dir < 4;dir++){
    if( (!commDimPartitioned(dir)) && optflag) continue;
    if(commDimPartitioned(dir)){
      //fill the sendbuf here
      //back
      for(d=2; d < 4; d++)
        for(a=starta[dir];a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)

            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-2)*f_bound[dir][3])>> 1;        
                
                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                  dst_oddness = 1-oddness;
                }

#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir) \
                if(src_oddness){                                        \
                  src_idx += Vh_ex;                                     \
                }                                                       \
                if(dst_oddness){                                        \
                  dst_idx += nslices*slice_3d[dir]/2;                   \
                }                                                       \
                if(cpu_order == QUDA_QDP_GAUGE_ORDER){                  \
                  for(int linkdir=0; linkdir < 4; linkdir++){           \
                    char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \
                    char* dst = ((char*)ghost_buf[dir])+ linkdir*nslices*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
                    memcpy(dst, src, gaugebytes*(num));                 \
                  }                                                     \
                }else{  /*QUDA_MILC_GAUGE_ORDER*/                       \
                  char* src = ((char*)sitelink)+ 4*(src_idx)*gaugebytes; \
                  char* dst = ((char*)ghost_buf[dir]) + 4*(dst_idx)*gaugebytes; \
                  memcpy(dst, src, 4*gaugebytes*(num));         \
                }                                                       \

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir);              

              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-2)*f_bound[dir][3])>> 1;              
                  
                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                    dst_oddness = 1-oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir] -c +1)/2, dir);  

                }//if c
              }//for loop
            }//if
      
      
      //fwd
      for(d=X[dir]; d < X[dir]+2; d++)
        for(a=starta[dir];a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)
            
            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
            for (c=startc[dir]; c < endc[dir]; c++){
              int oddness = (a+b+c+d)%2;
              int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
              int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;

              int src_oddness = oddness;
              int dst_oddness = oddness;
              if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                dst_oddness = 1-oddness;
              }

              MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1,dir);
            }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
                  
                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                    dst_oddness = 1-oddness;
                  }
                  
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir] -c +1)/2,dir);    


                }
              }//for loop
            }//if
      comm_recv_with_tag(ghost_sitelink_back[dir], len[dir], back_nbr[dir], uptags[dir], &recv_request1[dir]);
      comm_recv_with_tag(ghost_sitelink_fwd[dir], len[dir], fwd_nbr[dir], downtags[dir], &recv_request2[dir]);
      comm_send_with_tag(ghost_sitelink_fwd_sendbuf[dir], len[dir], fwd_nbr[dir], uptags[dir], &send_request1[dir]);
      comm_send_with_tag(ghost_sitelink_back_sendbuf[dir], len[dir], back_nbr[dir], downtags[dir], &send_request2[dir]);    
      
      //need the messages to be here before we can send the next messages
      comm_wait(&recv_request1[dir]);
      comm_wait(&recv_request2[dir]);
      comm_wait(&send_request1[dir]);
      comm_wait(&send_request2[dir]);
    }//if

    //use the messages to fill the sitelink data
    //back
    if (dir < 3 ){
      for(d=0; d < 2; d++)
        for(a=starta[dir];a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)
            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
                }else{
                  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
                }
                /*
                if(oddness){
                  if(commDimPartitioned(dir)){
                    src_idx += nslices*slice_3d[dir]/2;
                  }else{
                    src_idx += Vh_ex;
                  }
                  dst_idx += Vh_ex;
                }
                */
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir) \
                if(oddness){                                            \
                  if(commDimPartitioned(dir)){                          \
                    src_idx += nslices*slice_3d[dir]/2;                 \
                  }else{                                                \
                    src_idx += Vh_ex;                                   \
                  }                                                     \
                  dst_idx += Vh_ex;                                     \
                }                                                       \
                if(cpu_order == QUDA_QDP_GAUGE_ORDER){                  \
                  for(int linkdir=0; linkdir < 4; linkdir++){           \
                    char* src;                                          \
                    if(commDimPartitioned(dir)){                        \
                      src = ((char*)ghost_buf[dir])+ linkdir*nslices*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
                    }else{                                              \
                      src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes; \
                    }                                                   \
                    char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \
                    memcpy(dst, src, gaugebytes*(num));                 \
                  }                                                     \
                }else{/*QUDA_MILC_GAUGE_FIELD*/                         \
                  char* src;                                            \
                  if(commDimPartitioned(dir)){                          \
                    src=((char*)ghost_buf[dir]) + 4*(src_idx)*gaugebytes; \
                  }else{                                                \
                    src = ((char*)sitelink)+ 4*(src_idx)*gaugebytes;    \
                  }                                                     \
                  char* dst = ((char*)sitelink) + 4*(dst_idx)*gaugebytes; \
                  memcpy(dst, src, 4*gaugebytes*(num));                 \
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir);
                
              }//c    
            }else{
              //optimized copy
              //first half:   startc[dir] -> end[dir] with step=2

              for(int loop =0;loop <2;loop++){
                int c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
                  }else{
                    src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir);

                }//if c                 
              }//for loop             

            }//if
    }else{
      //when dir == 3 (T direction), the data layout in sitelink and in the message is the same, so we can do large copies
      
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir) \
        /*even*/                                                        \
        int even_dst_idx = (dst_face*E3E2E1)/2;                         \
        int even_src_idx;                                               \
        if(commDimPartitioned(dir)){                                    \
          even_src_idx = 0;                                             \
        }else{                                                          \
          even_src_idx = (src_face*E3E2E1)/2;                           \
        }                                                               \
        /*odd*/                                                         \
        int odd_dst_idx = even_dst_idx+Vh_ex;                           \
        int odd_src_idx;                                                \
        if(commDimPartitioned(dir)){                                    \
          odd_src_idx = nslices*slice_3d[dir]/2;                        \
        }else{                                                          \
          odd_src_idx = even_src_idx+Vh_ex;                             \
        }                                                               \
        if(cpu_order == QUDA_QDP_GAUGE_ORDER){                          \
          for(int linkdir=0; linkdir < 4; linkdir ++){                  \
            char* dst = (char*)sitelink[linkdir];                       \
            char* src;                                                  \
            if(commDimPartitioned(dir)){                                \
              src = ((char*)ghost_buf[dir]) + linkdir*nslices*slice_3d[dir]*gaugebytes; \
            }else{                                                      \
              src = (char*)sitelink[linkdir];                           \
            }                                                           \
            memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, nslices*slice_3d[dir]*gaugebytes/2); \
            memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, nslices*slice_3d[dir]*gaugebytes/2); \
          }                                                             \
        }else{/*QUDA_MILC_GAUGE_ORDER*/                                 \
          char* dst = (char*)sitelink;                                  \
          char* src;                                                    \
          if(commDimPartitioned(dir)){                                  \
            src = (char*)ghost_buf[dir];                                \
          }else{                                                        \
            src = (char*)sitelink;                                      \
          }                                                             \
          memcpy(dst+4*even_dst_idx*gaugebytes, src+4*even_src_idx*gaugebytes, 4*nslices*slice_3d[dir]*gaugebytes/2); \
          memcpy(dst+4*odd_dst_idx*gaugebytes, src+4*odd_src_idx*gaugebytes, 4*nslices*slice_3d[dir]*gaugebytes/2); \
        }      

      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X4, dir)
    }//if
    
    //fwd
    if( dir < 3 ){
      for(d=X[dir]+2; d < X[dir]+4; d++)
        for(a=starta[dir];a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-2)*f_bound[dir][3])>> 1;
                }else{
                  src_idx =  ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir);

              }//c
            }else{
              for(int loop =0; loop < 2; loop++){
                //for (c=startc[dir]; c < endc[dir]; c++){
                c=startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-2)*f_bound[dir][3])>> 1;
                  }else{
                    src_idx =  ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir);
                }//if
                
              }//for loop
              
            }//if 
      

    }else{
      //when dir == 3 (T direction), the data layout in sitelink and in the message is the same, so we can do large copies
      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X4+2), 2, dir)

    }//if    

  }//dir for loop
    
  
  for(int dir=0;dir < 4;dir++){
    if(!commDimPartitioned(dir)) continue;
    free(ghost_sitelink_fwd_sendbuf[dir]);
    free(ghost_sitelink_back_sendbuf[dir]);    
    free(ghost_sitelink_fwd[dir]);
    free(ghost_sitelink_back[dir]);    
  }
  
  
}


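// Pack the staple boundary faces into host send buffers and perform a blocking
// exchange with the forward and backward neighbors in all four dimensions.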
template<typename Float>
void
do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf, Float** staple_back_sendbuf, int* X)
{


#if 0  
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  Float* even_staple_back_src = staple;
  Float* odd_staple_back_src = staple + Vh*gaugeSiteSize;
  Float* staple_back_dst = staple_back_sendbuf[3];
  
  if(dims[3] % 2 == 0){    
    memcpy(staple_back_dst, even_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, odd_staple_back_src, len);
  }else{
    //switching odd and even ghost staple
    memcpy(staple_back_dst, odd_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, even_staple_back_src, len);
  }
  
  
  Float* even_staple_fwd_src = staple + (Vh - Vsh_t)*gaugeSiteSize;
  Float* odd_staple_fwd_src = staple + Vh*gaugeSiteSize + (Vh - Vsh_t)*gaugeSiteSize;
  Float* staple_fwd_dst = staple_fwd_sendbuf[3];
  if(dims[3] % 2 == 0){    
    memcpy(staple_fwd_dst, even_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, odd_staple_fwd_src, len);
  }else{
    //switching odd and even ghost staple
    memcpy(staple_fwd_dst, odd_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, even_staple_fwd_src, len);
  }
#else
  int nFace =1;
  pack_ghost_all_staples_cpu(staple, (void**)staple_back_sendbuf, 
                             (void**)staple_fwd_sendbuf,  nFace, (QudaPrecision)(sizeof(Float)), X);

#endif  
  
  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
  int len[4] = {
    Vsh_x*gaugeSiteSize*sizeof(Float),
    Vsh_y*gaugeSiteSize*sizeof(Float),
    Vsh_z*gaugeSiteSize*sizeof(Float),
    Vsh_t*gaugeSiteSize*sizeof(Float)
  };
  
  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};
  
  for(int dir=0;dir < 4; dir++){
    Float* ghost_staple_back = ghost_staple[dir];
    Float* ghost_staple_fwd = ghost_staple[dir] + 2*Vsh[dir]*gaugeSiteSize;
    
    MPI_Request recv_request1;  
    MPI_Request recv_request2;  
    MPI_Request send_request1; 
    MPI_Request send_request2; 
    comm_recv_with_tag(ghost_staple_back, 2*len[dir], back_neighbors[dir], up_tags[dir], &recv_request1);
    comm_recv_with_tag(ghost_staple_fwd, 2*len[dir], fwd_neighbors[dir], down_tags[dir], &recv_request2);
    comm_send_with_tag(staple_fwd_sendbuf[dir], 2*len[dir], fwd_neighbors[dir], up_tags[dir], &send_request1);
    comm_send_with_tag(staple_back_sendbuf[dir], 2*len[dir], back_neighbors[dir], down_tags[dir], &send_request2);
    
    comm_wait(&recv_request1);
    comm_wait(&recv_request2);
    comm_wait(&send_request1);
    comm_wait(&send_request2);
  }
}
//this function is used for link fattening computation
void exchange_cpu_staple(int* X,
                         void* staple, void** ghost_staple,
                         QudaPrecision gPrecision)
{
  
  setup_dims(X);

  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
  void*  staple_fwd_sendbuf[4];
  void*  staple_back_sendbuf[4];

  for(int i=0;i < 4; i++){
    staple_fwd_sendbuf[i] = malloc(Vs[i]*gaugeSiteSize*gPrecision);
    staple_back_sendbuf[i] = malloc(Vs[i]*gaugeSiteSize*gPrecision);
    if (staple_fwd_sendbuf[i] == NULL|| staple_back_sendbuf[i] == NULL){
      printf("ERROR: malloc failed for staple_fwd_sendbuf/staple_back_sendbuf\n");
      exit(1);
    }
  }
  
  if (gPrecision == QUDA_DOUBLE_PRECISION){
    do_exchange_cpu_staple((double*)staple, (double**)ghost_staple, 
                           (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
  }else{ //single
    do_exchange_cpu_staple((float*)staple, (float**)ghost_staple, 
                           (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
  }
  
  for(int i=0;i < 4;i++){
    free(staple_fwd_sendbuf[i]);
    free(staple_back_sendbuf[i]);
  }
}

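// The GPU staple exchange is split into three stages:
//   exchange_gpu_staple_start() - call packGhostStaple() with the device
//                                 buffers and the host send buffers
//   exchange_gpu_staple_comms() - post the MPI receive and send for one
//                                 direction (staging through the pageable
//                                 _cpu buffers when GPU_DIRECT is off)
//   exchange_gpu_staple_wait()  - wait for completion and call
//                                 unpackGhostStaple() on the received data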
//@whichway indicates send direction
void
exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream)
{
  setup_dims(X);
  
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;
  exchange_llfat_init(cudaStaple->Precision());
  

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volume = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();
  
  packGhostStaple(X, even, odd, volume, prec, stride, 
                  dir, whichway, fwd_nbr_staple_gpu, back_nbr_staple_gpu,
                  fwd_nbr_staple_sendbuf, back_nbr_staple_sendbuf, stream);
}


//@whichway indicates send direction
//we use recv_whichway to indicate recv direction
void
exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;  
  QudaPrecision prec = cudaStaple->Precision();
  
  int fwd_neighbors[4] = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  int back_neighbors[4] = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  int up_tags[4] = {XUP, YUP, ZUP, TUP};
  int down_tags[4] = {XDOWN, YDOWN, ZDOWN, TDOWN};

  cudaStreamSynchronize(*stream);  

  int recv_whichway;
  if(whichway == QUDA_BACKWARDS){
    recv_whichway = QUDA_FORWARDS;
  }else{
    recv_whichway = QUDA_BACKWARDS;
  }
  

  int i = dir;
  int len = Vs[i]*gaugeSiteSize*prec;
  int normlen = Vs[i]*sizeof(float);
  
  if(recv_whichway == QUDA_BACKWARDS){   
#ifdef GPU_DIRECT
    comm_recv_with_tag(back_nbr_staple[i], len, back_neighbors[i], up_tags[i], &llfat_recv_request1[i]);
    comm_send_with_tag(fwd_nbr_staple_sendbuf[i], len, fwd_neighbors[i],  up_tags[i], &llfat_send_request1[i]);
#else
    comm_recv_with_tag(back_nbr_staple_cpu[i], len, back_neighbors[i], up_tags[i], &llfat_recv_request1[i]);
    memcpy(fwd_nbr_staple_sendbuf_cpu[i], fwd_nbr_staple_sendbuf[i], len);
    comm_send_with_tag(fwd_nbr_staple_sendbuf_cpu[i], len, fwd_neighbors[i],  up_tags[i], &llfat_send_request1[i]);
#endif
  } else { // QUDA_FORWARDS
#ifdef GPU_DIRECT
    comm_recv_with_tag(fwd_nbr_staple[i], len, fwd_neighbors[i], down_tags[i], &llfat_recv_request2[i]);
    comm_send_with_tag(back_nbr_staple_sendbuf[i], len, back_neighbors[i] ,down_tags[i], &llfat_send_request2[i]);
#else
    comm_recv_with_tag(fwd_nbr_staple_cpu[i], len, fwd_neighbors[i], down_tags[i], &llfat_recv_request2[i]);
    memcpy(back_nbr_staple_sendbuf_cpu[i], back_nbr_staple_sendbuf[i], len);
    comm_send_with_tag(back_nbr_staple_sendbuf_cpu[i], len, back_neighbors[i] ,down_tags[i], &llfat_send_request2[i]);
#endif
  }
}


//@whichway indicates send direction
//we use recv_whichway to indicate recv direction
void
exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;  

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volume = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  int recv_whichway;
  if(whichway == QUDA_BACKWARDS){
    recv_whichway = QUDA_FORWARDS;
  }else{
    recv_whichway = QUDA_BACKWARDS;
  }
  

  int i = dir;
  int len = Vs[i]*gaugeSiteSize*prec;
  int normlen = Vs[i]*sizeof(float);
  
  if(recv_whichway == QUDA_BACKWARDS){   
    comm_wait(&llfat_recv_request1[i]);
    comm_wait(&llfat_send_request1[i]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volume, prec, stride, 
                      i, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else   
    memcpy(back_nbr_staple[i], back_nbr_staple_cpu[i], len);
    unpackGhostStaple(X, even, odd, volume, prec, stride, 
                      i, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif

  } else { // QUDA_FORWARDS
    comm_wait(&llfat_recv_request2[i]);  
    comm_wait(&llfat_send_request2[i]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volume, prec, stride, 
                      i, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else        
    memcpy(fwd_nbr_staple[i], fwd_nbr_staple_cpu[i], len);
    unpackGhostStaple(X, even, odd, volume, prec, stride,
                      i, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif

  }
}


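// Release all device, pinned-host, and pageable buffers allocated by
// exchange_llfat_init().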
void
exchange_llfat_cleanup(void)
{
  
  for(int i=0;i < 4; i++){
    if(fwd_nbr_staple_gpu[i]){
      cudaFree(fwd_nbr_staple_gpu[i]); fwd_nbr_staple_gpu[i] =NULL;
    }      
    if(back_nbr_staple_gpu[i]){
      cudaFree(back_nbr_staple_gpu[i]);back_nbr_staple_gpu[i] = NULL;
    }

  }

#ifndef GPU_DIRECT
  for(int i=0;i < 4; i++){
    if(fwd_nbr_staple_cpu[i]){
      free(fwd_nbr_staple_cpu[i]); fwd_nbr_staple_cpu[i] =NULL;
    }      
    if(back_nbr_staple_cpu[i]){
      free(back_nbr_staple_cpu[i]);back_nbr_staple_cpu[i] = NULL;
    }
  }
  for(int i=0;i < 4; i++){
    if(fwd_nbr_staple_sendbuf_cpu[i]){
      free(fwd_nbr_staple_sendbuf_cpu[i]); fwd_nbr_staple_sendbuf_cpu[i] = NULL;
    }
    if(back_nbr_staple_sendbuf_cpu[i]){
      free(back_nbr_staple_sendbuf_cpu[i]); back_nbr_staple_sendbuf_cpu[i] = NULL;
    }    
  }
#endif

  for(int i=0;i < 4; i++){
    if(fwd_nbr_staple[i]){
      cudaFreeHost(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
    }
    if(back_nbr_staple[i]){
      cudaFreeHost(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
    }
  }
  
  for(int i=0;i < 4; i++){
    if(fwd_nbr_staple_sendbuf[i]){
      cudaFreeHost(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
    }
    if(back_nbr_staple_sendbuf[i]){
      cudaFreeHost(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
    }
  }

}

#endif