QUDA v0.4.0
A library for QCD on GPUs
quda/lib/fat_force_quda.cpp
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include <typeinfo>
#include <quda.h>
#include "fat_force_quda.h"
#include <quda_internal.h>
#include <face_quda.h>
#include "misc_helpers.h"
#include <assert.h>
#include <cuda.h>

#define MAX(a,b) ((a)>(b)?(a):(b))
#define ALIGNMENT 4096

#ifdef MPI_COMMS
#include "face_quda.h"
#endif

static double anisotropy_;
extern float fat_link_max_;
static int X_[4];
static QudaTboundary t_boundary_;

#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
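// Note (editorial): these constants define the float <-> short mapping used for
// half-precision storage by the packing routines in pack_gauge.h; presumably a
// value f in [-1,1] is stored as (short)((f + SHIFT_FLOAT) * SCALE_FLOAT),
// which maps -1 to -32768 and +1 to approximately 32767.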

#include <pack_gauge.h>
#include "gauge_field.h"

/********************** Staple code, used by link fattening **************/

#if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE)


template <typename Float>
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack, Float **cpuGhostFwd, int nFace, int* X) {
  int XY=X[0]*X[1];
  int XYZ=X[0]*X[1]*X[2];
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  int faceVolumeCB[4]={
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  //loop variables: a, b, c, with a the most significant and c the least significant;
  //A, B, C are the corresponding maximum values
  //we also loop over d; for the backward face d runs over 0, ..., nFace-1 and
  //for the forward face over dims[dir]-nFace, ..., dims[dir]-1
  int A[4], B[4], C[4];

  //X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  //Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  //Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  //T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];


  //multiplication factors used to compute the index in the original cpu memory
  int f[4][4]={
    {XYZ,    XY, X[0],     1},
    {XYZ,    XY,    1,  X[0]},
    {XYZ,  X[0],    1,    XY},
    { XY,  X[0],    1,   XYZ}
  };
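  // Worked example (editorial note): for the X face (dir == 0) the loop
  // variables map to (a,b,c,d) = (t,z,y,x), so the full lexicographic site
  // index is
  //   full = a*f[0][0] + b*f[0][1] + c*f[0][2] + d*f[0][3]
  //        = t*XYZ + z*XY + y*X[0] + x
  // and "full >> 1" below is the corresponding checkerboard (even/odd) index
  // into even_src/odd_src, with the parity given by (a+b+c+d) % 2.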


  for(int ite = 0; ite < 2; ite++){
    //ite == 0: back
    //ite == 1: fwd
    Float** dst;
    if (ite == 0){
      dst = cpuGhostBack;
    }else{
      dst = cpuGhostFwd;
    }

    //collect the ghost staples in each direction
    for(int dir =0; dir < 4; dir++){
      int d;
      int a,b,c;

      //there is only one staple at each site
      for(int linkdir=0; linkdir < 1; linkdir ++){
        Float* even_src = cpuStaple;
        Float* odd_src = cpuStaple + volumeCB*gaugeSiteSize;

        Float* even_dst;
        Float* odd_dst;

        //switch the even and odd ghost staples when that dimension's size is odd;
        //only switch if X[dir] is odd and the grid size in that dimension is greater than 1
        if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
          even_dst = dst[dir];
          odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }else{
          odd_dst = dst[dir];
          even_dst = dst[dir] + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }

        int even_dst_index = 0;
        int odd_dst_index = 0;

        int startd;
        int endd;
        if(ite == 0){ //back
          startd = 0;
          endd= nFace;
        }else{//fwd
          startd = X[dir] - nFace;
          endd =X[dir];
        }
        for(d = startd; d < endd; d++){
          for(a = 0; a < A[dir]; a++){
            for(b = 0; b < B[dir]; b++){
              for(c = 0; c < C[dir]; c++){
                int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
                int oddness = (a+b+c+d)%2;
                if (oddness == 0){ //even
                  for(int i=0;i < 18;i++){
                    even_dst[18*even_dst_index+i] = even_src[18*index + i];
                  }
                  even_dst_index++;
                }else{ //odd
                  for(int i=0;i < 18;i++){
                    odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                  }
                  odd_dst_index++;
                }
              }//c
            }//b
          }//a
        }//d
        assert( even_dst_index == nFace*faceVolumeCB[dir]);
        assert( odd_dst_index == nFace*faceVolumeCB[dir]);
      }//linkdir

    }//dir
  }//ite
}


void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void** cpuGhostStapleFwd,
                                int nFace, QudaPrecision precision, int* X) {

  if (precision == QUDA_DOUBLE_PRECISION) {
    packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**) cpuGhostStapleFwd, nFace, X);
  } else {
    packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
  }

}
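
/*
  Usage sketch (illustrative only, not called anywhere in this file; the
  buffer names staple, ghostBack and ghostFwd are hypothetical).  Each ghost
  buffer must hold the even and odd halves packed above, i.e.
  2*nFace*faceVolumeCB[dir] sites of gaugeSiteSize real numbers per direction:

    int X[4] = {4, 4, 4, 8};
    int nFace = 1;
    int faceVolumeCB[4] = {X[1]*X[2]*X[3]/2, X[0]*X[2]*X[3]/2,
                           X[0]*X[1]*X[3]/2, X[0]*X[1]*X[2]/2};
    float *ghostBack[4], *ghostFwd[4];
    for (int dir = 0; dir < 4; dir++) {
      size_t bytes = 2*nFace*faceVolumeCB[dir]*gaugeSiteSize*sizeof(float);
      ghostBack[dir] = (float*)malloc(bytes);
      ghostFwd[dir]  = (float*)malloc(bytes);
    }
    pack_ghost_all_staples_cpu(staple, (void**)ghostBack, (void**)ghostFwd,
                               nFace, QUDA_SINGLE_PRECISION, X);
*/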

void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
{
    /*
      nu |          |
         |__________|
            mu
    *
    * nu, mu are the directions we are working on
    * Since we are packing our own data, we need to go to the north-west corner in the diagram,
    * i.e. x[nu] = X[nu]-1, x[mu] = 0, and loop through x[dir1], x[dir2];
    * in the remaining two directions (dir1/dir2), dir2 is the slowest-changing dimension when
    * computing the index
    */
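  /*
    Worked example (editorial note): for nu = 1 and mu = 0, the remaining two
    directions (chosen by the callers below as the first indices different
    from nu and mu) are dir1 = 2 and dir2 = 3; the packed buffer then holds
    X[2]*X[3] sites, split into even and odd halves of X[2]*X[3]/2 sites each.
  */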


  int mul_factor[4]={
    1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
  };

  int even_dst_idx = 0;
  int odd_dst_idx = 0;
  char* dst_even =(char*)buf;
  char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
  char* src_even = (char*)sitelink[nu];
  char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;

  if( (X[nu]+X[mu]) % 2 == 1){
    //the oddness changes between this site and its diagonal neighbor,
    //so switch it now
    char* tmp = dst_odd;
    dst_odd = dst_even;
    dst_even = tmp;
  }

  for(int i=0;i < X[dir2]; i++){
    for(int j=0; j < X[dir1]; j++){
      int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
      //int dst_idx = (i*X[dir1]+j) >> 1;
      int oddness = ( (X[nu]-1) + 0 + i + j) %2;

      if(oddness==0){
        for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
          memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
        }
        even_dst_idx++;
      }else{
        for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
          memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
        }
        odd_dst_idx++;
      }//if

    }//for j
  }//for i

  if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
    errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
              even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
  }
  return;

}
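
/*
  Sizing note (illustrative; the variable names below are hypothetical): the
  destination buffer passed as @buf must hold X[dir1]*X[dir2] sites, e.g.

    size_t diag_bytes = X[dir1]*X[dir2]*gaugeSiteSize*prec;
    void *diag_buf = malloc(diag_bytes);
    pack_gauge_diag(diag_buf, X, sitelink, nu, mu, dir1, dir2, prec);

  which matches the Z[dir1]*Z[dir2]*gaugeSiteSize*prec allocation made for
  ghost_cpuGauge_diag in loadLinkToGPU below.
*/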

void
packGhostStaple(int* X, void* even, void* odd, int volume, QudaPrecision prec,
                int stride,
                int dir, int whichway,
                void** fwd_nbr_buf_gpu, void** back_nbr_buf_gpu,
                void** fwd_nbr_buf, void** back_nbr_buf,
                cudaStream_t* stream)
{
  int Vs_x, Vs_y, Vs_z, Vs_t;

  Vs_x = X[1]*X[2]*X[3];
  Vs_y = X[0]*X[2]*X[3];
  Vs_z = X[0]*X[1]*X[3];
  Vs_t = X[0]*X[1]*X[2];
  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};

  if (dir != 3){ //the code would work for dir=3 as well
    //the even/odd switch (if necessary) is taken care of in collectGhostStaple()
    void* gpu_buf;
    int i =dir;
    if (whichway ==  QUDA_BACKWARDS){
      gpu_buf = back_nbr_buf_gpu[i];
      collectGhostStaple(X, even, odd, volume, prec, gpu_buf, i, whichway, stream);
      cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
    }else{//whichway is QUDA_FORWARDS
      gpu_buf = fwd_nbr_buf_gpu[i];
      collectGhostStaple(X, even, odd, volume, prec,  gpu_buf, i, whichway, stream);
      cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
    }
  }else{ //special case for dir=3 since no gather kernel is required
    int Vh = volume;
    int Vsh = X[0]*X[1]*X[2]/2;
    int sizeOfFloatN = 2*prec;
    int len = Vsh*sizeOfFloatN;
    int i;
    if(X[3] %2 == 0){
      //back,even
      for(i=0;i < 9; i++){
        void* dst = ((char*)back_nbr_buf[3]) + i*len;
        void* src = ((char*)even) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //back, odd
      for(i=0;i < 9; i++){
        void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)odd) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //fwd,even
      for(i=0;i < 9; i++){
        void* dst = ((char*)fwd_nbr_buf[3]) + i*len;
        void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //fwd, odd
      for(i=0;i < 9; i++){
        void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
    }else{
      //reverse even and odd position
      //back,odd
      for(i=0;i < 9; i++){
        void* dst = ((char*)back_nbr_buf[3]) + i*len;
        void* src = ((char*)odd) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //back, even
      for(i=0;i < 9; i++){
        void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)even) + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //fwd,odd
      for(i=0;i < 9; i++){
        void* dst = ((char*)fwd_nbr_buf[3]) + i*len;
        void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }
      //fwd, even
      for(i=0;i < 9; i++){
        void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
        void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
        cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }

    }
  }

}
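
/*
  Layout note (editorial): in the dir == 3 branch above, the ghost staple of
  each parity is copied as 9 FloatN components of Vsh sites each, so
  back_nbr_buf[3] / fwd_nbr_buf[3] are filled as

    [ 9 x len bytes (even) ][ 9 x len bytes (odd) ]   with len = Vsh*sizeOfFloatN,

  i.e. 2*9*Vsh*sizeOfFloatN = Vs_t*gaugeSiteSize*prec bytes in total, the same
  amount that the dir != 3 branch transfers after collectGhostStaple().
*/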


void
unpackGhostStaple(int* X, void* _even, void* _odd, int volume, QudaPrecision prec,
                  int stride,
                  int dir, int whichway, void** fwd_nbr_buf, void** back_nbr_buf,
                  cudaStream_t* stream)
{

  int Vsh_x, Vsh_y, Vsh_z, Vsh_t;

  Vsh_x = X[1]*X[2]*X[3]/2;
  Vsh_y = X[0]*X[2]*X[3]/2;
  Vsh_z = X[0]*X[1]*X[3]/2;
  Vsh_t = X[0]*X[1]*X[2]/2;
  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};

  int Vh = volume;
  int sizeOfFloatN = 2*prec;
  int len[4] = {
    Vsh_x*sizeOfFloatN,
    Vsh_y*sizeOfFloatN,
    Vsh_z*sizeOfFloatN,
    Vsh_t*sizeOfFloatN
  };

  int tmpint[4] = {
    0,
    Vsh_x,
    Vsh_x + Vsh_y,
    Vsh_x + Vsh_y + Vsh_z,
  };

  char* even = ((char*)_even) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
  char* odd = ((char*)_odd) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;

  if(whichway == QUDA_BACKWARDS){
    //back,even
    for(int i=0;i < 9; i++){
      void* dst = even + i*stride*sizeOfFloatN;
      void* src = ((char*)back_nbr_buf[dir]) + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
    //back, odd
    for(int i=0;i < 9; i++){
      void* dst = odd + i*stride*sizeOfFloatN;
      void* src = ((char*)back_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
  }else { //QUDA_FORWARDS
    //fwd,even
    for(int i=0;i < 9; i++){
      void* dst = even + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
      void* src = ((char*)fwd_nbr_buf[dir]) + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
    //fwd, odd
    for(int i=0;i < 9; i++){
      void* dst = odd + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
      void* src = ((char*)fwd_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
      cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
    }
  }
}
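
/*
  Placement note (editorial): the ghost staples are written into the pad
  region of the even/odd arrays.  Within each of the 9 FloatN components the
  backward ghost of direction dir starts at site offset Vh + 2*tmpint[dir] and
  the forward ghost follows Vsh[dir] sites later, so the pad must be at least
  2*(Vsh_x + Vsh_y + Vsh_z + Vsh_t) sites for all eight ghost faces to fit.
*/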

/*
  This is the packing kernel for the multi-dimensional ghost zone in
  the padded region.  This is called by cpuexchangesitelink in
  FaceBuffer (MPI only), which was called by loadLinkToGPU (defined at
  the bottom).

  Not currently included since it will be replaced by Guochun's new
  routine which uses an enlarged domain instead of a ghost zone.
 */
template <typename Float>
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack, Float **cpuGhostFwd, int dir, int nFace, int* X) {
  int XY=X[0]*X[1];
  int XYZ=X[0]*X[1]*X[2];

  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  int faceVolumeCB[4]={
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  //loop variables: a, b, c, with a the most significant and c the least significant;
  //A, B, C are the corresponding maximum values
  //we also loop over d; for the backward face d runs over 0, ..., nFace-1 and
  //for the forward face over dims[dir]-nFace, ..., dims[dir]-1
  int A[4], B[4], C[4];

  //X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  //Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  //Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  //T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];


  //multiplication factors used to compute the index in the original cpu memory
  int f[4][4]={
    {XYZ,    XY, X[0],     1},
    {XYZ,    XY,    1,  X[0]},
    {XYZ,  X[0],    1,    XY},
    { XY,  X[0],    1,   XYZ}
  };


  for(int ite = 0; ite < 2; ite++){
    //ite == 0: back
    //ite == 1: fwd
    Float** dst;
    if (ite == 0){
      dst = cpuGhostBack;
    }else{
      dst = cpuGhostFwd;
    }

    //collect the ghost gauge field
    //for(int dir =0; dir < 4; dir++){
      int d;
      int a,b,c;

      //we need to copy all 4 links at each site
      for(int linkdir=0; linkdir < 4; linkdir ++){
        Float* even_src = cpuLink[linkdir];
        Float* odd_src = cpuLink[linkdir] + volumeCB*gaugeSiteSize;

        Float* even_dst;
        Float* odd_dst;

        //switch the even and odd ghost cpuLink when that dimension's size is odd;
        //only switch if X[dir] is odd and the grid size in that dimension is greater than 1
        if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
          even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
          odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }else{
          odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
          even_dst = odd_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }

        int even_dst_index = 0;
        int odd_dst_index = 0;

        int startd;
        int endd;
        if(ite == 0){ //back
          startd = 0;
          endd= nFace;
        }else{//fwd
          startd = X[dir] - nFace;
          endd =X[dir];
        }
        for(d = startd; d < endd; d++){
          for(a = 0; a < A[dir]; a++){
            for(b = 0; b < B[dir]; b++){
              for(c = 0; c < C[dir]; c++){
                int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
                int oddness = (a+b+c+d)%2;
                if (oddness == 0){ //even
                  for(int i=0;i < 18;i++){
                    even_dst[18*even_dst_index+i] = even_src[18*index + i];
                  }
                  even_dst_index++;
                }else{ //odd
                  for(int i=0;i < 18;i++){
                    odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                  }
                  odd_dst_index++;
                }
              }//c
            }//b
          }//a
        }//d
        assert( even_dst_index == nFace*faceVolumeCB[dir]);
        assert( odd_dst_index == nFace*faceVolumeCB[dir]);
      }//linkdir

    //}//dir
  }//ite
}


void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void** cpuGhostFwd,
                          int dir, int nFace, QudaPrecision precision, int *X) {

  if (precision == QUDA_DOUBLE_PRECISION) {
    packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**) cpuGhostFwd, dir,  nFace, X);
  } else {
    packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
  }

}

/*
  Copies the device gauge field to the host (see storeLinkToCPU at the end of this file):
  - no reconstruction support
  - device data is always Float2 ordered
  - device data is a 1-dimensional array (MILC ordered)
  - no support for half precision
 */

static void
do_loadLinkToGPU(int* X, void *even, void *odd, void **cpuGauge, void** ghost_cpuGauge,
                 void** ghost_cpuGauge_diag,
                 QudaReconstructType reconstruct, int bytes, int Vh, int pad,
                 int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
                 QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{

  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);

  int i;
  char* tmp_even;
  char* tmp_odd;
  int len = Vh*gaugeSiteSize*prec;

#ifdef MULTI_GPU
  int glen[4] = {
    Vsh_x*gaugeSiteSize*prec,
    Vsh_y*gaugeSiteSize*prec,
    Vsh_z*gaugeSiteSize*prec,
    Vsh_t*gaugeSiteSize*prec
  };

  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t)+4*Vh_2d_max;
#else
  int ghostV = 0;
#endif

  int glen_sum = ghostV*gaugeSiteSize*prec;
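  /*
    Staging layout (editorial note): tmp_even (and tmp_odd, which aliases the
    same allocation) is sized as 4*(len + glen_sum) bytes; for each of the
    four link directions i it holds

      [ body links: len bytes ]
      [ backward/forward face ghosts of the 4 dimensions: 2*(glen[0]+...+glen[3]) bytes ]
      [ diagonal ghosts: 4*Vh_2d_max*gaugeSiteSize*prec bytes ]

    which is the layout consumed by link_format_cpu_to_gpu() below when
    reordering into the GPU format.
  */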
  if(cudaMalloc(&tmp_even, 4*(len+glen_sum)) != cudaSuccess){
    errorQuda("cudaMalloc() failed for tmp_even\n");
  }
  tmp_odd = tmp_even;

  //even links
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
    for(i=0;i < 4; i++){
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif
    }
  }else{ //QUDA_MILC_GAUGE_ORDER

#ifdef MULTI_GPU
    errorQuda("Multi-GPU is not yet supported for the MILC format in this function\n");
#endif
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
#endif
  }


  for(i=0;i < 4;i++){
#ifdef MULTI_GPU
    //dir: the source direction
    char* dest = tmp_even + i*(len+glen_sum)+len;
    for(int dir = 0; dir < 4; dir++){
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
      cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
      cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
#endif
      dest += 2*glen[dir];
    }
    //fill in the diagonal ghosts
    //@nu is @i; mu iterates over 0..3 with mu != nu
    int nu = i;
    for(int mu = 0; mu < 4; mu++){
      if(nu  == mu ){
        continue;
      }
      int dir1, dir2;
      for(dir1=0; dir1 < 4; dir1 ++){
        if(dir1 != nu && dir1 != mu){
          break;
        }
      }
      for(dir2=0; dir2 < 4; dir2 ++){
        if(dir2 != nu && dir2 != mu && dir2 != dir1){
          break;
        }
      }
#ifdef GPU_DIRECT
      cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                      X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                 X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif

    }

#endif
  }

  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);

  //odd links
  if(cpu_order ==  QUDA_QDP_GAUGE_ORDER){
    for(i=0;i < 4; i++){
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
    }
  }else{  //QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge)+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_odd, ((char*)cpuGauge)+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif
  }


  for(i=0;i < 4; i++){
#ifdef MULTI_GPU
      char* dest = tmp_odd + i*(len+glen_sum)+len;
      for(int dir = 0; dir < 4; dir++){
#ifdef GPU_DIRECT
        cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
        cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir],
                        cudaMemcpyHostToDevice, streams[0]);
#else
        cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
        cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);

#endif

        dest += 2*glen[dir];
      }
      //fill in the diagonal ghosts
      //@nu is @i; mu iterates over 0..3 with mu != nu
      int nu = i;
      for(int mu = 0; mu < 4; mu++){
        if(nu  == mu ){
          continue;
        }
        int dir1, dir2;
        for(dir1=0; dir1 < 4; dir1 ++){
          if(dir1 != nu && dir1 != mu){
            break;
          }
        }
        for(dir2=0; dir2 < 4; dir2 ++){
          if(dir2 != nu && dir2 != mu && dir2 != dir1){
            break;
          }
        }
#ifdef GPU_DIRECT
        cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec, ((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                        X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
#else
        cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec, ((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                   X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
#endif
      }


#endif
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);

  cudaStreamSynchronize(streams[0]);

  cudaFree(tmp_even);

}


void
loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
{

  if (cudaGauge->Precision() != cpuGauge->Precision()){
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }
  QudaPrecision prec= cudaGauge->Precision();

#ifdef MULTI_GPU
  const int* Z = cudaGauge->X();
#endif
  int pad = cudaGauge->Pad();
  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;



  static void* ghost_cpuGauge[4];
  static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
  static int allocated = 0;
  int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};

  if(allocated == 0){
    for(int i=0;i < 4; i++){

#ifdef GPU_DIRECT
      cudaMallocHost((void**)&ghost_cpuGauge[i], 8*Vs[i]*gaugeSiteSize*prec);
#else
      ghost_cpuGauge[i] = malloc(8*Vs[i]*gaugeSiteSize*prec);
#endif
      if(ghost_cpuGauge[i] == NULL){
        errorQuda("ERROR: malloc failed for ghost_sitelink[%d]\n", i);
      }
    }


    /*
     *  nu |     |
     *     |_____|
     *       mu
     */

    int ghost_diag_len[16];
    for(int nu=0;nu < 4;nu++){
      for(int mu=0; mu < 4;mu++){
        if(nu == mu){
          ghost_cpuGauge_diag[nu*4+mu] = NULL;
        }else{
          //the other directions
          int dir1, dir2;
          for(dir1= 0; dir1 < 4; dir1++){
            if(dir1 !=nu && dir1 != mu){
              break;
            }
          }
          for(dir2=0; dir2 < 4; dir2++){
            if(dir2 != nu && dir2 != mu && dir2 != dir1){
              break;
            }
          }
          //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#ifdef GPU_DIRECT
          cudaMallocHost((void**)&ghost_cpuGauge_diag[nu*4+mu], Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#else
          ghost_cpuGauge_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
#endif
          if(ghost_cpuGauge_diag[nu*4+mu] == NULL){
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_cpuGauge_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
          ghost_diag_len[nu*4+mu] = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
        }
      }
    }
    allocated = 1;
  }

  int optflag=1;
  // driver for packing all the links
  exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);

#endif

  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                   ghost_cpuGauge, ghost_cpuGauge_diag,
                   cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                   Vsh_x, Vsh_y, Vsh_z, Vsh_t,
                   prec, cpuGauge->Order());

#ifdef MULTI_GPU
  if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)){

    for(int i=0;i < 4;i++){
#ifdef GPU_DIRECT
      cudaFreeHost(ghost_cpuGauge[i]);
#else
      free(ghost_cpuGauge[i]);
#endif
    }
    for(int i=0;i <4; i++){
      for(int j=0;j <4; j++){
        if (i==j){
          continue;
        }
#ifdef GPU_DIRECT
        cudaFreeHost(ghost_cpuGauge_diag[i*4+j]);
#else
        free(ghost_cpuGauge_diag[i*4+j]);
#endif
      }
    }

    allocated = 0;
  }
#endif

}
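
/*
  Usage sketch (illustrative; the field objects and the parameter setup shown
  here are assumptions, not code from this file):

    QudaGaugeParam param = newQudaGaugeParam();
    // ... fill in param.X, param.cpu_prec, param.cuda_prec, param.reconstruct, ...

    loadLinkToGPU(cudaSiteLink, cpuSiteLink, &param);   // host -> device
    // ... run fat-link / force kernels on the device ...
    storeLinkToCPU(cpuFatLink, cudaFatLink, &param);    // device -> host
*/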

static void
do_loadLinkToGPU_ex(const int* X, void *even, void *odd, void **cpuGauge,
                    QudaReconstructType reconstruct, int bytes, int Vh_ex, int pad,
                    QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
{

  cudaStream_t streams[2];
  for(int i=0;i < 2; i++){
    cudaStreamCreate(&streams[i]);
  }


  int i;
  char* tmp_even = NULL;
  char* tmp_odd = NULL;
  int len = Vh_ex*gaugeSiteSize*prec;

  if(cudaMalloc(&tmp_even, 8*len) != cudaSuccess){
    errorQuda("Error: cudaMalloc failed for tmp_even\n");
  }
  tmp_odd = tmp_even + 4*len;

  //even links
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
    for(i=0;i < 4; i++){
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
#else
      cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
#endif

    }
  }else{ //QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice, streams[0]);
#else
    cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
#endif
  }

  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh_ex, pad, 0, prec, cpu_order, streams[0]);

  //odd links
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
    for(i=0;i < 4; i++){
#ifdef GPU_DIRECT
      cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[1]);
#else
      cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
#endif
    }
  }else{//QUDA_MILC_GAUGE_ORDER
#ifdef GPU_DIRECT
    cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[1]);
#else
    cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
#endif
  }
  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec, cpu_order, streams[1]);


  for(int i=0;i < 2;i++){
    cudaStreamSynchronize(streams[i]);
  }

  cudaFree(tmp_even);

  for(int i=0;i < 2;i++){
    cudaStreamDestroy(streams[i]);
  }

}
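
/*
  Note (editorial): unlike do_loadLinkToGPU() above, this extended-volume
  variant performs no ghost exchange; the host field is assumed to already
  live on the enlarged domain of Vh_ex sites per parity (body plus halo), in
  line with the comment before packGhostAllLinks() about replacing the ghost
  zone with an enlarged domain.
*/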



void
loadLinkToGPU_ex(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge)
{

  if (cudaGauge->Precision() != cpuGauge->Precision()){
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }
  QudaPrecision prec= cudaGauge->Precision();
  const int* E = cudaGauge->X();
  int pad = cudaGauge->Pad();

  do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                      cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                      prec, cpuGauge->Order());

}


template<typename FloatN, typename Float>
static void
do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd,
                  int bytes, int Vh, int stride, QudaPrecision prec)
{

  double* unpackedDataEven;
  double* unpackedDataOdd;
  int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);
  if(cudaMalloc(&unpackedDataEven, datalen) != cudaSuccess){
    errorQuda("cudaMalloc failed for unpackedDataEven\n");
  }
  unpackedDataOdd = unpackedDataEven;
  //unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
#endif

  //unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost, streams[0]);
#else
  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
#endif

  cudaFree(unpackedDataEven);
}
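
/*
  Sizing sketch (illustrative; the buffer name is hypothetical): the host
  array passed to storeLinkToCPU() receives both parities in MILC order, so
  it must hold 8*Vh*gaugeSiteSize Float elements, e.g. in double precision

    size_t bytes = 8 * (size_t)Vh * gaugeSiteSize * sizeof(double);
    double *milcLinks = (double*)malloc(bytes);
*/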
void
storeLinkToCPU(cpuGaugeField* cpuGauge, cudaGaugeField *cudaGauge, QudaGaugeParam* param)
{

  QudaPrecision cpu_prec = param->cpu_prec;
  QudaPrecision cuda_prec= param->cuda_prec;

  if (cpu_prec != cuda_prec){
    printf("ERROR: cpu precision and cuda precision must be the same in this function %s\n", __FUNCTION__);
    exit(1);
  }

  if (cudaGauge->Reconstruct() != QUDA_RECONSTRUCT_NO){
    printf("ERROR: it makes no sense to get data back to cpu for 8/12 reconstruct, function %s\n", __FUNCTION__);
    exit(1);
  }

  int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();

  if (cuda_prec == QUDA_DOUBLE_PRECISION){
    do_storeLinkToCPU( (double*)cpuGauge->Gauge_p(), (double2*) cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
                       cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  }else if (cuda_prec == QUDA_SINGLE_PRECISION){
    do_storeLinkToCPU( (float*)cpuGauge->Gauge_p(), (float2*) cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
                       cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  }else{
    printf("ERROR: half precision not supported in this function %s\n", __FUNCTION__);
    exit(1);
  }
}




#endif