QUDA: quda/lib/llfat_quda_itf.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 
00002 #include <stdio.h>
00003 #include <cuda_runtime.h>
00004 #include <cuda.h>
00005 
00006 #include <quda_internal.h>
00007 #include <read_gauge.h>
00008 #include "gauge_field.h"
00009 #include <force_common.h>
00010 #include "llfat_quda.h"
00011 #include <face_quda.h>
00012 
00013 #define BLOCK_DIM 64
00014 
00015 void
00016 llfat_cuda(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 
00017            cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
00018            QudaGaugeParam* param, double* act_path_coeff)
00019 {
00020   int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
00021   int Vh = volume/2;
00022   dim3 gridDim(volume/BLOCK_DIM,1,1);
00023   dim3 halfGridDim(Vh/BLOCK_DIM,1,1);
00024   dim3 blockDim(BLOCK_DIM , 1, 1);
00025   
00026   QudaPrecision prec = cudaSiteLink.Precision();
00027   QudaReconstructType recon = cudaSiteLink.Reconstruct();
00028   
00029   if( ((param->X[0] % 2 != 0)
00030        ||(param->X[1] % 2 != 0)
00031        ||(param->X[2] % 2 != 0)
00032        ||(param->X[3] % 2 != 0))
00033       && (recon  == QUDA_RECONSTRUCT_12)){
00034     errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
00035     
00036   }
00037       
00038   int nStream=9;
00039   cudaStream_t stream[nStream];
00040   for(int i = 0;i < nStream; i++){
00041     cudaStreamCreate(&stream[i]);
00042   }
00043 
00044 
00045   llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
00046                      param, act_path_coeff); 
00047 
00048   
00049   llfat_kernel_param_t kparam;
00050   for(int i=0;i < 4;i++){
00051      kparam.ghostDim[i] = commDimPartitioned(i);
00052   }
00053   int ktype[8] = {
00054                 LLFAT_EXTERIOR_KERNEL_BACK_X, 
00055                 LLFAT_EXTERIOR_KERNEL_FWD_X, 
00056                 LLFAT_EXTERIOR_KERNEL_BACK_Y, 
00057                 LLFAT_EXTERIOR_KERNEL_FWD_Y, 
00058                 LLFAT_EXTERIOR_KERNEL_BACK_Z, 
00059                 LLFAT_EXTERIOR_KERNEL_FWD_Z, 
00060                 LLFAT_EXTERIOR_KERNEL_BACK_T, 
00061                 LLFAT_EXTERIOR_KERNEL_FWD_T, 
00062   };
00063 
00064 
00065   for(int dir = 0;dir < 4; dir++){
00066     for(int nu = 0; nu < 4; nu++){
00067       if (nu != dir){
00068 
00069 #ifdef MULTI_GPU
00070         //start of one call
00071         for(int k=3; k >= 0 ;k--){
00072           if(!commDimPartitioned(k)) continue;
00073           
00074           kparam.kernel_type = ktype[2*k];
00075           siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00076                                            (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00077                                            (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
00078                                            dir, nu,
00079                                            act_path_coeff[2],
00080                                            recon, prec, halfGridDim,
00081                                            kparam, &stream[2*k]);
00082           
00083           exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
00084           
00085           kparam.kernel_type = ktype[2*k+1];
00086           siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00087                                            (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00088                                            (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(),
00089                                            dir, nu,
00090                                            act_path_coeff[2],
00091                                            recon, prec, halfGridDim,
00092                                            kparam, &stream[2*k+1]);
00093           exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
00094         }
00095 #endif
00096         kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
00097         siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00098                                          (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00099                                          (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00100                                          dir, nu,
00101                                          act_path_coeff[2],
00102                                          recon, prec, halfGridDim, 
00103                                          kparam, &stream[nStream-1]);
00104 
00105 #ifdef MULTI_GPU        
00106         for(int k=3; k >= 0 ;k--){
00107           if(!commDimPartitioned(k)) continue;
00108           exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
00109           exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
00110         }       
00111         for(int k=3; k >= 0 ;k--){
00112           if(!commDimPartitioned(k)) continue;
00113           exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);
00114           exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
00115         }
00116         for(int k=3; k >= 0 ;k--){
00117           if(!commDimPartitioned(k)) continue;
00118           cudaStreamSynchronize(stream[2*k]);
00119           cudaStreamSynchronize(stream[2*k+1]);
00120         }       
00121 #endif
00122         //end
00123 
00124         //start of one call
00125         kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
00126         if(act_path_coeff[5] != 0.0){
00127           computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
00128                                             (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00129                                             (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00130                                             (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00131                                             dir, nu, 0,
00132                                             act_path_coeff[5],
00133                                             recon, prec,  halfGridDim, kparam, &stream[nStream-1]);
00134         }
00135         //end
00136         for(int rho = 0; rho < 4; rho++){
00137           if (rho != dir && rho != nu){
00138 
00139             //start of one call
00140 #ifdef MULTI_GPU        
00141             for(int k=3; k >= 0 ;k--){
00142               if(!commDimPartitioned(k)) continue;
00143               kparam.kernel_type = ktype[2*k];      
00144               computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00145                                                 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00146                                                 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00147                                                 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00148                                                 dir, rho, 1,
00149                                                 act_path_coeff[3],
00150                                                 recon, prec, halfGridDim, kparam, &stream[2*k]);
00151               exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);
00152               kparam.kernel_type = ktype[2*k+1];            
00153               computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00154                                                 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00155                                                 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00156                                                 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00157                                                 dir, rho, 1,
00158                                                 act_path_coeff[3],
00159                                                 recon, prec, halfGridDim, kparam, &stream[2*k+1]);
00160               exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
00161             }       
00162 #endif
00163 
00164             kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
00165             computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00166                                               (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00167                                               (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00168                                               (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00169                                               dir, rho, 1,
00170                                               act_path_coeff[3],
00171                                               recon, prec, halfGridDim, kparam, &stream[nStream-1]);
00172 
00173 #ifdef MULTI_GPU
00174             for(int k=3; k >= 0 ;k--){
00175               if(!commDimPartitioned(k)) continue;
00176               exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);
00177               exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);
00178             }
00179             for(int k=3; k >= 0 ;k--){
00180               if(!commDimPartitioned(k)) continue;
00181               exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]);
00182               exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]);
00183             }
00184             for(int k=3; k >= 0 ;k--){
00185               if(!commDimPartitioned(k)) continue;
00186               cudaStreamSynchronize(stream[2*k]);
00187               cudaStreamSynchronize(stream[2*k+1]);
00188             }   
00189 #endif      
00190             //end
00191 
00192             
00193             for(int sig = 0; sig < 4; sig++){
00194               if (sig != dir && sig != nu && sig != rho){                                               
00195                 
00196                 //start of one call
00197                 kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
00198                 computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, 
00199                                                   (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00200                                                   (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00201                                                   (void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00202                                                   dir, sig, 0,
00203                                                   act_path_coeff[4],
00204                                                   recon, prec, halfGridDim, kparam, &stream[nStream-1]);
00205 
00206                 //end
00207                 
00208               }                     
00209             }//sig
00210           }
00211         }//rho  
00212       }
00213     }//nu
00214   }//dir
00215   
00216   
00217   cudaThreadSynchronize(); 
00218   checkCudaError();
00219   
00220   for(int i=0;i < nStream; i++){
00221     cudaStreamDestroy(stream[i]);
00222   }
00223 
00224   return;
00225 }
00226 
00227 
00228 
00229 void
00230 llfat_cuda_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 
00231               cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
00232               QudaGaugeParam* param, double* act_path_coeff)
00233 {
00234 
00235   dim3 blockDim(BLOCK_DIM, 1,1);
00236   
00237   int volume = (param->X[0])*(param->X[1])*(param->X[2])*(param->X[3]);
00238   int Vh = volume/2;
00239   dim3 halfGridDim(Vh/blockDim.x,1,1);
00240   if(Vh % blockDim.x != 0){
00241     halfGridDim.x +=1;
00242   }
00243 
00244 
00245   int volume_1g = (param->X[0]+2)*(param->X[1]+2)*(param->X[2]+2)*(param->X[3]+2);
00246   int Vh_1g = volume_1g/2;
00247   dim3 halfGridDim_1g(Vh_1g/blockDim.x,1,1);
00248   if(Vh_1g % blockDim.x != 0){
00249     halfGridDim_1g.x +=1;
00250   }
00251   
00252   int volume_2g = (param->X[0]+4)*(param->X[1]+4)*(param->X[2]+4)*(param->X[3]+4);
00253   int Vh_2g = volume_2g/2;
00254   dim3 halfGridDim_2g(Vh_2g/blockDim.x,1,1);
00255   if(Vh_2g % blockDim.x != 0){
00256     halfGridDim_2g.x +=1;
00257   }
00258 
00259   QudaPrecision prec = cudaSiteLink.Precision();
00260   QudaReconstructType recon = cudaSiteLink.Reconstruct();
00261   
00262   if( ((param->X[0] % 2 != 0)
00263        ||(param->X[1] % 2 != 0)
00264        ||(param->X[2] % 2 != 0)
00265        ||(param->X[3] % 2 != 0))
00266       && (recon  == QUDA_RECONSTRUCT_12)){
00267     errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
00268     
00269   }
00270       
00271   
00272   llfat_kernel_param_t kparam;
00273   llfat_kernel_param_t kparam_1g;
00274   llfat_kernel_param_t kparam_2g;
00275   
00276   kparam.threads= Vh;
00277   kparam.halfGridDim = halfGridDim;
00278   kparam.D1 = param->X[0];
00279   kparam.D2 = param->X[1];
00280   kparam.D3 = param->X[2];
00281   kparam.D4 = param->X[3];
00282   kparam.D1h = param->X[0]/2;
00283   kparam.base_idx = 2;
00284   
00285   kparam_1g.threads= Vh_1g;
00286   kparam_1g.halfGridDim = halfGridDim_1g;
00287   kparam_1g.D1 = param->X[0] + 2;
00288   kparam_1g.D2 = param->X[1] + 2;
00289   kparam_1g.D3 = param->X[2] + 2;
00290   kparam_1g.D4 = param->X[3] + 2;
00291   kparam_1g.D1h = (param->X[0] + 2)/2;
00292   kparam_1g.base_idx = 1;
00293 
00294   kparam_2g.threads= Vh_2g;
00295   kparam_2g.halfGridDim = halfGridDim_2g;
00296   kparam_2g.D1 = param->X[0] + 4;
00297   kparam_2g.D2 = param->X[1] + 4;
00298   kparam_2g.D3 = param->X[2] + 4;
00299   kparam_2g.D4 = param->X[3] + 4;
00300   kparam_2g.D1h = (param->X[0] + 4)/2;
00301   kparam_2g.base_idx = 0;
00302   
00303   kparam_1g.blockDim = kparam_2g.blockDim = kparam.blockDim = blockDim;
00304 
00305   /*
00306   {
00307     static dim3 blocks[3]={{64, 1, 1}, {64,1,1}, {64,1,1}};
00308     QudaVerbosity verbose = QUDA_DEBUG_VERBOSE;
00309     TuneLinkFattening fatTune(cudaFatLink, cudaSiteLink, cudaStaple, cudaStaple1,
00310                               kparam, kparam_1g, verbose);
00311     fatTune.BenchmarkMulti(blocks, 3);
00312     
00313   }
00314   */
00315 
00316   llfatOneLinkKernel_ex(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
00317                         param, act_path_coeff, kparam);
00318 
00319   for(int dir = 0;dir < 4; dir++){
00320     for(int nu = 0; nu < 4; nu++){
00321       if (nu != dir){
00322         
00323 
00324         siteComputeGenStapleParityKernel_ex((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00325                                             (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00326                                             (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00327                                             dir, nu,
00328                                             act_path_coeff[2],
00329                                             recon, prec, kparam_1g); 
00330 
00331         if(act_path_coeff[5] != 0.0){
00332           computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL,
00333                                                (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00334                                                (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00335                                                (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00336                                                dir, nu, 0,
00337                                                act_path_coeff[5],
00338                                                recon, prec, kparam);
00339         } 
00340 
00341         for(int rho = 0; rho < 4; rho++){
00342           if (rho != dir && rho != nu){
00343             
00344             computeGenStapleFieldParityKernel_ex((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00345                                                  (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00346                                                  (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00347                                                  (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(),
00348                                                  dir, rho, 1,
00349                                                  act_path_coeff[3],
00350                                                  recon, prec, kparam_1g);
00351             
00352             for(int sig = 0; sig < 4; sig++){
00353               if (sig != dir && sig != nu && sig != rho){                                               
00354                 
00355                 computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL, 
00356                                                      (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(),
00357                                                      (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 
00358                                                      (void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(),
00359                                                      dir, sig, 0,
00360                                                      act_path_coeff[4],
00361                                                      recon, prec, kparam);
00362                 
00363               }                     
00364             }//sig
00365           }
00366         }//rho  
00367       }
00368     }//nu
00369   }//dir
00370   
00371   
00372   cudaThreadSynchronize(); 
00373   checkCudaError();
00374   
00375   return;
00376 }