QUDA v0.4.0
A library for QCD on GPUs
|
00001 00002 #include <stdio.h> 00003 #include <cuda_runtime.h> 00004 #include <cuda.h> 00005 00006 #include <quda_internal.h> 00007 #include <read_gauge.h> 00008 #include "gauge_field.h" 00009 #include <force_common.h> 00010 #include "llfat_quda.h" 00011 #include <face_quda.h> 00012 00013 #define BLOCK_DIM 64 00014 00015 void 00016 llfat_cuda(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 00017 cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1, 00018 QudaGaugeParam* param, double* act_path_coeff) 00019 { 00020 int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; 00021 int Vh = volume/2; 00022 dim3 gridDim(volume/BLOCK_DIM,1,1); 00023 dim3 halfGridDim(Vh/BLOCK_DIM,1,1); 00024 dim3 blockDim(BLOCK_DIM , 1, 1); 00025 00026 QudaPrecision prec = cudaSiteLink.Precision(); 00027 QudaReconstructType recon = cudaSiteLink.Reconstruct(); 00028 00029 if( ((param->X[0] % 2 != 0) 00030 ||(param->X[1] % 2 != 0) 00031 ||(param->X[2] % 2 != 0) 00032 ||(param->X[3] % 2 != 0)) 00033 && (recon == QUDA_RECONSTRUCT_12)){ 00034 errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n"); 00035 00036 } 00037 00038 int nStream=9; 00039 cudaStream_t stream[nStream]; 00040 for(int i = 0;i < nStream; i++){ 00041 cudaStreamCreate(&stream[i]); 00042 } 00043 00044 00045 llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1, 00046 param, act_path_coeff); 00047 00048 00049 llfat_kernel_param_t kparam; 00050 for(int i=0;i < 4;i++){ 00051 kparam.ghostDim[i] = commDimPartitioned(i); 00052 } 00053 int ktype[8] = { 00054 LLFAT_EXTERIOR_KERNEL_BACK_X, 00055 LLFAT_EXTERIOR_KERNEL_FWD_X, 00056 LLFAT_EXTERIOR_KERNEL_BACK_Y, 00057 LLFAT_EXTERIOR_KERNEL_FWD_Y, 00058 LLFAT_EXTERIOR_KERNEL_BACK_Z, 00059 LLFAT_EXTERIOR_KERNEL_FWD_Z, 00060 LLFAT_EXTERIOR_KERNEL_BACK_T, 00061 LLFAT_EXTERIOR_KERNEL_FWD_T, 00062 }; 00063 00064 00065 for(int dir = 0;dir < 4; dir++){ 00066 for(int nu = 0; nu < 4; nu++){ 00067 if (nu != dir){ 00068 00069 #ifdef MULTI_GPU 00070 //start of one call 00071 for(int k=3; k >= 0 ;k--){ 00072 if(!commDimPartitioned(k)) continue; 00073 00074 kparam.kernel_type = ktype[2*k]; 00075 siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00076 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00077 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00078 dir, nu, 00079 act_path_coeff[2], 00080 recon, prec, halfGridDim, 00081 kparam, &stream[2*k]); 00082 00083 exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); 00084 00085 kparam.kernel_type = ktype[2*k+1]; 00086 siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00087 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00088 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00089 dir, nu, 00090 act_path_coeff[2], 00091 recon, prec, halfGridDim, 00092 kparam, &stream[2*k+1]); 00093 exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); 00094 } 00095 #endif 00096 kparam.kernel_type = LLFAT_INTERIOR_KERNEL; 00097 siteComputeGenStapleParityKernel((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00098 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00099 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00100 dir, nu, 00101 act_path_coeff[2], 00102 recon, prec, halfGridDim, 00103 kparam, &stream[nStream-1]); 00104 00105 #ifdef MULTI_GPU 00106 for(int k=3; k >= 0 ;k--){ 00107 if(!commDimPartitioned(k)) continue; 00108 exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); 00109 exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); 00110 } 00111 for(int k=3; k >= 0 ;k--){ 00112 if(!commDimPartitioned(k)) continue; 00113 exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); 00114 exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); 00115 } 00116 for(int k=3; k >= 0 ;k--){ 00117 if(!commDimPartitioned(k)) continue; 00118 cudaStreamSynchronize(stream[2*k]); 00119 cudaStreamSynchronize(stream[2*k+1]); 00120 } 00121 #endif 00122 //end 00123 00124 //start of one call 00125 kparam.kernel_type = LLFAT_INTERIOR_KERNEL; 00126 if(act_path_coeff[5] != 0.0){ 00127 computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, 00128 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00129 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00130 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00131 dir, nu, 0, 00132 act_path_coeff[5], 00133 recon, prec, halfGridDim, kparam, &stream[nStream-1]); 00134 } 00135 //end 00136 for(int rho = 0; rho < 4; rho++){ 00137 if (rho != dir && rho != nu){ 00138 00139 //start of one call 00140 #ifdef MULTI_GPU 00141 for(int k=3; k >= 0 ;k--){ 00142 if(!commDimPartitioned(k)) continue; 00143 kparam.kernel_type = ktype[2*k]; 00144 computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00145 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00146 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00147 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00148 dir, rho, 1, 00149 act_path_coeff[3], 00150 recon, prec, halfGridDim, kparam, &stream[2*k]); 00151 exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); 00152 kparam.kernel_type = ktype[2*k+1]; 00153 computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00154 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00155 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00156 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00157 dir, rho, 1, 00158 act_path_coeff[3], 00159 recon, prec, halfGridDim, kparam, &stream[2*k+1]); 00160 exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); 00161 } 00162 #endif 00163 00164 kparam.kernel_type = LLFAT_INTERIOR_KERNEL; 00165 computeGenStapleFieldParityKernel((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00166 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00167 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00168 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00169 dir, rho, 1, 00170 act_path_coeff[3], 00171 recon, prec, halfGridDim, kparam, &stream[nStream-1]); 00172 00173 #ifdef MULTI_GPU 00174 for(int k=3; k >= 0 ;k--){ 00175 if(!commDimPartitioned(k)) continue; 00176 exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); 00177 exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); 00178 } 00179 for(int k=3; k >= 0 ;k--){ 00180 if(!commDimPartitioned(k)) continue; 00181 exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]); 00182 exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]); 00183 } 00184 for(int k=3; k >= 0 ;k--){ 00185 if(!commDimPartitioned(k)) continue; 00186 cudaStreamSynchronize(stream[2*k]); 00187 cudaStreamSynchronize(stream[2*k+1]); 00188 } 00189 #endif 00190 //end 00191 00192 00193 for(int sig = 0; sig < 4; sig++){ 00194 if (sig != dir && sig != nu && sig != rho){ 00195 00196 //start of one call 00197 kparam.kernel_type = LLFAT_INTERIOR_KERNEL; 00198 computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, 00199 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00200 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00201 (void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00202 dir, sig, 0, 00203 act_path_coeff[4], 00204 recon, prec, halfGridDim, kparam, &stream[nStream-1]); 00205 00206 //end 00207 00208 } 00209 }//sig 00210 } 00211 }//rho 00212 } 00213 }//nu 00214 }//dir 00215 00216 00217 cudaThreadSynchronize(); 00218 checkCudaError(); 00219 00220 for(int i=0;i < nStream; i++){ 00221 cudaStreamDestroy(stream[i]); 00222 } 00223 00224 return; 00225 } 00226 00227 00228 00229 void 00230 llfat_cuda_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink, 00231 cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1, 00232 QudaGaugeParam* param, double* act_path_coeff) 00233 { 00234 00235 dim3 blockDim(BLOCK_DIM, 1,1); 00236 00237 int volume = (param->X[0])*(param->X[1])*(param->X[2])*(param->X[3]); 00238 int Vh = volume/2; 00239 dim3 halfGridDim(Vh/blockDim.x,1,1); 00240 if(Vh % blockDim.x != 0){ 00241 halfGridDim.x +=1; 00242 } 00243 00244 00245 int volume_1g = (param->X[0]+2)*(param->X[1]+2)*(param->X[2]+2)*(param->X[3]+2); 00246 int Vh_1g = volume_1g/2; 00247 dim3 halfGridDim_1g(Vh_1g/blockDim.x,1,1); 00248 if(Vh_1g % blockDim.x != 0){ 00249 halfGridDim_1g.x +=1; 00250 } 00251 00252 int volume_2g = (param->X[0]+4)*(param->X[1]+4)*(param->X[2]+4)*(param->X[3]+4); 00253 int Vh_2g = volume_2g/2; 00254 dim3 halfGridDim_2g(Vh_2g/blockDim.x,1,1); 00255 if(Vh_2g % blockDim.x != 0){ 00256 halfGridDim_2g.x +=1; 00257 } 00258 00259 QudaPrecision prec = cudaSiteLink.Precision(); 00260 QudaReconstructType recon = cudaSiteLink.Reconstruct(); 00261 00262 if( ((param->X[0] % 2 != 0) 00263 ||(param->X[1] % 2 != 0) 00264 ||(param->X[2] % 2 != 0) 00265 ||(param->X[3] % 2 != 0)) 00266 && (recon == QUDA_RECONSTRUCT_12)){ 00267 errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n"); 00268 00269 } 00270 00271 00272 llfat_kernel_param_t kparam; 00273 llfat_kernel_param_t kparam_1g; 00274 llfat_kernel_param_t kparam_2g; 00275 00276 kparam.threads= Vh; 00277 kparam.halfGridDim = halfGridDim; 00278 kparam.D1 = param->X[0]; 00279 kparam.D2 = param->X[1]; 00280 kparam.D3 = param->X[2]; 00281 kparam.D4 = param->X[3]; 00282 kparam.D1h = param->X[0]/2; 00283 kparam.base_idx = 2; 00284 00285 kparam_1g.threads= Vh_1g; 00286 kparam_1g.halfGridDim = halfGridDim_1g; 00287 kparam_1g.D1 = param->X[0] + 2; 00288 kparam_1g.D2 = param->X[1] + 2; 00289 kparam_1g.D3 = param->X[2] + 2; 00290 kparam_1g.D4 = param->X[3] + 2; 00291 kparam_1g.D1h = (param->X[0] + 2)/2; 00292 kparam_1g.base_idx = 1; 00293 00294 kparam_2g.threads= Vh_2g; 00295 kparam_2g.halfGridDim = halfGridDim_2g; 00296 kparam_2g.D1 = param->X[0] + 4; 00297 kparam_2g.D2 = param->X[1] + 4; 00298 kparam_2g.D3 = param->X[2] + 4; 00299 kparam_2g.D4 = param->X[3] + 4; 00300 kparam_2g.D1h = (param->X[0] + 4)/2; 00301 kparam_2g.base_idx = 0; 00302 00303 kparam_1g.blockDim = kparam_2g.blockDim = kparam.blockDim = blockDim; 00304 00305 /* 00306 { 00307 static dim3 blocks[3]={{64, 1, 1}, {64,1,1}, {64,1,1}}; 00308 QudaVerbosity verbose = QUDA_DEBUG_VERBOSE; 00309 TuneLinkFattening fatTune(cudaFatLink, cudaSiteLink, cudaStaple, cudaStaple1, 00310 kparam, kparam_1g, verbose); 00311 fatTune.BenchmarkMulti(blocks, 3); 00312 00313 } 00314 */ 00315 00316 llfatOneLinkKernel_ex(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1, 00317 param, act_path_coeff, kparam); 00318 00319 for(int dir = 0;dir < 4; dir++){ 00320 for(int nu = 0; nu < 4; nu++){ 00321 if (nu != dir){ 00322 00323 00324 siteComputeGenStapleParityKernel_ex((void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00325 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00326 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00327 dir, nu, 00328 act_path_coeff[2], 00329 recon, prec, kparam_1g); 00330 00331 if(act_path_coeff[5] != 0.0){ 00332 computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL, 00333 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00334 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00335 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00336 dir, nu, 0, 00337 act_path_coeff[5], 00338 recon, prec, kparam); 00339 } 00340 00341 for(int rho = 0; rho < 4; rho++){ 00342 if (rho != dir && rho != nu){ 00343 00344 computeGenStapleFieldParityKernel_ex((void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00345 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00346 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00347 (void*)cudaStaple.Even_p(), (void*)cudaStaple.Odd_p(), 00348 dir, rho, 1, 00349 act_path_coeff[3], 00350 recon, prec, kparam_1g); 00351 00352 for(int sig = 0; sig < 4; sig++){ 00353 if (sig != dir && sig != nu && sig != rho){ 00354 00355 computeGenStapleFieldParityKernel_ex((void*)NULL, (void*)NULL, 00356 (void*)cudaSiteLink.Even_p(), (void*)cudaSiteLink.Odd_p(), 00357 (void*)cudaFatLink.Even_p(), (void*)cudaFatLink.Odd_p(), 00358 (void*)cudaStaple1.Even_p(), (void*)cudaStaple1.Odd_p(), 00359 dir, sig, 0, 00360 act_path_coeff[4], 00361 recon, prec, kparam); 00362 00363 } 00364 }//sig 00365 } 00366 }//rho 00367 } 00368 }//nu 00369 }//dir 00370 00371 00372 cudaThreadSynchronize(); 00373 checkCudaError(); 00374 00375 return; 00376 }