|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 00002 #include <read_gauge.h> 00003 #include <gauge_quda.h> 00004 00005 #include "fermion_force_quda.h" 00006 #include "force_common.h" 00007 #include "hw_quda.h" 00008 00009 #define LOAD_ANTI_HERMITIAN LOAD_ANTI_HERMITIAN_SINGLE 00010 00011 00012 00013 #define LOAD_HW_SINGLE(hw, idx, var) do{ \ 00014 var##0 = hw[idx + 0*Vh]; \ 00015 var##1 = hw[idx + 1*Vh]; \ 00016 var##2 = hw[idx + 2*Vh]; \ 00017 var##3 = hw[idx + 3*Vh]; \ 00018 var##4 = hw[idx + 4*Vh]; \ 00019 var##5 = hw[idx + 5*Vh]; \ 00020 }while(0) 00021 00022 00023 #define WRITE_HW_SINGLE(hw, idx, var) do{ \ 00024 hw[idx + 0*Vh] = var##0; \ 00025 hw[idx + 1*Vh] = var##1; \ 00026 hw[idx + 2*Vh] = var##2; \ 00027 hw[idx + 3*Vh] = var##3; \ 00028 hw[idx + 4*Vh] = var##4; \ 00029 hw[idx + 5*Vh] = var##5; \ 00030 }while(0) 00031 00032 #define LOAD_HW(hw, idx, var) LOAD_HW_SINGLE(hw, idx, var) 00033 #define WRITE_HW(hw, idx, var) WRITE_HW_SINGLE(hw, idx, var) 00034 #define LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var) 00035 //#define LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); 00036 00037 #define FF_SITE_MATRIX_LOAD_TEX 1 00038 00039 #if (FF_SITE_MATRIX_LOAD_TEX == 1) 00040 #define linkEvenTex siteLink0TexSingle 00041 #define linkOddTex siteLink1TexSingle 00042 #define FF_LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX(src##Tex, dir, idx, var) 00043 #else 00044 #define FF_LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var) 00045 #endif 00046 00047 00048 00049 00050 #define MAT_MUL_HW(M, HW, HWOUT) \ 00051 HWOUT##00_re = (M##00_re * HW##00_re - M##00_im * HW##00_im) \ 00052 + (M##01_re * HW##01_re - M##01_im * HW##01_im) \ 00053 + (M##02_re * HW##02_re - M##02_im * HW##02_im); \ 00054 HWOUT##00_im = (M##00_re * HW##00_im + M##00_im * HW##00_re) \ 00055 + (M##01_re * HW##01_im + M##01_im * HW##01_re) \ 00056 + (M##02_re * HW##02_im + M##02_im * HW##02_re); \ 00057 HWOUT##01_re = (M##10_re * HW##00_re - M##10_im * HW##00_im) \ 00058 + (M##11_re * HW##01_re - M##11_im * HW##01_im) \ 00059 + (M##12_re * HW##02_re - M##12_im * HW##02_im); \ 00060 HWOUT##01_im = (M##10_re * HW##00_im + M##10_im * HW##00_re) \ 00061 + (M##11_re * HW##01_im + M##11_im * HW##01_re) \ 00062 + (M##12_re * HW##02_im + M##12_im * HW##02_re); \ 00063 HWOUT##02_re = (M##20_re * HW##00_re - M##20_im * HW##00_im) \ 00064 + (M##21_re * HW##01_re - M##21_im * HW##01_im) \ 00065 + (M##22_re * HW##02_re - M##22_im * HW##02_im); \ 00066 HWOUT##02_im = (M##20_re * HW##00_im + M##20_im * HW##00_re) \ 00067 + (M##21_re * HW##01_im + M##21_im * HW##01_re) \ 00068 + (M##22_re * HW##02_im + M##22_im * HW##02_re); \ 00069 HWOUT##10_re = (M##00_re * HW##10_re - M##00_im * HW##10_im) \ 00070 + (M##01_re * HW##11_re - M##01_im * HW##11_im) \ 00071 + (M##02_re * HW##12_re - M##02_im * HW##12_im); \ 00072 HWOUT##10_im = (M##00_re * HW##10_im + M##00_im * HW##10_re) \ 00073 + (M##01_re * HW##11_im + M##01_im * HW##11_re) \ 00074 + (M##02_re * HW##12_im + M##02_im * HW##12_re); \ 00075 HWOUT##11_re = (M##10_re * HW##10_re - M##10_im * HW##10_im) \ 00076 + (M##11_re * HW##11_re - M##11_im * HW##11_im) \ 00077 + (M##12_re * HW##12_re - M##12_im * HW##12_im); \ 00078 HWOUT##11_im = (M##10_re * HW##10_im + M##10_im * HW##10_re) \ 00079 + (M##11_re * HW##11_im + M##11_im * HW##11_re) \ 00080 + (M##12_re * HW##12_im + M##12_im * HW##12_re); \ 00081 HWOUT##12_re = (M##20_re * HW##10_re - M##20_im * HW##10_im) \ 00082 + (M##21_re * HW##11_re - M##21_im * HW##11_im) \ 00083 + (M##22_re * HW##12_re - M##22_im * HW##12_im); \ 00084 HWOUT##12_im = (M##20_re * HW##10_im + M##20_im * HW##10_re) \ 00085 + (M##21_re * HW##11_im + M##21_im * HW##11_re) \ 00086 + (M##22_re * HW##12_im + M##22_im * HW##12_re); 00087 00088 00089 #define ADJ_MAT_MUL_HW(M, HW, HWOUT) \ 00090 HWOUT##00_re = (M##00_re * HW##00_re + M##00_im * HW##00_im) \ 00091 + (M##10_re * HW##01_re + M##10_im * HW##01_im) \ 00092 + (M##20_re * HW##02_re + M##20_im * HW##02_im); \ 00093 HWOUT##00_im = (M##00_re * HW##00_im - M##00_im * HW##00_re) \ 00094 + (M##10_re * HW##01_im - M##10_im * HW##01_re) \ 00095 + (M##20_re * HW##02_im - M##20_im * HW##02_re); \ 00096 HWOUT##01_re = (M##01_re * HW##00_re + M##01_im * HW##00_im) \ 00097 + (M##11_re * HW##01_re + M##11_im * HW##01_im) \ 00098 + (M##21_re * HW##02_re + M##21_im * HW##02_im); \ 00099 HWOUT##01_im = (M##01_re * HW##00_im - M##01_im * HW##00_re) \ 00100 + (M##11_re * HW##01_im - M##11_im * HW##01_re) \ 00101 + (M##21_re * HW##02_im - M##21_im * HW##02_re); \ 00102 HWOUT##02_re = (M##02_re * HW##00_re + M##02_im * HW##00_im) \ 00103 + (M##12_re * HW##01_re + M##12_im * HW##01_im) \ 00104 + (M##22_re * HW##02_re + M##22_im * HW##02_im); \ 00105 HWOUT##02_im = (M##02_re * HW##00_im - M##02_im * HW##00_re) \ 00106 + (M##12_re * HW##01_im - M##12_im * HW##01_re) \ 00107 + (M##22_re * HW##02_im - M##22_im * HW##02_re); \ 00108 HWOUT##10_re = (M##00_re * HW##10_re + M##00_im * HW##10_im) \ 00109 + (M##10_re * HW##11_re + M##10_im * HW##11_im) \ 00110 + (M##20_re * HW##12_re + M##20_im * HW##12_im); \ 00111 HWOUT##10_im = (M##00_re * HW##10_im - M##00_im * HW##10_re) \ 00112 + (M##10_re * HW##11_im - M##10_im * HW##11_re) \ 00113 + (M##20_re * HW##12_im - M##20_im * HW##12_re); \ 00114 HWOUT##11_re = (M##01_re * HW##10_re + M##01_im * HW##10_im) \ 00115 + (M##11_re * HW##11_re + M##11_im * HW##11_im) \ 00116 + (M##21_re * HW##12_re + M##21_im * HW##12_im); \ 00117 HWOUT##11_im = (M##01_re * HW##10_im - M##01_im * HW##10_re) \ 00118 + (M##11_re * HW##11_im - M##11_im * HW##11_re) \ 00119 + (M##21_re * HW##12_im - M##21_im * HW##12_re); \ 00120 HWOUT##12_re = (M##02_re * HW##10_re + M##02_im * HW##10_im) \ 00121 + (M##12_re * HW##11_re + M##12_im * HW##11_im) \ 00122 + (M##22_re * HW##12_re + M##22_im * HW##12_im); \ 00123 HWOUT##12_im = (M##02_re * HW##10_im - M##02_im * HW##10_re) \ 00124 + (M##12_re * HW##11_im - M##12_im * HW##11_re) \ 00125 + (M##22_re * HW##12_im - M##22_im * HW##12_re); 00126 00127 00128 #define SU3_PROJECTOR(va, vb, m) \ 00129 m##00_re = va##0_re * vb##0_re + va##0_im * vb##0_im; \ 00130 m##00_im = va##0_im * vb##0_re - va##0_re * vb##0_im; \ 00131 m##01_re = va##0_re * vb##1_re + va##0_im * vb##1_im; \ 00132 m##01_im = va##0_im * vb##1_re - va##0_re * vb##1_im; \ 00133 m##02_re = va##0_re * vb##2_re + va##0_im * vb##2_im; \ 00134 m##02_im = va##0_im * vb##2_re - va##0_re * vb##2_im; \ 00135 m##10_re = va##1_re * vb##0_re + va##1_im * vb##0_im; \ 00136 m##10_im = va##1_im * vb##0_re - va##1_re * vb##0_im; \ 00137 m##11_re = va##1_re * vb##1_re + va##1_im * vb##1_im; \ 00138 m##11_im = va##1_im * vb##1_re - va##1_re * vb##1_im; \ 00139 m##12_re = va##1_re * vb##2_re + va##1_im * vb##2_im; \ 00140 m##12_im = va##1_im * vb##2_re - va##1_re * vb##2_im; \ 00141 m##20_re = va##2_re * vb##0_re + va##2_im * vb##0_im; \ 00142 m##20_im = va##2_im * vb##0_re - va##2_re * vb##0_im; \ 00143 m##21_re = va##2_re * vb##1_re + va##2_im * vb##1_im; \ 00144 m##21_im = va##2_im * vb##1_re - va##2_re * vb##1_im; \ 00145 m##22_re = va##2_re * vb##2_re + va##2_im * vb##2_im; \ 00146 m##22_im = va##2_im * vb##2_re - va##2_re * vb##2_im; 00147 00148 //vc = va + vb*s 00149 #define SCALAR_MULT_ADD_SU3_VECTOR(va, vb, s, vc) do { \ 00150 vc##0_re = va##0_re + vb##0_re * s; \ 00151 vc##0_im = va##0_im + vb##0_im * s; \ 00152 vc##1_re = va##1_re + vb##1_re * s; \ 00153 vc##1_im = va##1_im + vb##1_im * s; \ 00154 vc##2_re = va##2_re + vb##2_re * s; \ 00155 vc##2_im = va##2_im + vb##2_im * s; \ 00156 }while (0) 00157 00158 00159 #define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx) do { \ 00160 switch(mydir){ \ 00161 case 0: \ 00162 new_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \ 00163 new_x1 = (new_x1==X1m1)?0:new_x1+1; \ 00164 break; \ 00165 case 1: \ 00166 new_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \ 00167 new_x2 = (new_x2==X2m1)?0:new_x2+1; \ 00168 break; \ 00169 case 2: \ 00170 new_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \ 00171 new_x3 = (new_x3==X3m1)?0:new_x3+1; \ 00172 break; \ 00173 case 3: \ 00174 new_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \ 00175 new_x4 = (new_x4==X4m1)?0:new_x4+1; \ 00176 break; \ 00177 } \ 00178 }while(0) 00179 00180 #define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx) do { \ 00181 switch(mydir){ \ 00182 case 0: \ 00183 new_idx = ( (new_x1==0)?idx+X1m1:idx-1); \ 00184 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \ 00185 break; \ 00186 case 1: \ 00187 new_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \ 00188 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \ 00189 break; \ 00190 case 2: \ 00191 new_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \ 00192 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \ 00193 break; \ 00194 case 3: \ 00195 new_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \ 00196 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \ 00197 break; \ 00198 } \ 00199 }while(0) 00200 00201 00202 #define FF_COMPUTE_NEW_FULL_IDX_PLUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \ 00203 switch(mydir){ \ 00204 case 0: \ 00205 new_idx = ( (old_x1==X1m1)?idx-X1m1:idx+1); \ 00206 break; \ 00207 case 1: \ 00208 new_idx = ( (old_x2==X2m1)?idx-X2X1mX1:idx+X1); \ 00209 break; \ 00210 case 2: \ 00211 new_idx = ( (old_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \ 00212 break; \ 00213 case 3: \ 00214 new_idx = ( (old_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \ 00215 break; \ 00216 } \ 00217 }while(0) 00218 00219 #define FF_COMPUTE_NEW_FULL_IDX_MINUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \ 00220 switch(mydir){ \ 00221 case 0: \ 00222 new_idx = ( (old_x1==0)?idx+X1m1:idx-1); \ 00223 break; \ 00224 case 1: \ 00225 new_idx = ( (old_x2==0)?idx+X2X1mX1:idx-X1); \ 00226 break; \ 00227 case 2: \ 00228 new_idx = ( (old_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \ 00229 break; \ 00230 case 3: \ 00231 new_idx = ( (old_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \ 00232 break; \ 00233 } \ 00234 }while(0) 00235 00236 //this macro require linka, linkb, and ah variables defined 00237 #define ADD_FORCE_TO_MOM(hw1, hw2, mom, idx, dir, cf,oddness) do{ \ 00238 float2 my_coeff; \ 00239 int mydir; \ 00240 if (GOES_BACKWARDS(dir)){ \ 00241 mydir=OPP_DIR(dir); \ 00242 my_coeff.x = -cf.x; \ 00243 my_coeff.y = -cf.y; \ 00244 }else{ \ 00245 mydir=dir; \ 00246 my_coeff.x = cf.x; \ 00247 my_coeff.y = cf.y; \ 00248 } \ 00249 float2 tmp_coeff; \ 00250 tmp_coeff.x = my_coeff.x; \ 00251 tmp_coeff.y = my_coeff.y; \ 00252 if(oddness){ \ 00253 tmp_coeff.x = - my_coeff.x; \ 00254 tmp_coeff.y = - my_coeff.y; \ 00255 } \ 00256 LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); \ 00257 UNCOMPRESS_ANTI_HERMITIAN(ah, linka); \ 00258 SU3_PROJECTOR(hw1##0, hw2##0, linkb); \ 00259 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.x, linka); \ 00260 SU3_PROJECTOR(hw1##1, hw2##1, linkb); \ 00261 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.y, linka); \ 00262 MAKE_ANTI_HERMITIAN(linka, ah); \ 00263 WRITE_ANTI_HERMITIAN_SINGLE(mom, mydir, idx, AH); \ 00264 }while(0) 00265 00266 #define FF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \ 00267 sign =1; \ 00268 switch(dir){ \ 00269 case XUP: \ 00270 if ( (i4 & 1) == 1){ \ 00271 sign = -1; \ 00272 } \ 00273 break; \ 00274 case YUP: \ 00275 if ( ((i4+i1) & 1) == 1){ \ 00276 sign = -1; \ 00277 } \ 00278 break; \ 00279 case ZUP: \ 00280 if ( ((i4+i1+i2) & 1) == 1){ \ 00281 sign = -1; \ 00282 } \ 00283 break; \ 00284 case TUP: \ 00285 if (i4 == X4m1 ){ \ 00286 sign = -1; \ 00287 } \ 00288 break; \ 00289 } \ 00290 }while (0) 00291 00292 00293 #define hwa00_re HWA0.x 00294 #define hwa00_im HWA0.y 00295 #define hwa01_re HWA1.x 00296 #define hwa01_im HWA1.y 00297 #define hwa02_re HWA2.x 00298 #define hwa02_im HWA2.y 00299 #define hwa10_re HWA3.x 00300 #define hwa10_im HWA3.y 00301 #define hwa11_re HWA4.x 00302 #define hwa11_im HWA4.y 00303 #define hwa12_re HWA5.x 00304 #define hwa12_im HWA5.y 00305 00306 #define hwb00_re HWB0.x 00307 #define hwb00_im HWB0.y 00308 #define hwb01_re HWB1.x 00309 #define hwb01_im HWB1.y 00310 #define hwb02_re HWB2.x 00311 #define hwb02_im HWB2.y 00312 #define hwb10_re HWB3.x 00313 #define hwb10_im HWB3.y 00314 #define hwb11_re HWB4.x 00315 #define hwb11_im HWB4.y 00316 #define hwb12_re HWB5.x 00317 #define hwb12_im HWB5.y 00318 00319 #define hwc00_re HWC0.x 00320 #define hwc00_im HWC0.y 00321 #define hwc01_re HWC1.x 00322 #define hwc01_im HWC1.y 00323 #define hwc02_re HWC2.x 00324 #define hwc02_im HWC2.y 00325 #define hwc10_re HWC3.x 00326 #define hwc10_im HWC3.y 00327 #define hwc11_re HWC4.x 00328 #define hwc11_im HWC4.y 00329 #define hwc12_re HWC5.x 00330 #define hwc12_im HWC5.y 00331 00332 #define hwd00_re HWD0.x 00333 #define hwd00_im HWD0.y 00334 #define hwd01_re HWD1.x 00335 #define hwd01_im HWD1.y 00336 #define hwd02_re HWD2.x 00337 #define hwd02_im HWD2.y 00338 #define hwd10_re HWD3.x 00339 #define hwd10_im HWD3.y 00340 #define hwd11_re HWD4.x 00341 #define hwd11_im HWD4.y 00342 #define hwd12_re HWD5.x 00343 #define hwd12_im HWD5.y 00344 00345 #define hwe00_re HWE0.x 00346 #define hwe00_im HWE0.y 00347 #define hwe01_re HWE1.x 00348 #define hwe01_im HWE1.y 00349 #define hwe02_re HWE2.x 00350 #define hwe02_im HWE2.y 00351 #define hwe10_re HWE3.x 00352 #define hwe10_im HWE3.y 00353 #define hwe11_re HWE4.x 00354 #define hwe11_im HWE4.y 00355 #define hwe12_re HWE5.x 00356 #define hwe12_im HWE5.y 00357 00358 00359 void 00360 fermion_force_init_cuda(QudaGaugeParam* param) 00361 { 00362 static int fermion_force_init_cuda_flag = 0; 00363 00364 if (fermion_force_init_cuda_flag){ 00365 return; 00366 } 00367 00368 fermion_force_init_cuda_flag=1; 00369 00370 init_kernel_cuda(param); 00371 } 00372 00373 00374 template<int sig_positive, int mu_positive, int oddBit> 00375 __global__ void 00376 do_middle_link_kernel(float2* tempxEven, float2* tempxOdd, 00377 float2* PmuEven, float2* PmuOdd, 00378 float2* P3Even, float2* P3Odd, 00379 int sig, int mu, float2 coeff, 00380 float4* linkEven, float4* linkOdd, 00381 float2* momEven, float2* momOdd) 00382 { 00383 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00384 00385 int z1 = FAST_INT_DIVIDE(sid, X1h); 00386 int x1h = sid - z1*X1h; 00387 int z2 = FAST_INT_DIVIDE(z1, X2); 00388 int x2 = z1 - z2*X2; 00389 int x4 = FAST_INT_DIVIDE(z2, X3); 00390 int x3 = z2 - x4*X3; 00391 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00392 int x1 = 2*x1h + x1odd; 00393 int X = 2*sid + x1odd; 00394 00395 int new_x1, new_x2, new_x3, new_x4; 00396 int new_mem_idx; 00397 int ad_link_sign=1; 00398 int ab_link_sign=1; 00399 int bc_link_sign=1; 00400 00401 float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00402 float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00403 float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00404 float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 00405 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00406 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00407 float2 AH0, AH1, AH2, AH3, AH4; 00408 00409 /* sig 00410 A________B 00411 mu | | 00412 D | |C 00413 00414 A is the current point (sid) 00415 */ 00416 00417 int point_b, point_c, point_d; 00418 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx; 00419 int mymu; 00420 00421 new_x1 = x1; 00422 new_x2 = x2; 00423 new_x3 = x3; 00424 new_x4 = x4; 00425 00426 if(mu_positive){ 00427 mymu =mu; 00428 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx); 00429 }else{ 00430 mymu = OPP_DIR(mu); 00431 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx); 00432 } 00433 point_d = (new_mem_idx >> 1); 00434 if (mu_positive){ 00435 ad_link_nbr_idx = point_d; 00436 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00437 }else{ 00438 ad_link_nbr_idx = sid; 00439 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00440 } 00441 00442 int mysig; 00443 if(sig_positive){ 00444 mysig = sig; 00445 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx); 00446 }else{ 00447 mysig = OPP_DIR(sig); 00448 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx); 00449 } 00450 point_c = (new_mem_idx >> 1); 00451 if (mu_positive){ 00452 bc_link_nbr_idx = point_c; 00453 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00454 } 00455 new_x1 = x1; 00456 new_x2 = x2; 00457 new_x3 = x3; 00458 new_x4 = x4; 00459 if(sig_positive){ 00460 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx); 00461 }else{ 00462 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx); 00463 } 00464 point_b = (new_mem_idx >> 1); 00465 00466 if (!mu_positive){ 00467 bc_link_nbr_idx = point_b; 00468 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00469 } 00470 00471 if(sig_positive){ 00472 ab_link_nbr_idx = sid; 00473 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4); 00474 }else{ 00475 ab_link_nbr_idx = point_b; 00476 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4); 00477 } 00478 00479 LOAD_HW(tempxOdd, point_d, HWA); 00480 if(mu_positive){ 00481 FF_LOAD_MATRIX(linkOdd, mymu, ad_link_nbr_idx, LINKA); 00482 }else{ 00483 FF_LOAD_MATRIX(linkEven, mymu, ad_link_nbr_idx, LINKA); 00484 } 00485 00486 RECONSTRUCT_LINK_12(mymu, ad_link_nbr_idx, ad_link_sign, linka); 00487 if (mu_positive){ 00488 ADJ_MAT_MUL_HW(linka, hwa, hwd); 00489 }else{ 00490 MAT_MUL_HW(linka, hwa, hwd); 00491 } 00492 WRITE_HW(PmuEven, sid, HWD); 00493 00494 LOAD_HW(tempxEven, point_c, HWA); 00495 if(mu_positive){ 00496 FF_LOAD_MATRIX(linkEven, mymu, bc_link_nbr_idx, LINKA); 00497 }else{ 00498 FF_LOAD_MATRIX(linkOdd, mymu, bc_link_nbr_idx, LINKA); 00499 } 00500 00501 RECONSTRUCT_LINK_12(mymu, bc_link_nbr_idx, bc_link_sign, linka); 00502 if (mu_positive){ 00503 ADJ_MAT_MUL_HW(linka, hwa, hwb); 00504 }else{ 00505 MAT_MUL_HW(linka, hwa, hwb); 00506 } 00507 if(sig_positive){ 00508 FF_LOAD_MATRIX(linkEven, mysig, ab_link_nbr_idx, LINKB); 00509 }else{ 00510 FF_LOAD_MATRIX(linkOdd, mysig, ab_link_nbr_idx, LINKB); 00511 } 00512 00513 RECONSTRUCT_LINK_12(mysig, ab_link_nbr_idx, ab_link_sign, linkb); 00514 if (sig_positive){ 00515 MAT_MUL_HW(linkb, hwb, hwc); 00516 }else{ 00517 ADJ_MAT_MUL_HW(linkb, hwb, hwc); 00518 } 00519 WRITE_HW(P3Even, sid, HWC); 00520 00521 if (sig_positive){ 00522 00523 //add the force to mom 00524 00525 ADD_FORCE_TO_MOM(hwc, hwd, momEven, sid, sig, coeff, oddBit); 00526 } 00527 } 00528 00529 static void 00530 middle_link_kernel(float2* tempxEven, float2* tempxOdd, 00531 float2* PmuEven, float2* PmuOdd, 00532 float2* P3Even, float2* P3Odd, 00533 int sig, int mu, float2 coeff, 00534 float4* linkEven, float4* linkOdd, FullGauge cudaSiteLink, 00535 float2* momEven, float2* momOdd, 00536 dim3 gridDim, dim3 BlockDim) 00537 { 00538 dim3 halfGridDim(gridDim.x/2, 1,1); 00539 00540 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00541 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00542 00543 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 00544 do_middle_link_kernel<1,1,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, 00545 PmuEven, PmuOdd, 00546 P3Even, P3Odd, 00547 sig, mu, coeff, 00548 linkEven, linkOdd, 00549 momEven, momOdd); 00550 cudaUnbindTexture(siteLink0TexSingle); 00551 cudaUnbindTexture(siteLink1TexSingle); 00552 00553 //opposive binding 00554 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00555 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00556 00557 do_middle_link_kernel<1,1,1><<<halfGridDim, BlockDim>>>( tempxOdd, tempxEven, 00558 PmuOdd, PmuEven, 00559 P3Odd, P3Even, 00560 sig, mu, coeff, 00561 linkOdd, linkEven, 00562 momOdd, momEven); 00563 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 00564 do_middle_link_kernel<1,0,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, 00565 PmuEven, PmuOdd, 00566 P3Even, P3Odd, 00567 sig, mu, coeff, 00568 linkEven, linkOdd, 00569 momEven, momOdd); 00570 cudaUnbindTexture(siteLink0TexSingle); 00571 cudaUnbindTexture(siteLink1TexSingle); 00572 00573 //opposive binding 00574 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00575 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00576 00577 do_middle_link_kernel<1,0,1><<<halfGridDim, BlockDim>>>( tempxOdd, tempxEven, 00578 PmuOdd, PmuEven, 00579 P3Odd, P3Even, 00580 sig, mu, coeff, 00581 linkOdd, linkEven, 00582 momOdd, momEven); 00583 00584 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 00585 do_middle_link_kernel<0,1,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, 00586 PmuEven, PmuOdd, 00587 P3Even, P3Odd, 00588 sig, mu, coeff, 00589 linkEven, linkOdd, 00590 momEven, momOdd); 00591 cudaUnbindTexture(siteLink0TexSingle); 00592 cudaUnbindTexture(siteLink1TexSingle); 00593 00594 //opposive binding 00595 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00596 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00597 00598 do_middle_link_kernel<0,1,1><<<halfGridDim, BlockDim>>>( tempxOdd, tempxEven, 00599 PmuOdd, PmuEven, 00600 P3Odd, P3Even, 00601 sig, mu, coeff, 00602 linkOdd, linkEven, 00603 momOdd, momEven); 00604 }else{ 00605 do_middle_link_kernel<0,0,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, 00606 PmuEven, PmuOdd, 00607 P3Even, P3Odd, 00608 sig, mu, coeff, 00609 linkEven, linkOdd, 00610 momEven, momOdd); 00611 00612 cudaUnbindTexture(siteLink0TexSingle); 00613 cudaUnbindTexture(siteLink1TexSingle); 00614 00615 //opposive binding 00616 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00617 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00618 00619 do_middle_link_kernel<0,0,1><<<halfGridDim, BlockDim>>>( tempxOdd, tempxEven, 00620 PmuOdd, PmuEven, 00621 P3Odd, P3Even, 00622 sig, mu, coeff, 00623 linkOdd, linkEven, 00624 momOdd, momEven); 00625 } 00626 cudaUnbindTexture(siteLink0TexSingle); 00627 cudaUnbindTexture(siteLink1TexSingle); 00628 00629 } 00630 00631 template<int sig_positive, int mu_positive, int oddBit> 00632 __global__ void 00633 do_side_link_kernel(float2* P3Even, float2* P3Odd, 00634 float2* P3muEven, float2* P3muOdd, 00635 float2* TempxEven, float2* TempxOdd, 00636 float2* PmuEven, float2* PmuOdd, 00637 float2* shortPEven, float2* shortPOdd, 00638 int sig, int mu, float2 coeff, float2 accumu_coeff, 00639 float4* linkEven, float4* linkOdd, 00640 float2* momEven, float2* momOdd) 00641 { 00642 float2 mcoeff; 00643 mcoeff.x = -coeff.x; 00644 mcoeff.y = -coeff.y; 00645 00646 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00647 00648 int z1 = FAST_INT_DIVIDE(sid, X1h); 00649 int x1h = sid - z1*X1h; 00650 int z2 = FAST_INT_DIVIDE(z1, X2); 00651 int x2 = z1 - z2*X2; 00652 int x4 = FAST_INT_DIVIDE(z2, X3); 00653 int x3 = z2 - x4*X3; 00654 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00655 int x1 = 2*x1h + x1odd; 00656 int X = 2*sid + x1odd; 00657 00658 int ad_link_sign = 1; 00659 float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00660 float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00661 float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00662 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00663 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00664 float2 AH0, AH1, AH2, AH3, AH4; 00665 00666 00667 00668 /* 00669 * compute the side link contribution to the momentum 00670 * 00671 00672 sig 00673 A________B 00674 | | mu 00675 D | |C 00676 00677 A is the current point (sid) 00678 */ 00679 00680 int point_d; 00681 int ad_link_nbr_idx; 00682 int mymu; 00683 int new_mem_idx; 00684 00685 int new_x1 = x1; 00686 int new_x2 = x2; 00687 int new_x3 = x3; 00688 int new_x4 = x4; 00689 00690 if(mu_positive){ 00691 mymu =mu; 00692 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mymu,X, new_mem_idx); 00693 }else{ 00694 mymu = OPP_DIR(mu); 00695 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mymu, X, new_mem_idx); 00696 } 00697 point_d = (new_mem_idx >> 1); 00698 00699 if (mu_positive){ 00700 ad_link_nbr_idx = point_d; 00701 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00702 }else{ 00703 ad_link_nbr_idx = sid; 00704 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00705 } 00706 00707 00708 LOAD_HW(P3Even, sid, HWA); 00709 if(mu_positive){ 00710 FF_LOAD_MATRIX(linkOdd, mymu, ad_link_nbr_idx, LINKA); 00711 }else{ 00712 FF_LOAD_MATRIX(linkEven, mymu, ad_link_nbr_idx, LINKA); 00713 } 00714 00715 RECONSTRUCT_LINK_12(mymu, ad_link_nbr_idx, ad_link_sign, linka); 00716 if (mu_positive){ 00717 MAT_MUL_HW(linka, hwa, hwb); 00718 }else{ 00719 ADJ_MAT_MUL_HW(linka, hwa, hwb); 00720 } 00721 00722 00723 //start to add side link force 00724 if (mu_positive){ 00725 LOAD_HW(TempxOdd, point_d, HWC); 00726 00727 if (sig_positive){ 00728 ADD_FORCE_TO_MOM(hwb, hwc, momOdd, point_d, mu, coeff, 1-oddBit); 00729 }else{ 00730 ADD_FORCE_TO_MOM(hwc, hwb, momOdd, point_d, OPP_DIR(mu), mcoeff, 1- oddBit); 00731 } 00732 }else{ 00733 LOAD_HW(PmuEven, sid, HWC); 00734 if (sig_positive){ 00735 ADD_FORCE_TO_MOM(hwa, hwc, momEven, sid, mu, mcoeff, oddBit); 00736 }else{ 00737 ADD_FORCE_TO_MOM(hwc, hwa, momEven, sid, OPP_DIR(mu), coeff, oddBit); 00738 } 00739 00740 } 00741 00742 if (shortPOdd){ 00743 LOAD_HW(shortPOdd, point_d, HWA); 00744 SCALAR_MULT_ADD_SU3_VECTOR(hwa0, hwb0, accumu_coeff.x, hwa0); 00745 SCALAR_MULT_ADD_SU3_VECTOR(hwa1, hwb1, accumu_coeff.y, hwa1); 00746 WRITE_HW(shortPOdd, point_d, HWA); 00747 } 00748 00749 } 00750 00751 static void 00752 side_link_kernel(float2* P3Even, float2* P3Odd, 00753 float2* P3muEven, float2* P3muOdd, 00754 float2* TempxEven, float2* TempxOdd, 00755 float2* PmuEven, float2* PmuOdd, 00756 float2* shortPEven, float2* shortPOdd, 00757 int sig, int mu, float2 coeff, float2 accumu_coeff, 00758 float4* linkEven, float4* linkOdd, FullGauge cudaSiteLink, 00759 float2* momEven, float2* momOdd, 00760 dim3 gridDim, dim3 blockDim) 00761 { 00762 dim3 halfGridDim(gridDim.x/2,1,1); 00763 00764 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00765 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00766 00767 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 00768 do_side_link_kernel<1,1,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, 00769 P3muEven, P3muOdd, 00770 TempxEven, TempxOdd, 00771 PmuEven, PmuOdd, 00772 shortPEven, shortPOdd, 00773 sig, mu, coeff, accumu_coeff, 00774 linkEven, linkOdd, 00775 momEven, momOdd); 00776 cudaUnbindTexture(siteLink0TexSingle); 00777 cudaUnbindTexture(siteLink1TexSingle); 00778 00779 //opposive binding 00780 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00781 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00782 00783 do_side_link_kernel<1,1,1><<<halfGridDim, blockDim>>>( P3Odd, P3Even, 00784 P3muOdd, P3muEven, 00785 TempxOdd, TempxEven, 00786 PmuOdd, PmuEven, 00787 shortPOdd, shortPEven, 00788 sig, mu, coeff, accumu_coeff, 00789 linkOdd, linkEven, 00790 momOdd, momEven); 00791 00792 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 00793 do_side_link_kernel<1,0,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, 00794 P3muEven, P3muOdd, 00795 TempxEven, TempxOdd, 00796 PmuEven, PmuOdd, 00797 shortPEven, shortPOdd, 00798 sig, mu, coeff, accumu_coeff, 00799 linkEven, linkOdd, 00800 momEven, momOdd); 00801 cudaUnbindTexture(siteLink0TexSingle); 00802 cudaUnbindTexture(siteLink1TexSingle); 00803 00804 //opposive binding 00805 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00806 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00807 00808 do_side_link_kernel<1,0,1><<<halfGridDim, blockDim>>>( P3Odd, P3Even, 00809 P3muOdd, P3muEven, 00810 TempxOdd, TempxEven, 00811 PmuOdd, PmuEven, 00812 shortPOdd, shortPEven, 00813 sig, mu, coeff, accumu_coeff, 00814 linkOdd, linkEven, 00815 momOdd, momEven); 00816 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 00817 do_side_link_kernel<0,1,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, 00818 P3muEven, P3muOdd, 00819 TempxEven, TempxOdd, 00820 PmuEven, PmuOdd, 00821 shortPEven, shortPOdd, 00822 sig, mu, coeff, accumu_coeff, 00823 linkEven, linkOdd, 00824 momEven, momOdd); 00825 cudaUnbindTexture(siteLink0TexSingle); 00826 cudaUnbindTexture(siteLink1TexSingle); 00827 00828 //opposive binding 00829 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00830 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00831 00832 do_side_link_kernel<0,1,1><<<halfGridDim, blockDim>>>( P3Odd, P3Even, 00833 P3muOdd, P3muEven, 00834 TempxOdd, TempxEven, 00835 PmuOdd, PmuEven, 00836 shortPOdd, shortPEven, 00837 sig, mu, coeff, accumu_coeff, 00838 linkOdd, linkEven, 00839 momOdd, momEven); 00840 00841 }else{ 00842 do_side_link_kernel<0,0,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, 00843 P3muEven, P3muOdd, 00844 TempxEven, TempxOdd, 00845 PmuEven, PmuOdd, 00846 shortPEven, shortPOdd, 00847 sig, mu, coeff, accumu_coeff, 00848 linkEven, linkOdd, 00849 momEven, momOdd); 00850 cudaUnbindTexture(siteLink0TexSingle); 00851 cudaUnbindTexture(siteLink1TexSingle); 00852 00853 //opposive binding 00854 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 00855 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 00856 00857 do_side_link_kernel<0,0,1><<<halfGridDim, blockDim>>>( P3Odd, P3Even, 00858 P3muOdd, P3muEven, 00859 TempxOdd, TempxEven, 00860 PmuOdd, PmuEven, 00861 shortPOdd, shortPEven, 00862 sig, mu, coeff, accumu_coeff, 00863 linkOdd, linkEven, 00864 momOdd, momEven); 00865 } 00866 00867 cudaUnbindTexture(siteLink0TexSingle); 00868 cudaUnbindTexture(siteLink1TexSingle); 00869 00870 } 00871 00872 template<int sig_positive, int mu_positive, int oddBit> 00873 __global__ void 00874 do_all_link_kernel(float2* tempxEven, float2* tempxOdd, 00875 float2* PmuEven, float2* PmuOdd, 00876 float2* P3Even, float2* P3Odd, 00877 float2* P3muEven, float2* P3muOdd, 00878 float2* shortPEven, float2* shortPOdd, 00879 int sig, int mu, float2 coeff, float2 mcoeff, float2 accumu_coeff, 00880 float4* linkEven, float4* linkOdd, 00881 float2* momEven, float2* momOdd) 00882 { 00883 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00884 00885 int z1 = FAST_INT_DIVIDE(sid, X1h); 00886 int x1h = sid - z1*X1h; 00887 int z2 = FAST_INT_DIVIDE(z1, X2); 00888 int x2 = z1 - z2*X2; 00889 int x4 = FAST_INT_DIVIDE(z2, X3); 00890 int x3 = z2 - x4*X3; 00891 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00892 int x1 = 2*x1h + x1odd; 00893 int X = 2*sid + x1odd; 00894 00895 int new_x1, new_x2, new_x3, new_x4; 00896 int ad_link_sign=1; 00897 int ab_link_sign=1; 00898 int bc_link_sign=1; 00899 00900 float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00901 float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00902 float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00903 float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 00904 float2 HWE0, HWE1, HWE2, HWE3, HWE4, HWE5; 00905 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00906 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00907 float4 LINKC0, LINKC1, LINKC2, LINKC3, LINKC4; 00908 float2 AH0, AH1, AH2, AH3, AH4; 00909 00910 00911 /* sig 00912 A________B 00913 mu | | 00914 D | |C 00915 00916 A is the current point (sid) 00917 */ 00918 00919 00920 int point_b, point_c, point_d; 00921 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx; 00922 int mymu; 00923 int new_mem_idx; 00924 new_x1 = x1; 00925 new_x2 = x2; 00926 new_x3 = x3; 00927 new_x4 = x4; 00928 00929 if(mu_positive){ 00930 mymu =mu; 00931 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx); 00932 }else{ 00933 mymu = OPP_DIR(mu); 00934 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx); 00935 } 00936 point_d = (new_mem_idx >> 1); 00937 00938 if (mu_positive){ 00939 ad_link_nbr_idx = point_d; 00940 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00941 }else{ 00942 ad_link_nbr_idx = sid; 00943 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00944 } 00945 00946 int mysig; 00947 if(sig_positive){ 00948 mysig = sig; 00949 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx); 00950 }else{ 00951 mysig = OPP_DIR(sig); 00952 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx); 00953 } 00954 point_c = (new_mem_idx >> 1); 00955 if (mu_positive){ 00956 bc_link_nbr_idx = point_c; 00957 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00958 } 00959 00960 new_x1 = x1; 00961 new_x2 = x2; 00962 new_x3 = x3; 00963 new_x4 = x4; 00964 if(sig_positive){ 00965 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx); 00966 }else{ 00967 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx); 00968 } 00969 point_b = (new_mem_idx >> 1); 00970 if (!mu_positive){ 00971 bc_link_nbr_idx = point_b; 00972 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00973 } 00974 00975 if(sig_positive){ 00976 ab_link_nbr_idx = sid; 00977 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4); 00978 }else{ 00979 ab_link_nbr_idx = point_b; 00980 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4); 00981 } 00982 00983 LOAD_HW(tempxOdd, point_d, HWE); 00984 if (mu_positive){ 00985 FF_LOAD_MATRIX(linkOdd, mymu, ad_link_nbr_idx, LINKC); 00986 }else{ 00987 FF_LOAD_MATRIX(linkEven, mymu, ad_link_nbr_idx, LINKC); 00988 } 00989 00990 RECONSTRUCT_LINK_12(mymu, ad_link_nbr_idx, ad_link_sign, linkc); 00991 if (mu_positive){ 00992 ADJ_MAT_MUL_HW(linkc, hwe, hwd); 00993 }else{ 00994 MAT_MUL_HW(linkc, hwe, hwd); 00995 } 00996 //we do not need to write Pmu here 00997 //WRITE_HW(myPmu, sid, HWD); 00998 00999 LOAD_HW(tempxEven, point_c, HWA); 01000 if (mu_positive){ 01001 FF_LOAD_MATRIX(linkEven, mymu, bc_link_nbr_idx, LINKA); 01002 }else{ 01003 FF_LOAD_MATRIX(linkOdd, mymu, bc_link_nbr_idx, LINKA); 01004 } 01005 01006 RECONSTRUCT_LINK_12(mymu, bc_link_nbr_idx, bc_link_sign, linka); 01007 if (mu_positive){ 01008 ADJ_MAT_MUL_HW(linka, hwa, hwb); 01009 }else{ 01010 MAT_MUL_HW(linka, hwa, hwb); 01011 } 01012 if (sig_positive){ 01013 FF_LOAD_MATRIX(linkEven, mysig, ab_link_nbr_idx, LINKA); 01014 }else{ 01015 FF_LOAD_MATRIX(linkOdd, mysig, ab_link_nbr_idx, LINKA); 01016 } 01017 01018 RECONSTRUCT_LINK_12(mysig, ab_link_nbr_idx, ab_link_sign, linka); 01019 if (sig_positive){ 01020 MAT_MUL_HW(linka, hwb, hwc); 01021 }else{ 01022 ADJ_MAT_MUL_HW(linka, hwb, hwc); 01023 } 01024 01025 //we do not need to write P3 here 01026 //WRITE_HW(myP3, sid, HWC); 01027 01028 if (sig_positive){ 01029 //add the force to mom 01030 ADD_FORCE_TO_MOM(hwc, hwd, momEven, sid, sig, mcoeff, oddBit); 01031 } 01032 01033 //P3 is hwc 01034 //ad_link is linkc 01035 if (mu_positive){ 01036 MAT_MUL_HW(linkc, hwc, hwa); 01037 }else{ 01038 ADJ_MAT_MUL_HW(linkc, hwc, hwa); 01039 } 01040 01041 //accumulate P7rho to P5 01042 //WRITE_HW(otherP3mu, point_d, HWA); 01043 LOAD_HW(shortPOdd, point_d, HWB); 01044 SCALAR_MULT_ADD_SU3_VECTOR(hwb0, hwa0, accumu_coeff.x, hwb0); 01045 SCALAR_MULT_ADD_SU3_VECTOR(hwb1, hwa1, accumu_coeff.y, hwb1); 01046 WRITE_HW(shortPOdd, point_d, HWB); 01047 01048 //hwe holds tempx at point_d 01049 //hwd holds Pmu at point A(sid) 01050 if (mu_positive){ 01051 if (sig_positive){ 01052 ADD_FORCE_TO_MOM(hwa, hwe, momOdd, point_d, mu, coeff, 1-oddBit); 01053 }else{ 01054 ADD_FORCE_TO_MOM(hwe, hwa, momOdd, point_d, OPP_DIR(mu), mcoeff, 1- oddBit); 01055 } 01056 }else{ 01057 if (sig_positive){ 01058 ADD_FORCE_TO_MOM(hwc, hwd, momEven, sid, mu, mcoeff, oddBit); 01059 }else{ 01060 ADD_FORCE_TO_MOM(hwd, hwc, momEven, sid, OPP_DIR(mu), coeff, oddBit); 01061 } 01062 01063 } 01064 01065 01066 } 01067 01068 01069 static void 01070 all_link_kernel(float2* tempxEven, float2* tempxOdd, 01071 float2* PmuEven, float2* PmuOdd, 01072 float2* P3Even, float2* P3Odd, 01073 float2* P3muEven, float2* P3muOdd, 01074 float2* shortPEven, float2* shortPOdd, 01075 int sig, int mu, float2 coeff, float2 mcoeff, float2 accumu_coeff, 01076 float4* linkEven, float4* linkOdd, FullGauge cudaSiteLink, 01077 float2* momEven, float2* momOdd, 01078 dim3 gridDim, dim3 blockDim) 01079 01080 { 01081 dim3 halfGridDim(gridDim.x/2, 1,1); 01082 01083 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 01084 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 01085 01086 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 01087 do_all_link_kernel<1,1,0><<<halfGridDim, blockDim>>>( tempxEven, tempxOdd, 01088 PmuEven, PmuOdd, 01089 P3Even, P3Odd, 01090 P3muEven, P3muOdd, 01091 shortPEven, shortPOdd, 01092 sig, mu, coeff, mcoeff, accumu_coeff, 01093 linkEven, linkOdd, 01094 momEven, momOdd); 01095 cudaUnbindTexture(siteLink0TexSingle); 01096 cudaUnbindTexture(siteLink1TexSingle); 01097 01098 //opposive binding 01099 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 01100 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 01101 do_all_link_kernel<1,1,1><<<halfGridDim, blockDim>>>( tempxOdd, tempxEven, 01102 PmuOdd, PmuEven, 01103 P3Odd, P3Even, 01104 P3muOdd, P3muEven, 01105 shortPOdd, shortPEven, 01106 sig, mu, coeff, mcoeff, accumu_coeff, 01107 linkOdd, linkEven, 01108 momOdd, momEven); 01109 01110 01111 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 01112 01113 do_all_link_kernel<1,0,0><<<halfGridDim, blockDim>>>( tempxEven, tempxOdd, 01114 PmuEven, PmuOdd, 01115 P3Even, P3Odd, 01116 P3muEven, P3muOdd, 01117 shortPEven, shortPOdd, 01118 sig, mu, coeff, mcoeff, accumu_coeff, 01119 linkEven, linkOdd, 01120 momEven, momOdd); 01121 cudaUnbindTexture(siteLink0TexSingle); 01122 cudaUnbindTexture(siteLink1TexSingle); 01123 01124 //opposive binding 01125 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 01126 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 01127 01128 do_all_link_kernel<1,0,1><<<halfGridDim, blockDim>>>( tempxOdd, tempxEven, 01129 PmuOdd, PmuEven, 01130 P3Odd, P3Even, 01131 P3muOdd, P3muEven, 01132 shortPOdd, shortPEven, 01133 sig, mu, coeff, mcoeff, accumu_coeff, 01134 linkOdd, linkEven, 01135 momOdd, momEven); 01136 01137 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 01138 do_all_link_kernel<0,1,0><<<halfGridDim, blockDim>>>( tempxEven, tempxOdd, 01139 PmuEven, PmuOdd, 01140 P3Even, P3Odd, 01141 P3muEven, P3muOdd, 01142 shortPEven, shortPOdd, 01143 sig, mu, coeff, mcoeff, accumu_coeff, 01144 linkEven, linkOdd, 01145 momEven, momOdd); 01146 cudaUnbindTexture(siteLink0TexSingle); 01147 cudaUnbindTexture(siteLink1TexSingle); 01148 01149 //opposive binding 01150 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 01151 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 01152 01153 01154 do_all_link_kernel<0,1,1><<<halfGridDim, blockDim>>>( tempxOdd, tempxEven, 01155 PmuOdd, PmuEven, 01156 P3Odd, P3Even, 01157 P3muOdd, P3muEven, 01158 shortPOdd, shortPEven, 01159 sig, mu, coeff, mcoeff, accumu_coeff, 01160 linkOdd, linkEven, 01161 momOdd, momEven); 01162 }else{ 01163 do_all_link_kernel<0,0,0><<<halfGridDim, blockDim>>>( tempxEven, tempxOdd, 01164 PmuEven, PmuOdd, 01165 P3Even, P3Odd, 01166 P3muEven, P3muOdd, 01167 shortPEven, shortPOdd, 01168 sig, mu, coeff, mcoeff, accumu_coeff, 01169 linkEven, linkOdd, 01170 momEven, momOdd); 01171 01172 cudaUnbindTexture(siteLink0TexSingle); 01173 cudaUnbindTexture(siteLink1TexSingle); 01174 01175 //opposive binding 01176 cudaBindTexture(0, siteLink0TexSingle, cudaSiteLink.odd, cudaSiteLink.bytes); 01177 cudaBindTexture(0, siteLink1TexSingle, cudaSiteLink.even, cudaSiteLink.bytes); 01178 01179 do_all_link_kernel<0,0,1><<<halfGridDim, blockDim>>>( tempxOdd, tempxEven, 01180 PmuOdd, PmuEven, 01181 P3Odd, P3Even, 01182 P3muOdd, P3muEven, 01183 shortPOdd, shortPEven, 01184 sig, mu, coeff, mcoeff, accumu_coeff, 01185 linkOdd, linkEven, 01186 momOdd, momEven); 01187 } 01188 01189 cudaUnbindTexture(siteLink0TexSingle); 01190 cudaUnbindTexture(siteLink1TexSingle); 01191 01192 } 01193 01194 01195 01196 __global__ void 01197 one_and_naik_terms_kernel(float2* TempxEven, float2* TempxOdd, 01198 float2* PmuEven, float2* PmuOdd, 01199 float2* PnumuEven, float2* PnumuOdd, 01200 int mu, float2 OneLink, float2 Naik, float2 mNaik, 01201 float4* linkEven, float4* linkOdd, 01202 float2* momEven, float2* momOdd) 01203 { 01204 int sid = blockIdx.x * blockDim.x + threadIdx.x; 01205 int oddBit = 0; 01206 float2* myTempx = TempxEven; 01207 float2* myPmu = PmuEven; 01208 float2* myPnumu = PnumuEven; 01209 float2* myMom = momEven; 01210 float4* myLink = linkEven; 01211 float2* otherTempx = TempxOdd; 01212 float2* otherPnumu = PnumuOdd; 01213 float4* otherLink = linkOdd; 01214 01215 float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 01216 float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 01217 float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 01218 float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 01219 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 01220 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 01221 float2 AH0, AH1, AH2, AH3, AH4; 01222 01223 if (sid >= Vh){ 01224 oddBit =1; 01225 sid -= Vh; 01226 01227 myTempx = TempxOdd; 01228 myPmu = PmuOdd; 01229 myPnumu = PnumuOdd; 01230 myMom = momOdd; 01231 myLink = linkOdd; 01232 otherTempx = TempxEven; 01233 otherPnumu = PnumuEven; 01234 otherLink = linkEven; 01235 } 01236 01237 int z1 = FAST_INT_DIVIDE(sid, X1h); 01238 int x1h = sid - z1*X1h; 01239 int z2 = FAST_INT_DIVIDE(z1, X2); 01240 int x2 = z1 - z2*X2; 01241 int x4 = FAST_INT_DIVIDE(z2, X3); 01242 int x3 = z2 - x4*X3; 01243 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 01244 int x1 = 2*x1h + x1odd; 01245 //int X = 2*sid + x1odd; 01246 01247 int dx[4]; 01248 int new_x1, new_x2, new_x3, new_x4, new_idx; 01249 int sign=1; 01250 01251 if (GOES_BACKWARDS(mu)){ 01252 //The one link 01253 LOAD_HW(myPmu, sid, HWA); 01254 LOAD_HW(myTempx, sid, HWB); 01255 ADD_FORCE_TO_MOM(hwa, hwb, myMom, sid, OPP_DIR(mu), OneLink, oddBit); 01256 01257 //Naik term 01258 dx[3]=dx[2]=dx[1]=dx[0]=0; 01259 dx[OPP_DIR(mu)] = -1; 01260 new_x1 = (x1 + dx[0] + X1)%X1; 01261 new_x2 = (x2 + dx[1] + X2)%X2; 01262 new_x3 = (x3 + dx[2] + X3)%X3; 01263 new_x4 = (x4 + dx[3] + X4)%X4; 01264 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01265 LOAD_HW(otherTempx, new_idx, HWA); 01266 LOAD_MATRIX(otherLink, OPP_DIR(mu), new_idx, LINKA); 01267 FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), new_x1,new_x2,new_x3,new_x4); 01268 RECONSTRUCT_LINK_12(OPP_DIR(mu), new_idx, sign, linka); 01269 ADJ_MAT_MUL_HW(linka, hwa, hwc); //Popmu 01270 01271 LOAD_HW(myPnumu, sid, HWD); 01272 ADD_FORCE_TO_MOM(hwd, hwc, myMom, sid, OPP_DIR(mu), mNaik, oddBit); 01273 01274 dx[3]=dx[2]=dx[1]=dx[0]=0; 01275 dx[OPP_DIR(mu)] = 1; 01276 new_x1 = (x1 + dx[0] + X1)%X1; 01277 new_x2 = (x2 + dx[1] + X2)%X2; 01278 new_x3 = (x3 + dx[2] + X3)%X3; 01279 new_x4 = (x4 + dx[3] + X4)%X4; 01280 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01281 LOAD_HW(otherPnumu, new_idx, HWA); 01282 LOAD_MATRIX(myLink, OPP_DIR(mu), sid, LINKA); 01283 FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), x1, x2, x3, x4); 01284 RECONSTRUCT_LINK_12(OPP_DIR(mu), sid, sign, linka); 01285 MAT_MUL_HW(linka, hwa, hwc); 01286 ADD_FORCE_TO_MOM(hwc, hwb, myMom, sid, OPP_DIR(mu), Naik, oddBit); 01287 }else{ 01288 dx[3]=dx[2]=dx[1]=dx[0]=0; 01289 dx[mu] = 1; 01290 new_x1 = (x1 + dx[0] + X1)%X1; 01291 new_x2 = (x2 + dx[1] + X2)%X2; 01292 new_x3 = (x3 + dx[2] + X3)%X3; 01293 new_x4 = (x4 + dx[3] + X4)%X4; 01294 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01295 LOAD_HW(otherTempx, new_idx, HWA); 01296 LOAD_MATRIX(myLink, mu, sid, LINKA); 01297 FF_COMPUTE_RECONSTRUCT_SIGN(sign, mu, x1, x2, x3, x4); 01298 RECONSTRUCT_LINK_12(mu, sid, sign, linka); 01299 MAT_MUL_HW(linka, hwa, hwb); 01300 01301 LOAD_HW(myPnumu, sid, HWC); 01302 ADD_FORCE_TO_MOM(hwb, hwc, myMom, sid, mu, Naik, oddBit); 01303 01304 01305 } 01306 } 01307 01308 01309 #define Pmu tempvec[0] 01310 #define Pnumu tempvec[1] 01311 #define Prhonumu tempvec[2] 01312 #define P7 tempvec[3] 01313 #define P7rho tempvec[4] 01314 #define P7rhonu tempvec[5] 01315 #define P5 tempvec[6] 01316 #define P3 tempvec[7] 01317 #define P5nu tempvec[3] 01318 #define P3mu tempvec[3] 01319 #define Popmu tempvec[4] 01320 #define Pmumumu tempvec[4] 01321 01322 template<typename Real> 01323 static void 01324 do_fermion_force_cuda(Real eps, Real weight1, Real weight2, Real* act_path_coeff, FullHw cudaHw, 01325 FullGauge cudaSiteLink, FullMom cudaMom, FullHw tempvec[8], QudaGaugeParam* param) 01326 { 01327 01328 int mu, nu, rho, sig; 01329 float2 coeff; 01330 01331 float2 OneLink, Lepage, Naik, FiveSt, ThreeSt, SevenSt; 01332 float2 mNaik, mLepage, mFiveSt, mThreeSt, mSevenSt; 01333 01334 Real ferm_epsilon; 01335 ferm_epsilon = 2.0*weight1*eps; 01336 OneLink.x = act_path_coeff[0]*ferm_epsilon ; 01337 Naik.x = act_path_coeff[1]*ferm_epsilon ; mNaik.x = -Naik.x; 01338 ThreeSt.x = act_path_coeff[2]*ferm_epsilon ; mThreeSt.x = -ThreeSt.x; 01339 FiveSt.x = act_path_coeff[3]*ferm_epsilon ; mFiveSt.x = -FiveSt.x; 01340 SevenSt.x = act_path_coeff[4]*ferm_epsilon ; mSevenSt.x = -SevenSt.x; 01341 Lepage.x = act_path_coeff[5]*ferm_epsilon ; mLepage.x = -Lepage.x; 01342 01343 ferm_epsilon = 2.0*weight2*eps; 01344 OneLink.y = act_path_coeff[0]*ferm_epsilon ; 01345 Naik.y = act_path_coeff[1]*ferm_epsilon ; mNaik.y = -Naik.y; 01346 ThreeSt.y = act_path_coeff[2]*ferm_epsilon ; mThreeSt.y = -ThreeSt.y; 01347 FiveSt.y = act_path_coeff[3]*ferm_epsilon ; mFiveSt.y = -FiveSt.y; 01348 SevenSt.y = act_path_coeff[4]*ferm_epsilon ; mSevenSt.y = -SevenSt.y; 01349 Lepage.y = act_path_coeff[5]*ferm_epsilon ; mLepage.y = -Lepage.y; 01350 01351 int DirectLinks[8] ; 01352 01353 for(mu=0;mu<8;mu++){ 01354 DirectLinks[mu] = 0 ; 01355 } 01356 01357 int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; 01358 dim3 blockDim(BLOCK_DIM,1,1); 01359 dim3 gridDim(volume/blockDim.x, 1, 1); 01360 01361 for(sig=0; sig < 8; sig++){ 01362 for(mu = 0; mu < 8; mu++){ 01363 if ( (mu == sig) || (mu == OPP_DIR(sig))){ 01364 continue; 01365 } 01366 //3-link 01367 //Kernel A: middle link 01368 01369 middle_link_kernel( (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01370 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01371 (float2*)P3.even.data, (float2*)P3.odd.data, 01372 sig, mu, mThreeSt, 01373 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01374 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01375 gridDim, blockDim); 01376 checkCudaError(); 01377 for(nu=0; nu < 8; nu++){ 01378 if (nu == sig || nu == OPP_DIR(sig) 01379 || nu == mu || nu == OPP_DIR(mu)){ 01380 continue; 01381 } 01382 //5-link: middle link 01383 //Kernel B 01384 middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01385 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01386 (float2*)P5.even.data, (float2*)P5.odd.data, 01387 sig, nu, FiveSt, 01388 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01389 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01390 gridDim, blockDim); 01391 checkCudaError(); 01392 01393 for(rho =0; rho < 8; rho++){ 01394 if (rho == sig || rho == OPP_DIR(sig) 01395 || rho == mu || rho == OPP_DIR(mu) 01396 || rho == nu || rho == OPP_DIR(nu)){ 01397 continue; 01398 } 01399 //7-link: middle link and side link 01400 //kernel C 01401 01402 if(FiveSt.x != 0)coeff.x = SevenSt.x/FiveSt.x ; else coeff.x = 0; 01403 if(FiveSt.y != 0)coeff.y = SevenSt.y/FiveSt.y ; else coeff.y = 0; 01404 all_link_kernel((float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01405 (float2*)Prhonumu.even.data, (float2*)Prhonumu.odd.data, 01406 (float2*)P7.even.data, (float2*)P7.odd.data, 01407 (float2*)P7rho.even.data, (float2*)P7rho.odd.data, 01408 (float2*)P5.even.data, (float2*)P5.odd.data, 01409 sig, rho, SevenSt,mSevenSt,coeff, 01410 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01411 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01412 gridDim, blockDim); 01413 checkCudaError(); 01414 01415 }//rho 01416 01417 //5-link: side link 01418 //kernel B2 01419 if(ThreeSt.x != 0)coeff.x = FiveSt.x/ThreeSt.x ; else coeff.x = 0; 01420 if(ThreeSt.y != 0)coeff.y = FiveSt.y/ThreeSt.y ; else coeff.y = 0; 01421 side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data, 01422 (float2*)P5nu.even.data, (float2*)P5nu.odd.data, 01423 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01424 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01425 (float2*)P3.even.data, (float2*)P3.odd.data, 01426 sig, nu, mFiveSt, coeff, 01427 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01428 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01429 gridDim, blockDim); 01430 checkCudaError(); 01431 01432 }//nu 01433 01434 //lepage 01435 //Kernel A2 01436 middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01437 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01438 (float2*)P5.even.data, (float2*)P5.odd.data, 01439 sig, mu, Lepage, 01440 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01441 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01442 gridDim, blockDim); 01443 checkCudaError(); 01444 01445 if(ThreeSt.x != 0)coeff.x = Lepage.x/ThreeSt.x ; else coeff.x = 0; 01446 if(ThreeSt.y != 0)coeff.y = Lepage.y/ThreeSt.y ; else coeff.y = 0; 01447 01448 side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data, 01449 (float2*)P5nu.even.data, (float2*)P5nu.odd.data, 01450 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01451 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01452 (float2*)P3.even.data, (float2*)P3.odd.data, 01453 sig, mu, mLepage,coeff, 01454 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01455 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01456 gridDim, blockDim); 01457 checkCudaError(); 01458 01459 //3-link side link 01460 coeff.x=coeff.y=0; 01461 side_link_kernel((float2*)P3.even.data, (float2*)P3.odd.data, 01462 (float2*)P3mu.even.data, (float2*)P3mu.odd.data, 01463 (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01464 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01465 (float2*)NULL, (float2*)NULL, 01466 sig, mu, ThreeSt,coeff, 01467 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, cudaSiteLink, 01468 (float2*)cudaMom.even, (float2*)cudaMom.odd, 01469 gridDim, blockDim); 01470 checkCudaError(); 01471 01472 //1-link and naik term 01473 if (!DirectLinks[mu]){ 01474 DirectLinks[mu]=1; 01475 //kernel Z 01476 one_and_naik_terms_kernel<<<gridDim, blockDim>>>((float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01477 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01478 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01479 mu, OneLink, Naik, mNaik, 01480 (float4*)cudaSiteLink.even, (float4*)cudaSiteLink.odd, 01481 (float2*)cudaMom.even, (float2*)cudaMom.odd); 01482 checkCudaError(); 01483 } 01484 01485 }//mu 01486 01487 }//sig 01488 01489 01490 } 01491 01492 #undef Pmu 01493 #undef Pnumu 01494 #undef Prhonumu 01495 #undef P7 01496 #undef P7rho 01497 #undef P7rhonu 01498 #undef P5 01499 #undef P3 01500 #undef P5nu 01501 #undef P3mu 01502 #undef Popmu 01503 #undef Pmumumu 01504 01505 void 01506 fermion_force_cuda(double eps, double weight1, double weight2, void* act_path_coeff, 01507 FullHw cudaHw, FullGauge cudaSiteLink, FullMom cudaMom, QudaGaugeParam* param) 01508 { 01509 int i; 01510 FullHw tempvec[8]; 01511 01512 for(i=0;i < 8;i++){ 01513 tempvec[i] = createHwQuda(param->X, param->cuda_prec); 01514 } 01515 01516 if (param->cuda_prec == QUDA_DOUBLE_PRECISION){ 01517 /* 01518 do_fermion_force_cuda( (double)eps, (double)weight1, (double)weight2, (double*)act_path_coeff, 01519 cudaHw, cudaSiteLink, cudaMom, tempvec, param); 01520 */ 01521 01522 }else{ 01523 do_fermion_force_cuda( (float)eps, (float)weight1, (float)weight2, (float*)act_path_coeff, 01524 cudaHw, cudaSiteLink, cudaMom, tempvec, param); 01525 } 01526 01527 for(i=0;i < 8;i++){ 01528 freeHwQuda(tempvec[i]); 01529 } 01530 01531 }
1.7.3