QUDA v0.4.0
A library for QCD on GPUs
|
00001 00002 #include <read_gauge.h> 00003 #include <gauge_field.h> 00004 00005 #include <fermion_force_quda.h> 00006 #include <force_common.h> 00007 #include <hw_quda.h> 00008 00009 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, Vh) 00010 00011 #define LOAD_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \ 00012 Float2* hw = (oddness)?hw_odd:hw_even; \ 00013 var##0 = hw[idx + 0*Vh]; \ 00014 var##1 = hw[idx + 1*Vh]; \ 00015 var##2 = hw[idx + 2*Vh]; \ 00016 var##3 = hw[idx + 3*Vh]; \ 00017 var##4 = hw[idx + 4*Vh]; \ 00018 var##5 = hw[idx + 5*Vh]; \ 00019 }while(0) 00020 00021 #define WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \ 00022 Float2* hw = (oddness)?hw_odd:hw_even; \ 00023 hw[idx + 0*Vh] = var##0; \ 00024 hw[idx + 1*Vh] = var##1; \ 00025 hw[idx + 2*Vh] = var##2; \ 00026 hw[idx + 3*Vh] = var##3; \ 00027 hw[idx + 4*Vh] = var##4; \ 00028 hw[idx + 5*Vh] = var##5; \ 00029 }while(0) 00030 00031 #define LOAD_HW(hw_eve, hw_odd, idx, var, oddness) LOAD_HW_SINGLE(hw_eve, hw_odd, idx, var, oddness) 00032 #define WRITE_HW(hw_even, hw_odd, idx, var, oddness) WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) 00033 #define LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var, Vh) 00034 00035 #define FF_SITE_MATRIX_LOAD_TEX 1 00036 00037 #define linkEvenTex siteLink0TexSingle_recon 00038 #define linkOddTex siteLink1TexSingle_recon 00039 00040 #if (FF_SITE_MATRIX_LOAD_TEX == 1) 00041 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE_TEX(((oddness)?linkOddTex:linkEvenTex), dir, idx, var, Vh) 00042 #else 00043 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE(((oddness)?linkOdd:linkEven), dir, idx, var, Vh) 00044 #endif 00045 00046 00047 #define linka00_re LINKA0.x 00048 #define linka00_im LINKA0.y 00049 #define linka01_re LINKA0.z 00050 #define linka01_im LINKA0.w 00051 #define linka02_re LINKA1.x 00052 #define linka02_im LINKA1.y 00053 #define linka10_re LINKA1.z 00054 #define linka10_im LINKA1.w 00055 #define linka11_re LINKA2.x 00056 #define linka11_im LINKA2.y 00057 #define linka12_re LINKA2.z 00058 #define linka12_im LINKA2.w 00059 #define linka20_re LINKA3.x 00060 #define linka20_im LINKA3.y 00061 #define linka21_re LINKA3.z 00062 #define linka21_im LINKA3.w 00063 #define linka22_re LINKA4.x 00064 #define linka22_im LINKA4.y 00065 00066 #define linkb00_re LINKB0.x 00067 #define linkb00_im LINKB0.y 00068 #define linkb01_re LINKB0.z 00069 #define linkb01_im LINKB0.w 00070 #define linkb02_re LINKB1.x 00071 #define linkb02_im LINKB1.y 00072 #define linkb10_re LINKB1.z 00073 #define linkb10_im LINKB1.w 00074 #define linkb11_re LINKB2.x 00075 #define linkb11_im LINKB2.y 00076 #define linkb12_re LINKB2.z 00077 #define linkb12_im LINKB2.w 00078 #define linkb20_re LINKB3.x 00079 #define linkb20_im LINKB3.y 00080 #define linkb21_re LINKB3.z 00081 #define linkb21_im LINKB3.w 00082 #define linkb22_re LINKB4.x 00083 #define linkb22_im LINKB4.y 00084 00085 00086 #define MAT_MUL_HW(M, HW, HWOUT) \ 00087 HWOUT##00_re = (M##00_re * HW##00_re - M##00_im * HW##00_im) \ 00088 + (M##01_re * HW##01_re - M##01_im * HW##01_im) \ 00089 + (M##02_re * HW##02_re - M##02_im * HW##02_im); \ 00090 HWOUT##00_im = (M##00_re * HW##00_im + M##00_im * HW##00_re) \ 00091 + (M##01_re * HW##01_im + M##01_im * HW##01_re) \ 00092 + (M##02_re * HW##02_im + M##02_im * HW##02_re); \ 00093 HWOUT##01_re = (M##10_re * HW##00_re - M##10_im * HW##00_im) \ 00094 + (M##11_re * HW##01_re - M##11_im * HW##01_im) \ 00095 + (M##12_re * HW##02_re - M##12_im * HW##02_im); \ 00096 HWOUT##01_im = (M##10_re * HW##00_im + M##10_im * HW##00_re) \ 00097 + (M##11_re * HW##01_im + M##11_im * HW##01_re) \ 00098 + (M##12_re * HW##02_im + M##12_im * HW##02_re); \ 00099 HWOUT##02_re = (M##20_re * HW##00_re - M##20_im * HW##00_im) \ 00100 + (M##21_re * HW##01_re - M##21_im * HW##01_im) \ 00101 + (M##22_re * HW##02_re - M##22_im * HW##02_im); \ 00102 HWOUT##02_im = (M##20_re * HW##00_im + M##20_im * HW##00_re) \ 00103 + (M##21_re * HW##01_im + M##21_im * HW##01_re) \ 00104 + (M##22_re * HW##02_im + M##22_im * HW##02_re); \ 00105 HWOUT##10_re = (M##00_re * HW##10_re - M##00_im * HW##10_im) \ 00106 + (M##01_re * HW##11_re - M##01_im * HW##11_im) \ 00107 + (M##02_re * HW##12_re - M##02_im * HW##12_im); \ 00108 HWOUT##10_im = (M##00_re * HW##10_im + M##00_im * HW##10_re) \ 00109 + (M##01_re * HW##11_im + M##01_im * HW##11_re) \ 00110 + (M##02_re * HW##12_im + M##02_im * HW##12_re); \ 00111 HWOUT##11_re = (M##10_re * HW##10_re - M##10_im * HW##10_im) \ 00112 + (M##11_re * HW##11_re - M##11_im * HW##11_im) \ 00113 + (M##12_re * HW##12_re - M##12_im * HW##12_im); \ 00114 HWOUT##11_im = (M##10_re * HW##10_im + M##10_im * HW##10_re) \ 00115 + (M##11_re * HW##11_im + M##11_im * HW##11_re) \ 00116 + (M##12_re * HW##12_im + M##12_im * HW##12_re); \ 00117 HWOUT##12_re = (M##20_re * HW##10_re - M##20_im * HW##10_im) \ 00118 + (M##21_re * HW##11_re - M##21_im * HW##11_im) \ 00119 + (M##22_re * HW##12_re - M##22_im * HW##12_im); \ 00120 HWOUT##12_im = (M##20_re * HW##10_im + M##20_im * HW##10_re) \ 00121 + (M##21_re * HW##11_im + M##21_im * HW##11_re) \ 00122 + (M##22_re * HW##12_im + M##22_im * HW##12_re); 00123 00124 00125 #define ADJ_MAT_MUL_HW(M, HW, HWOUT) \ 00126 HWOUT##00_re = (M##00_re * HW##00_re + M##00_im * HW##00_im) \ 00127 + (M##10_re * HW##01_re + M##10_im * HW##01_im) \ 00128 + (M##20_re * HW##02_re + M##20_im * HW##02_im); \ 00129 HWOUT##00_im = (M##00_re * HW##00_im - M##00_im * HW##00_re) \ 00130 + (M##10_re * HW##01_im - M##10_im * HW##01_re) \ 00131 + (M##20_re * HW##02_im - M##20_im * HW##02_re); \ 00132 HWOUT##01_re = (M##01_re * HW##00_re + M##01_im * HW##00_im) \ 00133 + (M##11_re * HW##01_re + M##11_im * HW##01_im) \ 00134 + (M##21_re * HW##02_re + M##21_im * HW##02_im); \ 00135 HWOUT##01_im = (M##01_re * HW##00_im - M##01_im * HW##00_re) \ 00136 + (M##11_re * HW##01_im - M##11_im * HW##01_re) \ 00137 + (M##21_re * HW##02_im - M##21_im * HW##02_re); \ 00138 HWOUT##02_re = (M##02_re * HW##00_re + M##02_im * HW##00_im) \ 00139 + (M##12_re * HW##01_re + M##12_im * HW##01_im) \ 00140 + (M##22_re * HW##02_re + M##22_im * HW##02_im); \ 00141 HWOUT##02_im = (M##02_re * HW##00_im - M##02_im * HW##00_re) \ 00142 + (M##12_re * HW##01_im - M##12_im * HW##01_re) \ 00143 + (M##22_re * HW##02_im - M##22_im * HW##02_re); \ 00144 HWOUT##10_re = (M##00_re * HW##10_re + M##00_im * HW##10_im) \ 00145 + (M##10_re * HW##11_re + M##10_im * HW##11_im) \ 00146 + (M##20_re * HW##12_re + M##20_im * HW##12_im); \ 00147 HWOUT##10_im = (M##00_re * HW##10_im - M##00_im * HW##10_re) \ 00148 + (M##10_re * HW##11_im - M##10_im * HW##11_re) \ 00149 + (M##20_re * HW##12_im - M##20_im * HW##12_re); \ 00150 HWOUT##11_re = (M##01_re * HW##10_re + M##01_im * HW##10_im) \ 00151 + (M##11_re * HW##11_re + M##11_im * HW##11_im) \ 00152 + (M##21_re * HW##12_re + M##21_im * HW##12_im); \ 00153 HWOUT##11_im = (M##01_re * HW##10_im - M##01_im * HW##10_re) \ 00154 + (M##11_re * HW##11_im - M##11_im * HW##11_re) \ 00155 + (M##21_re * HW##12_im - M##21_im * HW##12_re); \ 00156 HWOUT##12_re = (M##02_re * HW##10_re + M##02_im * HW##10_im) \ 00157 + (M##12_re * HW##11_re + M##12_im * HW##11_im) \ 00158 + (M##22_re * HW##12_re + M##22_im * HW##12_im); \ 00159 HWOUT##12_im = (M##02_re * HW##10_im - M##02_im * HW##10_re) \ 00160 + (M##12_re * HW##11_im - M##12_im * HW##11_re) \ 00161 + (M##22_re * HW##12_im - M##22_im * HW##12_re); 00162 00163 00164 #define SU3_PROJECTOR(va, vb, m) \ 00165 m##00_re = va##0_re * vb##0_re + va##0_im * vb##0_im; \ 00166 m##00_im = va##0_im * vb##0_re - va##0_re * vb##0_im; \ 00167 m##01_re = va##0_re * vb##1_re + va##0_im * vb##1_im; \ 00168 m##01_im = va##0_im * vb##1_re - va##0_re * vb##1_im; \ 00169 m##02_re = va##0_re * vb##2_re + va##0_im * vb##2_im; \ 00170 m##02_im = va##0_im * vb##2_re - va##0_re * vb##2_im; \ 00171 m##10_re = va##1_re * vb##0_re + va##1_im * vb##0_im; \ 00172 m##10_im = va##1_im * vb##0_re - va##1_re * vb##0_im; \ 00173 m##11_re = va##1_re * vb##1_re + va##1_im * vb##1_im; \ 00174 m##11_im = va##1_im * vb##1_re - va##1_re * vb##1_im; \ 00175 m##12_re = va##1_re * vb##2_re + va##1_im * vb##2_im; \ 00176 m##12_im = va##1_im * vb##2_re - va##1_re * vb##2_im; \ 00177 m##20_re = va##2_re * vb##0_re + va##2_im * vb##0_im; \ 00178 m##20_im = va##2_im * vb##0_re - va##2_re * vb##0_im; \ 00179 m##21_re = va##2_re * vb##1_re + va##2_im * vb##1_im; \ 00180 m##21_im = va##2_im * vb##1_re - va##2_re * vb##1_im; \ 00181 m##22_re = va##2_re * vb##2_re + va##2_im * vb##2_im; \ 00182 m##22_im = va##2_im * vb##2_re - va##2_re * vb##2_im; 00183 00184 //vc = va + vb*s 00185 #define SCALAR_MULT_ADD_SU3_VECTOR(va, vb, s, vc) do { \ 00186 vc##0_re = va##0_re + vb##0_re * s; \ 00187 vc##0_im = va##0_im + vb##0_im * s; \ 00188 vc##1_re = va##1_re + vb##1_re * s; \ 00189 vc##1_im = va##1_im + vb##1_im * s; \ 00190 vc##2_re = va##2_re + vb##2_re * s; \ 00191 vc##2_im = va##2_im + vb##2_im * s; \ 00192 }while (0) 00193 00194 00195 #define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx) do { \ 00196 switch(mydir){ \ 00197 case 0: \ 00198 new_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \ 00199 new_x1 = (new_x1==X1m1)?0:new_x1+1; \ 00200 break; \ 00201 case 1: \ 00202 new_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \ 00203 new_x2 = (new_x2==X2m1)?0:new_x2+1; \ 00204 break; \ 00205 case 2: \ 00206 new_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \ 00207 new_x3 = (new_x3==X3m1)?0:new_x3+1; \ 00208 break; \ 00209 case 3: \ 00210 new_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \ 00211 new_x4 = (new_x4==X4m1)?0:new_x4+1; \ 00212 break; \ 00213 } \ 00214 }while(0) 00215 00216 #define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx) do { \ 00217 switch(mydir){ \ 00218 case 0: \ 00219 new_idx = ( (new_x1==0)?idx+X1m1:idx-1); \ 00220 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \ 00221 break; \ 00222 case 1: \ 00223 new_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \ 00224 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \ 00225 break; \ 00226 case 2: \ 00227 new_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \ 00228 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \ 00229 break; \ 00230 case 3: \ 00231 new_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \ 00232 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \ 00233 break; \ 00234 } \ 00235 }while(0) 00236 00237 00238 #define FF_COMPUTE_NEW_FULL_IDX_PLUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \ 00239 switch(mydir){ \ 00240 case 0: \ 00241 new_idx = ( (old_x1==X1m1)?idx-X1m1:idx+1); \ 00242 break; \ 00243 case 1: \ 00244 new_idx = ( (old_x2==X2m1)?idx-X2X1mX1:idx+X1); \ 00245 break; \ 00246 case 2: \ 00247 new_idx = ( (old_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \ 00248 break; \ 00249 case 3: \ 00250 new_idx = ( (old_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \ 00251 break; \ 00252 } \ 00253 }while(0) 00254 00255 #define FF_COMPUTE_NEW_FULL_IDX_MINUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \ 00256 switch(mydir){ \ 00257 case 0: \ 00258 new_idx = ( (old_x1==0)?idx+X1m1:idx-1); \ 00259 break; \ 00260 case 1: \ 00261 new_idx = ( (old_x2==0)?idx+X2X1mX1:idx-X1); \ 00262 break; \ 00263 case 2: \ 00264 new_idx = ( (old_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \ 00265 break; \ 00266 case 3: \ 00267 new_idx = ( (old_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \ 00268 break; \ 00269 } \ 00270 }while(0) 00271 00272 //this macro require linka, linkb, and ah variables defined 00273 #define ADD_FORCE_TO_MOM(hw1, hw2, idx, dir, cf,oddness) do{ \ 00274 Float2 my_coeff; \ 00275 int mydir; \ 00276 if (GOES_BACKWARDS(dir)){ \ 00277 mydir=OPP_DIR(dir); \ 00278 my_coeff.x = -cf.x; \ 00279 my_coeff.y = -cf.y; \ 00280 }else{ \ 00281 mydir=dir; \ 00282 my_coeff.x = cf.x; \ 00283 my_coeff.y = cf.y; \ 00284 } \ 00285 Float2 tmp_coeff; \ 00286 tmp_coeff.x = my_coeff.x; \ 00287 tmp_coeff.y = my_coeff.y; \ 00288 if(oddness){ \ 00289 tmp_coeff.x = - my_coeff.x; \ 00290 tmp_coeff.y = - my_coeff.y; \ 00291 } \ 00292 Float2* mom = oddness?momOdd:momEven; \ 00293 LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); \ 00294 UNCOMPRESS_ANTI_HERMITIAN(ah, linka); \ 00295 SU3_PROJECTOR(hw1##0, hw2##0, linkb); \ 00296 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.x, linka); \ 00297 SU3_PROJECTOR(hw1##1, hw2##1, linkb); \ 00298 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.y, linka); \ 00299 MAKE_ANTI_HERMITIAN(linka, ah); \ 00300 WRITE_ANTI_HERMITIAN(mom, mydir, idx, AH, Vh); \ 00301 }while(0) 00302 00303 00304 #define FF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \ 00305 sign =1; \ 00306 switch(dir){ \ 00307 case XUP: \ 00308 if ( (i4 & 1) == 1){ \ 00309 sign = -1; \ 00310 } \ 00311 break; \ 00312 case YUP: \ 00313 if ( ((i4+i1) & 1) == 1){ \ 00314 sign = -1; \ 00315 } \ 00316 break; \ 00317 case ZUP: \ 00318 if ( ((i4+i1+i2) & 1) == 1){ \ 00319 sign = -1; \ 00320 } \ 00321 break; \ 00322 case TUP: \ 00323 if (i4 == X4m1 ){ \ 00324 sign = -1; \ 00325 } \ 00326 break; \ 00327 } \ 00328 }while (0) 00329 00330 00331 #define hwa00_re HWA0.x 00332 #define hwa00_im HWA0.y 00333 #define hwa01_re HWA1.x 00334 #define hwa01_im HWA1.y 00335 #define hwa02_re HWA2.x 00336 #define hwa02_im HWA2.y 00337 #define hwa10_re HWA3.x 00338 #define hwa10_im HWA3.y 00339 #define hwa11_re HWA4.x 00340 #define hwa11_im HWA4.y 00341 #define hwa12_re HWA5.x 00342 #define hwa12_im HWA5.y 00343 00344 #define hwb00_re HWB0.x 00345 #define hwb00_im HWB0.y 00346 #define hwb01_re HWB1.x 00347 #define hwb01_im HWB1.y 00348 #define hwb02_re HWB2.x 00349 #define hwb02_im HWB2.y 00350 #define hwb10_re HWB3.x 00351 #define hwb10_im HWB3.y 00352 #define hwb11_re HWB4.x 00353 #define hwb11_im HWB4.y 00354 #define hwb12_re HWB5.x 00355 #define hwb12_im HWB5.y 00356 00357 #define hwc00_re HWC0.x 00358 #define hwc00_im HWC0.y 00359 #define hwc01_re HWC1.x 00360 #define hwc01_im HWC1.y 00361 #define hwc02_re HWC2.x 00362 #define hwc02_im HWC2.y 00363 #define hwc10_re HWC3.x 00364 #define hwc10_im HWC3.y 00365 #define hwc11_re HWC4.x 00366 #define hwc11_im HWC4.y 00367 #define hwc12_re HWC5.x 00368 #define hwc12_im HWC5.y 00369 00370 #define hwd00_re HWD0.x 00371 #define hwd00_im HWD0.y 00372 #define hwd01_re HWD1.x 00373 #define hwd01_im HWD1.y 00374 #define hwd02_re HWD2.x 00375 #define hwd02_im HWD2.y 00376 #define hwd10_re HWD3.x 00377 #define hwd10_im HWD3.y 00378 #define hwd11_re HWD4.x 00379 #define hwd11_im HWD4.y 00380 #define hwd12_re HWD5.x 00381 #define hwd12_im HWD5.y 00382 00383 #define hwe00_re HWE0.x 00384 #define hwe00_im HWE0.y 00385 #define hwe01_re HWE1.x 00386 #define hwe01_im HWE1.y 00387 #define hwe02_re HWE2.x 00388 #define hwe02_im HWE2.y 00389 #define hwe10_re HWE3.x 00390 #define hwe10_im HWE3.y 00391 #define hwe11_re HWE4.x 00392 #define hwe11_im HWE4.y 00393 #define hwe12_re HWE5.x 00394 #define hwe12_im HWE5.y 00395 00396 00397 void fermion_force_init_cuda(QudaGaugeParam* param) 00398 { 00399 00400 #ifdef MULTI_GPU 00401 #error "multi gpu is not supported for fermion force computation" 00402 #endif 00403 00404 static int fermion_force_init_cuda_flag = 0; 00405 00406 if (fermion_force_init_cuda_flag) return; 00407 00408 fermion_force_init_cuda_flag=1; 00409 00410 } 00411 00412 /* 00413 * This function computes contribution to mometum from the middle link in a staple 00414 * 00415 * tempx: IN 00416 * Pmu: OUT 00417 * P3: OUT 00418 * 00419 */ 00420 00421 template<int sig_positive, int mu_positive, int oddBit, typename Float2> 00422 __global__ void 00423 do_middle_link_kernel(Float2* tempxEven, Float2* tempxOdd, 00424 Float2* PmuEven, Float2* PmuOdd, 00425 Float2* P3Even, Float2* P3Odd, 00426 int sig, int mu, Float2 coeff, 00427 float4* linkEven, float4* linkOdd, 00428 Float2* momEven, Float2* momOdd) 00429 { 00430 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00431 00432 int z1 = sid / X1h; 00433 int x1h = sid - z1*X1h; 00434 int z2 = z1 / X2; 00435 int x2 = z1 - z2*X2; 00436 int x4 = z2 / X3; 00437 int x3 = z2 - x4*X3; 00438 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00439 int x1 = 2*x1h + x1odd; 00440 int X = 2*sid + x1odd; 00441 00442 int new_x1, new_x2, new_x3, new_x4; 00443 int new_mem_idx; 00444 int ad_link_sign=1; 00445 int ab_link_sign=1; 00446 int bc_link_sign=1; 00447 00448 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00449 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00450 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00451 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 00452 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00453 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00454 Float2 AH0, AH1, AH2, AH3, AH4; 00455 00456 /* sig 00457 * A________B 00458 * mu | | 00459 * D | |C 00460 * 00461 * A is the current point (sid) 00462 */ 00463 00464 int point_b, point_c, point_d; 00465 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx; 00466 int mymu; 00467 00468 new_x1 = x1; 00469 new_x2 = x2; 00470 new_x3 = x3; 00471 new_x4 = x4; 00472 00473 if(mu_positive){ 00474 mymu =mu; 00475 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx); 00476 }else{ 00477 mymu = OPP_DIR(mu); 00478 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx); 00479 } 00480 point_d = (new_mem_idx >> 1); 00481 if (mu_positive){ 00482 ad_link_nbr_idx = point_d; 00483 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00484 }else{ 00485 ad_link_nbr_idx = sid; 00486 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00487 } 00488 00489 int mysig; 00490 if(sig_positive){ 00491 mysig = sig; 00492 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx); 00493 }else{ 00494 mysig = OPP_DIR(sig); 00495 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx); 00496 } 00497 point_c = (new_mem_idx >> 1); 00498 if (mu_positive){ 00499 bc_link_nbr_idx = point_c; 00500 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00501 } 00502 new_x1 = x1; 00503 new_x2 = x2; 00504 new_x3 = x3; 00505 new_x4 = x4; 00506 if(sig_positive){ 00507 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx); 00508 }else{ 00509 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx); 00510 } 00511 point_b = (new_mem_idx >> 1); 00512 00513 if (!mu_positive){ 00514 bc_link_nbr_idx = point_b; 00515 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00516 } 00517 00518 if(sig_positive){ 00519 ab_link_nbr_idx = sid; 00520 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4); 00521 }else{ 00522 ab_link_nbr_idx = point_b; 00523 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4); 00524 } 00525 00526 LOAD_HW(tempxEven, tempxOdd, point_d, HWA, 1-oddBit ); 00527 if(mu_positive){ 00528 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1-oddBit); 00529 }else{ 00530 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, oddBit); 00531 } 00532 00533 RECONSTRUCT_LINK_12(ad_link_sign, linka); 00534 if (mu_positive){ 00535 ADJ_MAT_MUL_HW(linka, hwa, hwd); 00536 }else{ 00537 MAT_MUL_HW(linka, hwa, hwd); 00538 } 00539 WRITE_HW(PmuEven,PmuOdd, sid, HWD, oddBit); 00540 00541 LOAD_HW(tempxEven,tempxOdd, point_c, HWA, oddBit); 00542 if(mu_positive){ 00543 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, oddBit); 00544 }else{ 00545 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-oddBit); 00546 } 00547 00548 RECONSTRUCT_LINK_12(bc_link_sign, linka); 00549 if (mu_positive){ 00550 ADJ_MAT_MUL_HW(linka, hwa, hwb); 00551 }else{ 00552 MAT_MUL_HW(linka, hwa, hwb); 00553 } 00554 if(sig_positive){ 00555 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB, oddBit); 00556 }else{ 00557 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB, 1-oddBit); 00558 } 00559 00560 RECONSTRUCT_LINK_12(ab_link_sign, linkb); 00561 if (sig_positive){ 00562 MAT_MUL_HW(linkb, hwb, hwc); 00563 }else{ 00564 ADJ_MAT_MUL_HW(linkb, hwb, hwc); 00565 } 00566 WRITE_HW(P3Even, P3Odd, sid, HWC, oddBit); 00567 00568 if (sig_positive){ 00569 //add the force to mom 00570 ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, coeff, oddBit); 00571 } 00572 } 00573 00574 00575 template<typename Float2> 00576 static void 00577 middle_link_kernel(Float2* tempxEven, Float2* tempxOdd, 00578 Float2* PmuEven, Float2* PmuOdd, 00579 Float2* P3Even, Float2* P3Odd, 00580 int sig, int mu, Float2 coeff, 00581 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink, 00582 Float2* momEven, Float2* momOdd, 00583 dim3 gridDim, dim3 BlockDim) 00584 { 00585 dim3 halfGridDim(gridDim.x/2, 1,1); 00586 00587 00588 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \ 00589 do_middle_link_kernel<sig_sign, mu_sign,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, \ 00590 PmuEven, PmuOdd, \ 00591 P3Even, P3Odd, \ 00592 sig, mu, coeff, \ 00593 linkEven, linkOdd, \ 00594 momEven, momOdd); \ 00595 do_middle_link_kernel<sig_sign, mu_sign, 1><<<halfGridDim, BlockDim>>>(tempxEven, tempxOdd, \ 00596 PmuEven, PmuOdd, \ 00597 P3Even, P3Odd, \ 00598 sig, mu, coeff, \ 00599 linkEven, linkOdd, \ 00600 momEven, momOdd); 00601 00602 00603 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 00604 CALL_MIDDLE_LINK_KERNEL(1, 1); 00605 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 00606 CALL_MIDDLE_LINK_KERNEL(1, 0); 00607 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 00608 CALL_MIDDLE_LINK_KERNEL(0, 1); 00609 }else{ 00610 CALL_MIDDLE_LINK_KERNEL(0, 0); 00611 } 00612 #undef CALL_MIDDLE_LINK_KERNEL 00613 00614 } 00615 00616 /* 00617 * Computes contribution to momentum from the side links in a staple 00618 * 00619 * P3: IN 00620 * P3mu: not used 00621 * Tempx: IN 00622 * Pmu: IN 00623 * shortPE: OUT 00624 * 00625 */ 00626 00627 template<int sig_positive, int mu_positive, int oddBit, typename Float2> 00628 __global__ void 00629 do_side_link_kernel(Float2* P3Even, Float2* P3Odd, 00630 Float2* P3muEven, Float2* P3muOdd, 00631 Float2* TempxEven, Float2* TempxOdd, 00632 Float2* PmuEven, Float2* PmuOdd, 00633 Float2* shortPEven, Float2* shortPOdd, 00634 int sig, int mu, Float2 coeff, Float2 accumu_coeff, 00635 float4* linkEven, float4* linkOdd, 00636 Float2* momEven, Float2* momOdd) 00637 { 00638 Float2 mcoeff; 00639 mcoeff.x = -coeff.x; 00640 mcoeff.y = -coeff.y; 00641 00642 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00643 00644 int z1 = sid / X1h; 00645 int x1h = sid - z1*X1h; 00646 int z2 = z1 / X2; 00647 int x2 = z1 - z2*X2; 00648 int x4 = z2 / X3; 00649 int x3 = z2 - x4*X3; 00650 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00651 int x1 = 2*x1h + x1odd; 00652 int X = 2*sid + x1odd; 00653 00654 int ad_link_sign = 1; 00655 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00656 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00657 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00658 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00659 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00660 Float2 AH0, AH1, AH2, AH3, AH4; 00661 00662 00663 00664 /* 00665 * compute the side link contribution to the momentum 00666 * 00667 * 00668 * sig 00669 * A________B 00670 * | | mu 00671 * D | |C 00672 * 00673 * A is the current point (sid) 00674 */ 00675 00676 int point_d; 00677 int ad_link_nbr_idx; 00678 int mymu; 00679 int new_mem_idx; 00680 00681 int new_x1 = x1; 00682 int new_x2 = x2; 00683 int new_x3 = x3; 00684 int new_x4 = x4; 00685 00686 if(mu_positive){ 00687 mymu =mu; 00688 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mymu,X, new_mem_idx); 00689 }else{ 00690 mymu = OPP_DIR(mu); 00691 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mymu, X, new_mem_idx); 00692 } 00693 point_d = (new_mem_idx >> 1); 00694 00695 if (mu_positive){ 00696 ad_link_nbr_idx = point_d; 00697 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00698 }else{ 00699 ad_link_nbr_idx = sid; 00700 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00701 } 00702 00703 00704 LOAD_HW(P3Even, P3Odd, sid, HWA, oddBit); 00705 if(mu_positive){ 00706 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1 - oddBit); 00707 }else{ 00708 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, oddBit); 00709 } 00710 00711 RECONSTRUCT_LINK_12(ad_link_sign, linka); 00712 if (mu_positive){ 00713 MAT_MUL_HW(linka, hwa, hwb); 00714 }else{ 00715 ADJ_MAT_MUL_HW(linka, hwa, hwb); 00716 } 00717 00718 00719 //start to add side link force 00720 if (mu_positive){ 00721 LOAD_HW(TempxEven, TempxOdd, point_d, HWC, 1-oddBit); 00722 00723 if (sig_positive){ 00724 ADD_FORCE_TO_MOM(hwb, hwc, point_d, mu, coeff, 1-oddBit); 00725 }else{ 00726 ADD_FORCE_TO_MOM(hwc, hwb, point_d, OPP_DIR(mu), mcoeff, 1- oddBit); 00727 } 00728 }else{ 00729 LOAD_HW(PmuEven, PmuOdd, sid, HWC, oddBit); 00730 if (sig_positive){ 00731 ADD_FORCE_TO_MOM(hwa, hwc, sid, mu, mcoeff, oddBit); 00732 }else{ 00733 ADD_FORCE_TO_MOM(hwc, hwa, sid, OPP_DIR(mu), coeff, oddBit); 00734 } 00735 00736 } 00737 00738 if (shortPOdd){ 00739 LOAD_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit); 00740 SCALAR_MULT_ADD_SU3_VECTOR(hwa0, hwb0, accumu_coeff.x, hwa0); 00741 SCALAR_MULT_ADD_SU3_VECTOR(hwa1, hwb1, accumu_coeff.y, hwa1); 00742 WRITE_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit); 00743 } 00744 00745 } 00746 00747 00748 template<typename Float2> 00749 static void 00750 side_link_kernel(Float2* P3Even, Float2* P3Odd, 00751 Float2* P3muEven, Float2* P3muOdd, 00752 Float2* TempxEven, Float2* TempxOdd, 00753 Float2* PmuEven, Float2* PmuOdd, 00754 Float2* shortPEven, Float2* shortPOdd, 00755 int sig, int mu, Float2 coeff, Float2 accumu_coeff, 00756 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink, 00757 Float2* momEven, Float2* momOdd, 00758 dim3 gridDim, dim3 blockDim) 00759 { 00760 dim3 halfGridDim(gridDim.x/2,1,1); 00761 00762 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \ 00763 do_side_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \ 00764 P3muEven, P3muOdd, \ 00765 TempxEven, TempxOdd, \ 00766 PmuEven, PmuOdd, \ 00767 shortPEven, shortPOdd, \ 00768 sig, mu, coeff, accumu_coeff, \ 00769 linkEven, linkOdd, \ 00770 momEven, momOdd); \ 00771 do_side_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \ 00772 P3muEven, P3muOdd, \ 00773 TempxEven, TempxOdd, \ 00774 PmuEven, PmuOdd, \ 00775 shortPEven, shortPOdd, \ 00776 sig, mu, coeff, accumu_coeff, \ 00777 linkEven, linkOdd, \ 00778 momEven, momOdd); 00779 00780 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 00781 CALL_SIDE_LINK_KERNEL(1,1); 00782 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 00783 CALL_SIDE_LINK_KERNEL(1,0); 00784 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 00785 CALL_SIDE_LINK_KERNEL(0,1); 00786 }else{ 00787 CALL_SIDE_LINK_KERNEL(0,0); 00788 } 00789 00790 #undef CALL_SITE_LINK_KERNEL 00791 00792 } 00793 00794 /* 00795 * This function computes the contribution to momentum from middle and side links 00796 * 00797 * tempx: IN 00798 * Pmu: not used 00799 * P3: not used 00800 * P3mu: not used 00801 * shortP: OUT 00802 * 00803 */ 00804 00805 template<int sig_positive, int mu_positive, int oddBit, typename Float2> 00806 __global__ void 00807 do_all_link_kernel(Float2* tempxEven, Float2* tempxOdd, 00808 Float2* PmuEven, Float2* PmuOdd, 00809 Float2* P3Even, Float2* P3Odd, 00810 Float2* P3muEven, Float2* P3muOdd, 00811 Float2* shortPEven, Float2* shortPOdd, 00812 int sig, int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff, 00813 float4* linkEven, float4* linkOdd, 00814 Float2* momEven, Float2* momOdd) 00815 { 00816 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00817 00818 int z1 = sid / X1h; 00819 int x1h = sid - z1*X1h; 00820 int z2 = z1 / X2; 00821 int x2 = z1 - z2*X2; 00822 int x4 = z2 / X3; 00823 int x3 = z2 - x4*X3; 00824 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00825 int x1 = 2*x1h + x1odd; 00826 int X = 2*sid + x1odd; 00827 00828 int new_x1, new_x2, new_x3, new_x4; 00829 int ad_link_sign=1; 00830 int ab_link_sign=1; 00831 int bc_link_sign=1; 00832 00833 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 00834 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 00835 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 00836 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 00837 Float2 HWE0, HWE1, HWE2, HWE3, HWE4, HWE5; 00838 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 00839 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 00840 float4 LINKC0, LINKC1, LINKC2, LINKC3, LINKC4; 00841 Float2 AH0, AH1, AH2, AH3, AH4; 00842 00843 00844 /* sig 00845 * A________B 00846 * mu | | 00847 * D | |C 00848 * 00849 * A is the current point (sid) 00850 */ 00851 00852 00853 int point_b, point_c, point_d; 00854 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx; 00855 int mymu; 00856 int new_mem_idx; 00857 new_x1 = x1; 00858 new_x2 = x2; 00859 new_x3 = x3; 00860 new_x4 = x4; 00861 00862 if(mu_positive){ 00863 mymu =mu; 00864 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx); 00865 }else{ 00866 mymu = OPP_DIR(mu); 00867 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx); 00868 } 00869 point_d = (new_mem_idx >> 1); 00870 00871 if (mu_positive){ 00872 ad_link_nbr_idx = point_d; 00873 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00874 }else{ 00875 ad_link_nbr_idx = sid; 00876 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4); 00877 } 00878 00879 int mysig; 00880 if(sig_positive){ 00881 mysig = sig; 00882 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx); 00883 }else{ 00884 mysig = OPP_DIR(sig); 00885 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx); 00886 } 00887 point_c = (new_mem_idx >> 1); 00888 if (mu_positive){ 00889 bc_link_nbr_idx = point_c; 00890 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00891 } 00892 00893 new_x1 = x1; 00894 new_x2 = x2; 00895 new_x3 = x3; 00896 new_x4 = x4; 00897 if(sig_positive){ 00898 FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx); 00899 }else{ 00900 FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx); 00901 } 00902 point_b = (new_mem_idx >> 1); 00903 if (!mu_positive){ 00904 bc_link_nbr_idx = point_b; 00905 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4); 00906 } 00907 00908 if(sig_positive){ 00909 ab_link_nbr_idx = sid; 00910 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4); 00911 }else{ 00912 ab_link_nbr_idx = point_b; 00913 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4); 00914 } 00915 00916 LOAD_HW(tempxEven, tempxOdd, point_d, HWE, 1-oddBit); 00917 if (mu_positive){ 00918 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC, 1-oddBit); 00919 }else{ 00920 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC, oddBit); 00921 } 00922 00923 RECONSTRUCT_LINK_12(ad_link_sign, linkc); 00924 if (mu_positive){ 00925 ADJ_MAT_MUL_HW(linkc, hwe, hwd); 00926 }else{ 00927 MAT_MUL_HW(linkc, hwe, hwd); 00928 } 00929 //we do not need to write Pmu here 00930 //WRITE_HW(myPmu, sid, HWD); 00931 00932 LOAD_HW(tempxEven, tempxOdd, point_c, HWA, oddBit); 00933 if (mu_positive){ 00934 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, oddBit); 00935 }else{ 00936 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-oddBit); 00937 } 00938 00939 RECONSTRUCT_LINK_12(bc_link_sign, linka); 00940 if (mu_positive){ 00941 ADJ_MAT_MUL_HW(linka, hwa, hwb); 00942 }else{ 00943 MAT_MUL_HW(linka, hwa, hwb); 00944 } 00945 if (sig_positive){ 00946 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA, oddBit); 00947 }else{ 00948 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA, 1-oddBit); 00949 } 00950 00951 RECONSTRUCT_LINK_12(ab_link_sign, linka); 00952 if (sig_positive){ 00953 MAT_MUL_HW(linka, hwb, hwc); 00954 }else{ 00955 ADJ_MAT_MUL_HW(linka, hwb, hwc); 00956 } 00957 00958 //we do not need to write P3 here 00959 //WRITE_HW(myP3, sid, HWC); 00960 00961 //The middle link contribution 00962 if (sig_positive){ 00963 //add the force to mom 00964 ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, mcoeff, oddBit); 00965 } 00966 00967 //P3 is hwc 00968 //ad_link is linkc 00969 if (mu_positive){ 00970 MAT_MUL_HW(linkc, hwc, hwa); 00971 }else{ 00972 ADJ_MAT_MUL_HW(linkc, hwc, hwa); 00973 } 00974 00975 //accumulate P7rho to P5 00976 //WRITE_HW(otherP3mu, point_d, HWA); 00977 LOAD_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit); 00978 SCALAR_MULT_ADD_SU3_VECTOR(hwb0, hwa0, accumu_coeff.x, hwb0); 00979 SCALAR_MULT_ADD_SU3_VECTOR(hwb1, hwa1, accumu_coeff.y, hwb1); 00980 WRITE_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit); 00981 00982 //hwe holds tempx at point_d 00983 //hwd holds Pmu at point A(sid) 00984 if (mu_positive){ 00985 if (sig_positive){ 00986 ADD_FORCE_TO_MOM(hwa, hwe, point_d, mu, coeff, 1-oddBit); 00987 }else{ 00988 ADD_FORCE_TO_MOM(hwe, hwa, point_d, OPP_DIR(mu), mcoeff, 1- oddBit); 00989 } 00990 }else{ 00991 if (sig_positive){ 00992 ADD_FORCE_TO_MOM(hwc, hwd, sid, mu, mcoeff, oddBit); 00993 }else{ 00994 ADD_FORCE_TO_MOM(hwd, hwc, sid, OPP_DIR(mu), coeff, oddBit); 00995 } 00996 00997 } 00998 00999 01000 } 01001 01002 01003 01004 template<typename Float2> 01005 static void 01006 all_link_kernel(Float2* tempxEven, Float2* tempxOdd, 01007 Float2* PmuEven, Float2* PmuOdd, 01008 Float2* P3Even, Float2* P3Odd, 01009 Float2* P3muEven, Float2* P3muOdd, 01010 Float2* shortPEven, Float2* shortPOdd, 01011 int sig, int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff, 01012 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink, 01013 Float2* momEven, Float2* momOdd, 01014 dim3 gridDim, dim3 blockDim) 01015 01016 { 01017 dim3 halfGridDim(gridDim.x/2, 1,1); 01018 01019 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \ 01020 do_all_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \ 01021 PmuEven, PmuOdd, \ 01022 P3Even, P3Odd, \ 01023 P3muEven, P3muOdd, \ 01024 shortPEven, shortPOdd, \ 01025 sig, mu, coeff, mcoeff, accumu_coeff, \ 01026 linkEven, linkOdd, \ 01027 momEven, momOdd); \ 01028 do_all_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \ 01029 PmuEven, PmuOdd, \ 01030 P3Even, P3Odd, \ 01031 P3muEven, P3muOdd, \ 01032 shortPEven, shortPOdd, \ 01033 sig, mu, coeff, mcoeff, accumu_coeff, \ 01034 linkEven, linkOdd, \ 01035 momEven, momOdd); 01036 01037 01038 if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){ 01039 CALL_ALL_LINK_KERNEL(1,1); 01040 }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){ 01041 CALL_ALL_LINK_KERNEL(1,0); 01042 }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){ 01043 CALL_ALL_LINK_KERNEL(0,1); 01044 }else{ 01045 CALL_ALL_LINK_KERNEL(0,0); 01046 } 01047 01048 #undef CALL_ALL_LINK_KERNEL 01049 01050 } 01051 01052 /* This function computes the one and naik terms' contribution to momentum 01053 * 01054 * Tempx: IN 01055 * Pmu: IN 01056 * Pnumu: IN 01057 * 01058 */ 01059 template <int oddBit, typename Float2> 01060 __global__ void 01061 do_one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd, 01062 Float2* PmuEven, Float2* PmuOdd, 01063 Float2* PnumuEven, Float2* PnumuOdd, 01064 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik, 01065 float4* linkEven, float4* linkOdd, 01066 Float2* momEven, Float2* momOdd) 01067 { 01068 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5; 01069 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5; 01070 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5; 01071 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5; 01072 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4; 01073 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4; 01074 Float2 AH0, AH1, AH2, AH3, AH4; 01075 01076 int sid = blockIdx.x * blockDim.x + threadIdx.x; 01077 int z1 = sid / X1h; 01078 int x1h = sid - z1*X1h; 01079 int z2 = z1 / X2; 01080 int x2 = z1 - z2*X2; 01081 int x4 = z2 / X3; 01082 int x3 = z2 - x4*X3; 01083 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 01084 int x1 = 2*x1h + x1odd; 01085 //int X = 2*sid + x1odd; 01086 01087 int dx[4]; 01088 int new_x1, new_x2, new_x3, new_x4, new_idx; 01089 int sign=1; 01090 01091 if (GOES_BACKWARDS(mu)){ 01092 //The one link 01093 LOAD_HW(PmuEven, PmuOdd, sid, HWA, oddBit); 01094 LOAD_HW(TempxEven, TempxOdd, sid, HWB, oddBit); 01095 ADD_FORCE_TO_MOM(hwa, hwb, sid, OPP_DIR(mu), OneLink, oddBit); 01096 01097 //Naik term 01098 dx[3]=dx[2]=dx[1]=dx[0]=0; 01099 dx[OPP_DIR(mu)] = -1; 01100 new_x1 = (x1 + dx[0] + X1)%X1; 01101 new_x2 = (x2 + dx[1] + X2)%X2; 01102 new_x3 = (x3 + dx[2] + X3)%X3; 01103 new_x4 = (x4 + dx[3] + X4)%X4; 01104 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01105 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit); 01106 FF_LOAD_MATRIX(OPP_DIR(mu), new_idx, LINKA, 1-oddBit); 01107 FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), new_x1,new_x2,new_x3,new_x4); 01108 RECONSTRUCT_LINK_12(sign, linka); 01109 ADJ_MAT_MUL_HW(linka, hwa, hwc); //Popmu 01110 01111 LOAD_HW(PnumuEven, PnumuOdd, sid, HWD, oddBit); 01112 ADD_FORCE_TO_MOM(hwd, hwc, sid, OPP_DIR(mu), mNaik, oddBit); 01113 01114 dx[3]=dx[2]=dx[1]=dx[0]=0; 01115 dx[OPP_DIR(mu)] = 1; 01116 new_x1 = (x1 + dx[0] + X1)%X1; 01117 new_x2 = (x2 + dx[1] + X2)%X2; 01118 new_x3 = (x3 + dx[2] + X3)%X3; 01119 new_x4 = (x4 + dx[3] + X4)%X4; 01120 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01121 LOAD_HW(PnumuEven, PnumuOdd, new_idx, HWA, 1-oddBit); 01122 FF_LOAD_MATRIX(OPP_DIR(mu), sid, LINKA, oddBit); 01123 FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), x1, x2, x3, x4); 01124 RECONSTRUCT_LINK_12(sign, linka); 01125 MAT_MUL_HW(linka, hwa, hwc); 01126 ADD_FORCE_TO_MOM(hwc, hwb, sid, OPP_DIR(mu), Naik, oddBit); 01127 }else{ 01128 dx[3]=dx[2]=dx[1]=dx[0]=0; 01129 dx[mu] = 1; 01130 new_x1 = (x1 + dx[0] + X1)%X1; 01131 new_x2 = (x2 + dx[1] + X2)%X2; 01132 new_x3 = (x3 + dx[2] + X3)%X3; 01133 new_x4 = (x4 + dx[3] + X4)%X4; 01134 new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1; 01135 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit); 01136 FF_LOAD_MATRIX(mu, sid, LINKA, oddBit); 01137 FF_COMPUTE_RECONSTRUCT_SIGN(sign, mu, x1, x2, x3, x4); 01138 RECONSTRUCT_LINK_12(sign, linka); 01139 MAT_MUL_HW(linka, hwa, hwb); 01140 01141 LOAD_HW(PnumuEven, PnumuOdd, sid, HWC, oddBit); 01142 ADD_FORCE_TO_MOM(hwb, hwc, sid, mu, Naik, oddBit); 01143 01144 01145 } 01146 } 01147 01148 template<typename Float2> 01149 static void 01150 one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd, 01151 Float2* PmuEven, Float2* PmuOdd, 01152 Float2* PnumuEven, Float2* PnumuOdd, 01153 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik, 01154 float4* linkEven, float4* linkOdd, 01155 Float2* momEven, Float2* momOdd, 01156 dim3 gridDim, dim3 blockDim) 01157 { 01158 dim3 halfGridDim(gridDim.x/2, 1,1); 01159 01160 do_one_and_naik_terms_kernel<0><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd, 01161 PmuEven, PmuOdd, 01162 PnumuEven, PnumuOdd, 01163 mu, OneLink, Naik, mNaik, 01164 linkEven, linkOdd, 01165 momEven, momOdd); 01166 do_one_and_naik_terms_kernel<1><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd, 01167 PmuEven, PmuOdd, 01168 PnumuEven, PnumuOdd, 01169 mu, OneLink, Naik, mNaik, 01170 linkEven, linkOdd, 01171 momEven, momOdd); 01172 return; 01173 } 01174 01175 01176 01177 #define Pmu tempvec[0] 01178 #define Pnumu tempvec[1] 01179 #define Prhonumu tempvec[2] 01180 #define P7 tempvec[3] 01181 #define P7rho tempvec[4] 01182 #define P7rhonu tempvec[5] 01183 #define P5 tempvec[6] 01184 #define P3 tempvec[7] 01185 #define P5nu tempvec[3] 01186 #define P3mu tempvec[3] 01187 #define Popmu tempvec[4] 01188 #define Pmumumu tempvec[4] 01189 01190 template<typename Real> 01191 static void 01192 do_fermion_force_cuda(Real eps, Real weight1, Real weight2, Real* act_path_coeff, FullHw cudaHw, 01193 cudaGaugeField &siteLink, cudaGaugeField &cudaMom, FullHw tempvec[8], QudaGaugeParam* param) 01194 { 01195 01196 int mu, nu, rho, sig; 01197 float2 coeff; 01198 01199 float2 OneLink, Lepage, Naik, FiveSt, ThreeSt, SevenSt; 01200 float2 mNaik, mLepage, mFiveSt, mThreeSt, mSevenSt; 01201 01202 Real ferm_epsilon; 01203 ferm_epsilon = 2.0*weight1*eps; 01204 OneLink.x = act_path_coeff[0]*ferm_epsilon ; 01205 Naik.x = act_path_coeff[1]*ferm_epsilon ; mNaik.x = -Naik.x; 01206 ThreeSt.x = act_path_coeff[2]*ferm_epsilon ; mThreeSt.x = -ThreeSt.x; 01207 FiveSt.x = act_path_coeff[3]*ferm_epsilon ; mFiveSt.x = -FiveSt.x; 01208 SevenSt.x = act_path_coeff[4]*ferm_epsilon ; mSevenSt.x = -SevenSt.x; 01209 Lepage.x = act_path_coeff[5]*ferm_epsilon ; mLepage.x = -Lepage.x; 01210 01211 ferm_epsilon = 2.0*weight2*eps; 01212 OneLink.y = act_path_coeff[0]*ferm_epsilon ; 01213 Naik.y = act_path_coeff[1]*ferm_epsilon ; mNaik.y = -Naik.y; 01214 ThreeSt.y = act_path_coeff[2]*ferm_epsilon ; mThreeSt.y = -ThreeSt.y; 01215 FiveSt.y = act_path_coeff[3]*ferm_epsilon ; mFiveSt.y = -FiveSt.y; 01216 SevenSt.y = act_path_coeff[4]*ferm_epsilon ; mSevenSt.y = -SevenSt.y; 01217 Lepage.y = act_path_coeff[5]*ferm_epsilon ; mLepage.y = -Lepage.y; 01218 01219 int DirectLinks[8] ; 01220 01221 for(mu=0;mu<8;mu++){ 01222 DirectLinks[mu] = 0 ; 01223 } 01224 01225 int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; 01226 dim3 blockDim(BLOCK_DIM,1,1); 01227 dim3 gridDim(volume/blockDim.x, 1, 1); 01228 01229 01230 cudaBindTexture(0, siteLink0TexSingle_recon, siteLink.Even_p(), siteLink.Bytes()/2); 01231 cudaBindTexture(0, siteLink1TexSingle_recon, siteLink.Odd_p(), siteLink.Bytes()/2); 01232 01233 01234 for(sig=0; sig < 8; sig++){ 01235 for(mu = 0; mu < 8; mu++){ 01236 if ( (mu == sig) || (mu == OPP_DIR(sig))){ 01237 continue; 01238 } 01239 //3-link 01240 //Kernel A: middle link 01241 01242 middle_link_kernel( (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01243 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01244 (float2*)P3.even.data, (float2*)P3.odd.data, 01245 sig, mu, mThreeSt, 01246 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01247 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01248 gridDim, blockDim); 01249 checkCudaError(); 01250 for(nu=0; nu < 8; nu++){ 01251 if (nu == sig || nu == OPP_DIR(sig) 01252 || nu == mu || nu == OPP_DIR(mu)){ 01253 continue; 01254 } 01255 //5-link: middle link 01256 //Kernel B 01257 middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01258 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01259 (float2*)P5.even.data, (float2*)P5.odd.data, 01260 sig, nu, FiveSt, 01261 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01262 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01263 gridDim, blockDim); 01264 checkCudaError(); 01265 01266 for(rho =0; rho < 8; rho++){ 01267 if (rho == sig || rho == OPP_DIR(sig) 01268 || rho == mu || rho == OPP_DIR(mu) 01269 || rho == nu || rho == OPP_DIR(nu)){ 01270 continue; 01271 } 01272 //7-link: middle link and side link 01273 //kernel C 01274 01275 if(FiveSt.x != 0)coeff.x = SevenSt.x/FiveSt.x ; else coeff.x = 0; 01276 if(FiveSt.y != 0)coeff.y = SevenSt.y/FiveSt.y ; else coeff.y = 0; 01277 all_link_kernel((float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01278 (float2*)Prhonumu.even.data, (float2*)Prhonumu.odd.data, 01279 (float2*)P7.even.data, (float2*)P7.odd.data, 01280 (float2*)P7rho.even.data, (float2*)P7rho.odd.data, 01281 (float2*)P5.even.data, (float2*)P5.odd.data, 01282 sig, rho, SevenSt,mSevenSt,coeff, 01283 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01284 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01285 gridDim, blockDim); 01286 checkCudaError(); 01287 01288 }//rho 01289 01290 //5-link: side link 01291 //kernel B2 01292 if(ThreeSt.x != 0)coeff.x = FiveSt.x/ThreeSt.x ; else coeff.x = 0; 01293 if(ThreeSt.y != 0)coeff.y = FiveSt.y/ThreeSt.y ; else coeff.y = 0; 01294 side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data, 01295 (float2*)P5nu.even.data, (float2*)P5nu.odd.data, 01296 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01297 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01298 (float2*)P3.even.data, (float2*)P3.odd.data, 01299 sig, nu, mFiveSt, coeff, 01300 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01301 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01302 gridDim, blockDim); 01303 checkCudaError(); 01304 01305 }//nu 01306 01307 //lepage 01308 //Kernel A2 01309 middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01310 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01311 (float2*)P5.even.data, (float2*)P5.odd.data, 01312 sig, mu, Lepage, 01313 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01314 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01315 gridDim, blockDim); 01316 checkCudaError(); 01317 01318 if(ThreeSt.x != 0)coeff.x = Lepage.x/ThreeSt.x ; else coeff.x = 0; 01319 if(ThreeSt.y != 0)coeff.y = Lepage.y/ThreeSt.y ; else coeff.y = 0; 01320 01321 side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data, 01322 (float2*)P5nu.even.data, (float2*)P5nu.odd.data, 01323 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01324 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01325 (float2*)P3.even.data, (float2*)P3.odd.data, 01326 sig, mu, mLepage,coeff, 01327 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01328 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01329 gridDim, blockDim); 01330 checkCudaError(); 01331 01332 //3-link side link 01333 coeff.x=coeff.y=0; 01334 side_link_kernel((float2*)P3.even.data, (float2*)P3.odd.data, 01335 (float2*)P3mu.even.data, (float2*)P3mu.odd.data, 01336 (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01337 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01338 (float2*)NULL, (float2*)NULL, 01339 sig, mu, ThreeSt,coeff, 01340 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink, 01341 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01342 gridDim, blockDim); 01343 checkCudaError(); 01344 01345 //1-link and naik term 01346 if (!DirectLinks[mu]){ 01347 DirectLinks[mu]=1; 01348 //kernel Z 01349 one_and_naik_terms_kernel((float2*)cudaHw.even.data, (float2*)cudaHw.odd.data, 01350 (float2*)Pmu.even.data, (float2*)Pmu.odd.data, 01351 (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data, 01352 mu, OneLink, Naik, mNaik, 01353 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), 01354 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(), 01355 gridDim, blockDim); 01356 01357 checkCudaError(); 01358 } 01359 01360 }//mu 01361 01362 }//sig 01363 01364 cudaUnbindTexture(siteLink0TexSingle_recon); 01365 cudaUnbindTexture(siteLink1TexSingle_recon); 01366 01367 } 01368 01369 #undef Pmu 01370 #undef Pnumu 01371 #undef Prhonumu 01372 #undef P7 01373 #undef P7rho 01374 #undef P7rhonu 01375 #undef P5 01376 #undef P3 01377 #undef P5nu 01378 #undef P3mu 01379 #undef Popmu 01380 #undef Pmumumu 01381 01382 void 01383 fermion_force_cuda(double eps, double weight1, double weight2, void* act_path_coeff, 01384 FullHw cudaHw, cudaGaugeField &siteLink, cudaGaugeField &cudaMom, QudaGaugeParam* param) 01385 { 01386 int i; 01387 FullHw tempvec[8]; 01388 01389 if (siteLink.Reconstruct() != QUDA_RECONSTRUCT_12) 01390 errorQuda("Reconstruct type %d not supported for gauge field", siteLink.Reconstruct()); 01391 01392 if (cudaMom.Reconstruct() != QUDA_RECONSTRUCT_10) 01393 errorQuda("Reconstruct type %d not supported for momentum field", cudaMom.Reconstruct()); 01394 01395 for(i=0;i < 8;i++){ 01396 tempvec[i] = createHwQuda(param->X, param->cuda_prec); 01397 } 01398 01399 if (param->cuda_prec == QUDA_DOUBLE_PRECISION){ 01400 /* 01401 do_fermion_force_cuda( (double)eps, (double)weight1, (double)weight2, (double*)act_path_coeff, 01402 cudaHw, siteLink, cudaMom, tempvec, param); 01403 */ 01404 errorQuda("Double precision not supported?"); 01405 }else{ 01406 do_fermion_force_cuda( (float)eps, (float)weight1, (float)weight2, (float*)act_path_coeff, 01407 cudaHw, siteLink, cudaMom, tempvec, param); 01408 } 01409 01410 for(i=0;i < 8;i++){ 01411 freeHwQuda(tempvec[i]); 01412 } 01413 01414 01415 01416 }