QUDA v0.4.0
A library for QCD on GPUs
|
00001 00002 #define xcomm kparam.ghostDim[0] 00003 #define ycomm kparam.ghostDim[1] 00004 #define zcomm kparam.ghostDim[2] 00005 #define tcomm kparam.ghostDim[3] 00006 00007 #if (N_IN_FLOATN == 4) 00008 #define linka00_re LINKA0.x 00009 #define linka00_im LINKA0.y 00010 #define linka01_re LINKA0.z 00011 #define linka01_im LINKA0.w 00012 #define linka02_re LINKA1.x 00013 #define linka02_im LINKA1.y 00014 #define linka10_re LINKA1.z 00015 #define linka10_im LINKA1.w 00016 #define linka11_re LINKA2.x 00017 #define linka11_im LINKA2.y 00018 #define linka12_re LINKA2.z 00019 #define linka12_im LINKA2.w 00020 #define linka20_re LINKA3.x 00021 #define linka20_im LINKA3.y 00022 #define linka21_re LINKA3.z 00023 #define linka21_im LINKA3.w 00024 #define linka22_re LINKA4.x 00025 #define linka22_im LINKA4.y 00026 00027 00028 #define linkb00_re LINKB0.x 00029 #define linkb00_im LINKB0.y 00030 #define linkb01_re LINKB0.z 00031 #define linkb01_im LINKB0.w 00032 #define linkb02_re LINKB1.x 00033 #define linkb02_im LINKB1.y 00034 #define linkb10_re LINKB1.z 00035 #define linkb10_im LINKB1.w 00036 #define linkb11_re LINKB2.x 00037 #define linkb11_im LINKB2.y 00038 #define linkb12_re LINKB2.z 00039 #define linkb12_im LINKB2.w 00040 #define linkb20_re LINKB3.x 00041 #define linkb20_im LINKB3.y 00042 #define linkb21_re LINKB3.z 00043 #define linkb21_im LINKB3.w 00044 #define linkb22_re LINKB4.x 00045 #define linkb22_im LINKB4.y 00046 00047 #else 00048 #define linka00_re LINKA0.x 00049 #define linka00_im LINKA0.y 00050 #define linka01_re LINKA1.x 00051 #define linka01_im LINKA1.y 00052 #define linka02_re LINKA2.x 00053 #define linka02_im LINKA2.y 00054 #define linka10_re LINKA3.x 00055 #define linka10_im LINKA3.y 00056 #define linka11_re LINKA4.x 00057 #define linka11_im LINKA4.y 00058 #define linka12_re LINKA5.x 00059 #define linka12_im LINKA5.y 00060 #define linka20_re LINKA6.x 00061 #define linka20_im LINKA6.y 00062 #define linka21_re LINKA7.x 00063 #define linka21_im LINKA7.y 00064 #define linka22_re LINKA8.x 00065 #define linka22_im LINKA8.y 00066 00067 #define linkb00_re LINKB0.x 00068 #define linkb00_im LINKB0.y 00069 #define linkb01_re LINKB1.x 00070 #define linkb01_im LINKB1.y 00071 #define linkb02_re LINKB2.x 00072 #define linkb02_im LINKB2.y 00073 #define linkb10_re LINKB3.x 00074 #define linkb10_im LINKB3.y 00075 #define linkb11_re LINKB4.x 00076 #define linkb11_im LINKB4.y 00077 #define linkb12_re LINKB5.x 00078 #define linkb12_im LINKB5.y 00079 #define linkb20_re LINKB6.x 00080 #define linkb20_im LINKB6.y 00081 #define linkb21_re LINKB7.x 00082 #define linkb21_im LINKB7.y 00083 #define linkb22_re LINKB8.x 00084 #define linkb22_im LINKB8.y 00085 00086 #endif 00087 00088 00089 #ifdef MULTI_GPU 00090 00091 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \ 00092 switch(mydir){ \ 00093 case 0: \ 00094 new_mem_idx = ((!xcomm) && (new_x1 == (X1+1)))?(idx - X1m1): idx+1; \ 00095 new_x1 = ((!xcomm)&& (new_x1 == (X1+1)))? (new_x1 - X1m1):(new_x1+1); \ 00096 break; \ 00097 case 1: \ 00098 new_mem_idx = ((!ycomm) && (new_x2 == (X2+1)))?(idx - X2m1*E1): idx+E1; \ 00099 new_x2 = ((!ycomm)&& (new_x2 == (X2+1)))? (new_x2 - X2m1):(new_x2+1); \ 00100 break; \ 00101 case 2: \ 00102 new_mem_idx = ((!zcomm) && (new_x3 == (X3+1)))?(idx - X3m1*E2E1): idx+E2E1; \ 00103 new_x3 = ((!zcomm)&& (new_x3 == (X3+1)))? (new_x3 - X3m1):(new_x3+1); \ 00104 break; \ 00105 case 3: \ 00106 new_mem_idx = ((!tcomm) && (new_x4 == (X4+1)))?(idx - X4m1*E3E2E1): idx+E3E2E1; \ 00107 new_x4 = ((!tcomm)&& (new_x4 == (X4+1)))? (new_x4 - X4m1):(new_x4+1); \ 00108 break; \ 00109 } \ 00110 }while(0) 00111 00112 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \ 00113 switch(mydir){ \ 00114 case 0: \ 00115 new_mem_idx = ((!xcomm) && new_x1 == 2)?(idx+X1m1):(idx-1); \ 00116 new_x1 = ((!xcomm) && new_x1 == 2)? (new_x1+X1m1): (new_x1-1); \ 00117 break; \ 00118 case 1: \ 00119 new_mem_idx = ((!ycomm) && new_x2 == 2)?(idx+X2m1*E1):(idx-E1); \ 00120 new_x2 = ((!ycomm) && new_x2 == 2)? (new_x2+X2m1): (new_x2-1); \ 00121 break; \ 00122 case 2: \ 00123 new_mem_idx = ((!zcomm) && new_x3 == 2)?(idx+X3m1*E2E1):(idx-E2E1); \ 00124 new_x3 = ((!zcomm) && new_x3 == 2)? (new_x3+X3m1): (new_x3-1); \ 00125 break; \ 00126 case 3: \ 00127 new_mem_idx = ((!tcomm) && new_x4 == 2)?(idx+X4m1*E3E2E1):(idx-E3E2E1); \ 00128 new_x4 = ((!tcomm) && new_x4 == 2)? (new_x4+X4m1): (new_x4-1); \ 00129 break; \ 00130 } \ 00131 }while(0) 00132 00133 #else 00134 #define COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx) do { \ 00135 switch(mydir){ \ 00136 case 0: \ 00137 new_mem_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \ 00138 new_x1 = (new_x1==X1m1)?0:new_x1+1; \ 00139 break; \ 00140 case 1: \ 00141 new_mem_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \ 00142 new_x2 = (new_x2==X2m1)?0:new_x2+1; \ 00143 break; \ 00144 case 2: \ 00145 new_mem_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \ 00146 new_x3 = (new_x3==X3m1)?0:new_x3+1; \ 00147 break; \ 00148 case 3: \ 00149 new_mem_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \ 00150 new_x4 = (new_x4==X4m1)?0:new_x4+1; \ 00151 break; \ 00152 } \ 00153 }while(0) 00154 00155 #define COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx) do { \ 00156 switch(mydir){ \ 00157 case 0: \ 00158 new_mem_idx = ( (new_x1==0)?idx+X1m1:idx-1); \ 00159 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \ 00160 break; \ 00161 case 1: \ 00162 new_mem_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \ 00163 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \ 00164 break; \ 00165 case 2: \ 00166 new_mem_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \ 00167 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \ 00168 break; \ 00169 case 3: \ 00170 new_mem_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \ 00171 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \ 00172 break; \ 00173 } \ 00174 }while(0) 00175 00176 #endif 00177 00178 00179 #define MULT_SU3_NN_TEST(ma, mb) do{ \ 00180 Float fa_re,fa_im, fb_re, fb_im, fc_re, fc_im; \ 00181 fa_re = \ 00182 ma##00_re * mb##00_re - ma##00_im * mb##00_im + \ 00183 ma##01_re * mb##10_re - ma##01_im * mb##10_im + \ 00184 ma##02_re * mb##20_re - ma##02_im * mb##20_im; \ 00185 fa_im = \ 00186 ma##00_re * mb##00_im + ma##00_im * mb##00_re + \ 00187 ma##01_re * mb##10_im + ma##01_im * mb##10_re + \ 00188 ma##02_re * mb##20_im + ma##02_im * mb##20_re; \ 00189 fb_re = \ 00190 ma##00_re * mb##01_re - ma##00_im * mb##01_im + \ 00191 ma##01_re * mb##11_re - ma##01_im * mb##11_im + \ 00192 ma##02_re * mb##21_re - ma##02_im * mb##21_im; \ 00193 fb_im = \ 00194 ma##00_re * mb##01_im + ma##00_im * mb##01_re + \ 00195 ma##01_re * mb##11_im + ma##01_im * mb##11_re + \ 00196 ma##02_re * mb##21_im + ma##02_im * mb##21_re; \ 00197 fc_re = \ 00198 ma##00_re * mb##02_re - ma##00_im * mb##02_im + \ 00199 ma##01_re * mb##12_re - ma##01_im * mb##12_im + \ 00200 ma##02_re * mb##22_re - ma##02_im * mb##22_im; \ 00201 fc_im = \ 00202 ma##00_re * mb##02_im + ma##00_im * mb##02_re + \ 00203 ma##01_re * mb##12_im + ma##01_im * mb##12_re + \ 00204 ma##02_re * mb##22_im + ma##02_im * mb##22_re; \ 00205 ma##00_re = fa_re; \ 00206 ma##00_im = fa_im; \ 00207 ma##01_re = fb_re; \ 00208 ma##01_im = fb_im; \ 00209 ma##02_re = fc_re; \ 00210 ma##02_im = fc_im; \ 00211 fa_re = \ 00212 ma##10_re * mb##00_re - ma##10_im * mb##00_im + \ 00213 ma##11_re * mb##10_re - ma##11_im * mb##10_im + \ 00214 ma##12_re * mb##20_re - ma##12_im * mb##20_im; \ 00215 fa_im = \ 00216 ma##10_re * mb##00_im + ma##10_im * mb##00_re + \ 00217 ma##11_re * mb##10_im + ma##11_im * mb##10_re + \ 00218 ma##12_re * mb##20_im + ma##12_im * mb##20_re; \ 00219 fb_re = \ 00220 ma##10_re * mb##01_re - ma##10_im * mb##01_im + \ 00221 ma##11_re * mb##11_re - ma##11_im * mb##11_im + \ 00222 ma##12_re * mb##21_re - ma##12_im * mb##21_im; \ 00223 fb_im = \ 00224 ma##10_re * mb##01_im + ma##10_im * mb##01_re + \ 00225 ma##11_re * mb##11_im + ma##11_im * mb##11_re + \ 00226 ma##12_re * mb##21_im + ma##12_im * mb##21_re; \ 00227 fc_re = \ 00228 ma##10_re * mb##02_re - ma##10_im * mb##02_im + \ 00229 ma##11_re * mb##12_re - ma##11_im * mb##12_im + \ 00230 ma##12_re * mb##22_re - ma##12_im * mb##22_im; \ 00231 fc_im = \ 00232 ma##10_re * mb##02_im + ma##10_im * mb##02_re + \ 00233 ma##11_re * mb##12_im + ma##11_im * mb##12_re + \ 00234 ma##12_re * mb##22_im + ma##12_im * mb##22_re; \ 00235 ma##10_re = fa_re; \ 00236 ma##10_im = fa_im; \ 00237 ma##11_re = fb_re; \ 00238 ma##11_im = fb_im; \ 00239 ma##12_re = fc_re; \ 00240 ma##12_im = fc_im; \ 00241 fa_re = \ 00242 ma##20_re * mb##00_re - ma##20_im * mb##00_im + \ 00243 ma##21_re * mb##10_re - ma##21_im * mb##10_im + \ 00244 ma##22_re * mb##20_re - ma##22_im * mb##20_im; \ 00245 fa_im = \ 00246 ma##20_re * mb##00_im + ma##20_im * mb##00_re + \ 00247 ma##21_re * mb##10_im + ma##21_im * mb##10_re + \ 00248 ma##22_re * mb##20_im + ma##22_im * mb##20_re; \ 00249 fb_re = \ 00250 ma##20_re * mb##01_re - ma##20_im * mb##01_im + \ 00251 ma##21_re * mb##11_re - ma##21_im * mb##11_im + \ 00252 ma##22_re * mb##21_re - ma##22_im * mb##21_im; \ 00253 fb_im = \ 00254 ma##20_re * mb##01_im + ma##20_im * mb##01_re + \ 00255 ma##21_re * mb##11_im + ma##21_im * mb##11_re + \ 00256 ma##22_re * mb##21_im + ma##22_im * mb##21_re; \ 00257 fc_re = \ 00258 ma##20_re * mb##02_re - ma##20_im * mb##02_im + \ 00259 ma##21_re * mb##12_re - ma##21_im * mb##12_im + \ 00260 ma##22_re * mb##22_re - ma##22_im * mb##22_im; \ 00261 fc_im = \ 00262 ma##20_re * mb##02_im + ma##20_im * mb##02_re + \ 00263 ma##21_re * mb##12_im + ma##21_im * mb##12_re + \ 00264 ma##22_re * mb##22_im + ma##22_im * mb##22_re; \ 00265 ma##20_re = fa_re; \ 00266 ma##20_im = fa_im; \ 00267 ma##21_re = fb_re; \ 00268 ma##21_im = fb_im; \ 00269 ma##22_re = fc_re; \ 00270 ma##22_im = fc_im; \ 00271 }while(0) 00272 00273 00274 #define MULT_SU3_NA_TEST(ma, mb) do{ \ 00275 Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \ 00276 fa_re = \ 00277 ma##00_re * mb##T00_re - ma##00_im * mb##T00_im + \ 00278 ma##01_re * mb##T10_re - ma##01_im * mb##T10_im + \ 00279 ma##02_re * mb##T20_re - ma##02_im * mb##T20_im; \ 00280 fa_im = \ 00281 ma##00_re * mb##T00_im + ma##00_im * mb##T00_re + \ 00282 ma##01_re * mb##T10_im + ma##01_im * mb##T10_re + \ 00283 ma##02_re * mb##T20_im + ma##02_im * mb##T20_re; \ 00284 fb_re = \ 00285 ma##00_re * mb##T01_re - ma##00_im * mb##T01_im + \ 00286 ma##01_re * mb##T11_re - ma##01_im * mb##T11_im + \ 00287 ma##02_re * mb##T21_re - ma##02_im * mb##T21_im; \ 00288 fb_im = \ 00289 ma##00_re * mb##T01_im + ma##00_im * mb##T01_re + \ 00290 ma##01_re * mb##T11_im + ma##01_im * mb##T11_re + \ 00291 ma##02_re * mb##T21_im + ma##02_im * mb##T21_re; \ 00292 fc_re = \ 00293 ma##00_re * mb##T02_re - ma##00_im * mb##T02_im + \ 00294 ma##01_re * mb##T12_re - ma##01_im * mb##T12_im + \ 00295 ma##02_re * mb##T22_re - ma##02_im * mb##T22_im; \ 00296 fc_im = \ 00297 ma##00_re * mb##T02_im + ma##00_im * mb##T02_re + \ 00298 ma##01_re * mb##T12_im + ma##01_im * mb##T12_re + \ 00299 ma##02_re * mb##T22_im + ma##02_im * mb##T22_re; \ 00300 ma##00_re = fa_re; \ 00301 ma##00_im = fa_im; \ 00302 ma##01_re = fb_re; \ 00303 ma##01_im = fb_im; \ 00304 ma##02_re = fc_re; \ 00305 ma##02_im = fc_im; \ 00306 fa_re = \ 00307 ma##10_re * mb##T00_re - ma##10_im * mb##T00_im + \ 00308 ma##11_re * mb##T10_re - ma##11_im * mb##T10_im + \ 00309 ma##12_re * mb##T20_re - ma##12_im * mb##T20_im; \ 00310 fa_im = \ 00311 ma##10_re * mb##T00_im + ma##10_im * mb##T00_re + \ 00312 ma##11_re * mb##T10_im + ma##11_im * mb##T10_re + \ 00313 ma##12_re * mb##T20_im + ma##12_im * mb##T20_re; \ 00314 fb_re = \ 00315 ma##10_re * mb##T01_re - ma##10_im * mb##T01_im + \ 00316 ma##11_re * mb##T11_re - ma##11_im * mb##T11_im + \ 00317 ma##12_re * mb##T21_re - ma##12_im * mb##T21_im; \ 00318 fb_im = \ 00319 ma##10_re * mb##T01_im + ma##10_im * mb##T01_re + \ 00320 ma##11_re * mb##T11_im + ma##11_im * mb##T11_re + \ 00321 ma##12_re * mb##T21_im + ma##12_im * mb##T21_re; \ 00322 fc_re = \ 00323 ma##10_re * mb##T02_re - ma##10_im * mb##T02_im + \ 00324 ma##11_re * mb##T12_re - ma##11_im * mb##T12_im + \ 00325 ma##12_re * mb##T22_re - ma##12_im * mb##T22_im; \ 00326 fc_im = \ 00327 ma##10_re * mb##T02_im + ma##10_im * mb##T02_re + \ 00328 ma##11_re * mb##T12_im + ma##11_im * mb##T12_re + \ 00329 ma##12_re * mb##T22_im + ma##12_im * mb##T22_re; \ 00330 ma##10_re = fa_re; \ 00331 ma##10_im = fa_im; \ 00332 ma##11_re = fb_re; \ 00333 ma##11_im = fb_im; \ 00334 ma##12_re = fc_re; \ 00335 ma##12_im = fc_im; \ 00336 fa_re = \ 00337 ma##20_re * mb##T00_re - ma##20_im * mb##T00_im + \ 00338 ma##21_re * mb##T10_re - ma##21_im * mb##T10_im + \ 00339 ma##22_re * mb##T20_re - ma##22_im * mb##T20_im; \ 00340 fa_im = \ 00341 ma##20_re * mb##T00_im + ma##20_im * mb##T00_re + \ 00342 ma##21_re * mb##T10_im + ma##21_im * mb##T10_re + \ 00343 ma##22_re * mb##T20_im + ma##22_im * mb##T20_re; \ 00344 fb_re = \ 00345 ma##20_re * mb##T01_re - ma##20_im * mb##T01_im + \ 00346 ma##21_re * mb##T11_re - ma##21_im * mb##T11_im + \ 00347 ma##22_re * mb##T21_re - ma##22_im * mb##T21_im; \ 00348 fb_im = \ 00349 ma##20_re * mb##T01_im + ma##20_im * mb##T01_re + \ 00350 ma##21_re * mb##T11_im + ma##21_im * mb##T11_re + \ 00351 ma##22_re * mb##T21_im + ma##22_im * mb##T21_re; \ 00352 fc_re = \ 00353 ma##20_re * mb##T02_re - ma##20_im * mb##T02_im + \ 00354 ma##21_re * mb##T12_re - ma##21_im * mb##T12_im + \ 00355 ma##22_re * mb##T22_re - ma##22_im * mb##T22_im; \ 00356 fc_im = \ 00357 ma##20_re * mb##T02_im + ma##20_im * mb##T02_re + \ 00358 ma##21_re * mb##T12_im + ma##21_im * mb##T12_re + \ 00359 ma##22_re * mb##T22_im + ma##22_im * mb##T22_re; \ 00360 ma##20_re = fa_re; \ 00361 ma##20_im = fa_im; \ 00362 ma##21_re = fb_re; \ 00363 ma##21_im = fb_im; \ 00364 ma##22_re = fc_re; \ 00365 ma##22_im = fc_im; \ 00366 }while(0) 00367 00368 00369 00370 #define MULT_SU3_AN_TEST(ma, mb) do{ \ 00371 Float fa_re, fa_im, fb_re, fb_im, fc_re, fc_im; \ 00372 fa_re = \ 00373 ma##T00_re * mb##00_re - ma##T00_im * mb##00_im + \ 00374 ma##T01_re * mb##10_re - ma##T01_im * mb##10_im + \ 00375 ma##T02_re * mb##20_re - ma##T02_im * mb##20_im; \ 00376 fa_im = \ 00377 ma##T00_re * mb##00_im + ma##T00_im * mb##00_re + \ 00378 ma##T01_re * mb##10_im + ma##T01_im * mb##10_re + \ 00379 ma##T02_re * mb##20_im + ma##T02_im * mb##20_re; \ 00380 fb_re = \ 00381 ma##T10_re * mb##00_re - ma##T10_im * mb##00_im + \ 00382 ma##T11_re * mb##10_re - ma##T11_im * mb##10_im + \ 00383 ma##T12_re * mb##20_re - ma##T12_im * mb##20_im; \ 00384 fb_im = \ 00385 ma##T10_re * mb##00_im + ma##T10_im * mb##00_re + \ 00386 ma##T11_re * mb##10_im + ma##T11_im * mb##10_re + \ 00387 ma##T12_re * mb##20_im + ma##T12_im * mb##20_re; \ 00388 fc_re = \ 00389 ma##T20_re * mb##00_re - ma##T20_im * mb##00_im + \ 00390 ma##T21_re * mb##10_re - ma##T21_im * mb##10_im + \ 00391 ma##T22_re * mb##20_re - ma##T22_im * mb##20_im; \ 00392 fc_im = \ 00393 ma##T20_re * mb##00_im + ma##T20_im * mb##00_re + \ 00394 ma##T21_re * mb##10_im + ma##T21_im * mb##10_re + \ 00395 ma##T22_re * mb##20_im + ma##T22_im * mb##20_re; \ 00396 mb##00_re = fa_re; \ 00397 mb##00_im = fa_im; \ 00398 mb##10_re = fb_re; \ 00399 mb##10_im = fb_im; \ 00400 mb##20_re = fc_re; \ 00401 mb##20_im = fc_im; \ 00402 fa_re = \ 00403 ma##T00_re * mb##01_re - ma##T00_im * mb##01_im + \ 00404 ma##T01_re * mb##11_re - ma##T01_im * mb##11_im + \ 00405 ma##T02_re * mb##21_re - ma##T02_im * mb##21_im; \ 00406 fa_im = \ 00407 ma##T00_re * mb##01_im + ma##T00_im * mb##01_re + \ 00408 ma##T01_re * mb##11_im + ma##T01_im * mb##11_re + \ 00409 ma##T02_re * mb##21_im + ma##T02_im * mb##21_re; \ 00410 fb_re = \ 00411 ma##T10_re * mb##01_re - ma##T10_im * mb##01_im + \ 00412 ma##T11_re * mb##11_re - ma##T11_im * mb##11_im + \ 00413 ma##T12_re * mb##21_re - ma##T12_im * mb##21_im; \ 00414 fb_im = \ 00415 ma##T10_re * mb##01_im + ma##T10_im * mb##01_re + \ 00416 ma##T11_re * mb##11_im + ma##T11_im * mb##11_re + \ 00417 ma##T12_re * mb##21_im + ma##T12_im * mb##21_re; \ 00418 fc_re = \ 00419 ma##T20_re * mb##01_re - ma##T20_im * mb##01_im + \ 00420 ma##T21_re * mb##11_re - ma##T21_im * mb##11_im + \ 00421 ma##T22_re * mb##21_re - ma##T22_im * mb##21_im; \ 00422 fc_im = \ 00423 ma##T20_re * mb##01_im + ma##T20_im * mb##01_re + \ 00424 ma##T21_re * mb##11_im + ma##T21_im * mb##11_re + \ 00425 ma##T22_re * mb##21_im + ma##T22_im * mb##21_re; \ 00426 mb##01_re = fa_re; \ 00427 mb##01_im = fa_im; \ 00428 mb##11_re = fb_re; \ 00429 mb##11_im = fb_im; \ 00430 mb##21_re = fc_re; \ 00431 mb##21_im = fc_im; \ 00432 fa_re = \ 00433 ma##T00_re * mb##02_re - ma##T00_im * mb##02_im + \ 00434 ma##T01_re * mb##12_re - ma##T01_im * mb##12_im + \ 00435 ma##T02_re * mb##22_re - ma##T02_im * mb##22_im; \ 00436 fa_im = \ 00437 ma##T00_re * mb##02_im + ma##T00_im * mb##02_re + \ 00438 ma##T01_re * mb##12_im + ma##T01_im * mb##12_re + \ 00439 ma##T02_re * mb##22_im + ma##T02_im * mb##22_re; \ 00440 fb_re = \ 00441 ma##T10_re * mb##02_re - ma##T10_im * mb##02_im + \ 00442 ma##T11_re * mb##12_re - ma##T11_im * mb##12_im + \ 00443 ma##T12_re * mb##22_re - ma##T12_im * mb##22_im; \ 00444 fb_im = \ 00445 ma##T10_re * mb##02_im + ma##T10_im * mb##02_re + \ 00446 ma##T11_re * mb##12_im + ma##T11_im * mb##12_re + \ 00447 ma##T12_re * mb##22_im + ma##T12_im * mb##22_re; \ 00448 fc_re = \ 00449 ma##T20_re * mb##02_re - ma##T20_im * mb##02_im + \ 00450 ma##T21_re * mb##12_re - ma##T21_im * mb##12_im + \ 00451 ma##T22_re * mb##22_re - ma##T22_im * mb##22_im; \ 00452 fc_im = \ 00453 ma##T20_re * mb##02_im + ma##T20_im * mb##02_re + \ 00454 ma##T21_re * mb##12_im + ma##T21_im * mb##12_re + \ 00455 ma##T22_re * mb##22_im + ma##T22_im * mb##22_re; \ 00456 mb##02_re = fa_re; \ 00457 mb##02_im = fa_im; \ 00458 mb##12_re = fb_re; \ 00459 mb##12_im = fb_im; \ 00460 mb##22_re = fc_re; \ 00461 mb##22_im = fc_im; \ 00462 }while(0) 00463 00464 00465 00466 #define print_matrix(mul) \ 00467 printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \ 00468 printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \ 00469 printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im); 00470 00471 00472 //FloatN can be float2/float4/double2 00473 //Float2 can be float2/double2 00474 template<int oddBit, typename Float2, typename FloatN, typename Float> 00475 __global__ void 00476 GAUGE_FORCE_KERN_NAME(Float2* momEven, Float2* momOdd, 00477 int dir, double eb3, 00478 FloatN* linkEven, FloatN* linkOdd, 00479 int* input_path, 00480 int* length, Float* path_coeff, int num_paths, kernel_param_t kparam) 00481 { 00482 int i,j=0; 00483 int sid = blockIdx.x * blockDim.x + threadIdx.x; 00484 00485 int z1 = sid / X1h; 00486 int x1h = sid - z1*X1h; 00487 int z2 = z1 / X2; 00488 int x2 = z1 - z2*X2; 00489 int x4 = z2 / X3; 00490 int x3 = z2 - x4*X3; 00491 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00492 int x1 = 2*x1h + x1odd; 00493 00494 #ifdef MULTI_GPU 00495 x4 += 2; x3 += 2; x2 += 2; x1 += 2; 00496 int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1; 00497 #else 00498 int X = 2*sid + x1odd; 00499 #endif 00500 00501 Float2* mymom=momEven; 00502 if (oddBit){ 00503 mymom = momOdd; 00504 } 00505 00506 DECLARE_LINK_VARS(LINKA); 00507 DECLARE_LINK_VARS(LINKB); 00508 Float2 STAPLE0, STAPLE1, STAPLE2, STAPLE3,STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8; 00509 Float2 AH0, AH1, AH2, AH3, AH4; 00510 00511 int new_mem_idx; 00512 00513 00514 SET_SU3_MATRIX(staple, 0); 00515 for(i=0;i < num_paths; i++){ 00516 int nbr_oddbit = (oddBit^1 ); 00517 00518 int new_x1 =x1; 00519 int new_x2 =x2; 00520 int new_x3 =x3; 00521 int new_x4 =x4; 00522 COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(dir, X); 00523 00524 //linka: current matrix 00525 //linkb: the loaded matrix in this round 00526 SET_UNIT_SU3_MATRIX(linka); 00527 int* path = input_path + i*path_max_length; 00528 00529 int lnkdir; 00530 int path0 = path[0]; 00531 if (GOES_FORWARDS(path0)){ 00532 lnkdir=path0; 00533 }else{ 00534 lnkdir=OPP_DIR(path0); 00535 COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(path0), new_mem_idx); 00536 nbr_oddbit = nbr_oddbit^1; 00537 00538 } 00539 00540 int nbr_idx = new_mem_idx >>1; 00541 if (nbr_oddbit){ 00542 LOAD_ODD_MATRIX( lnkdir, nbr_idx, LINKB); 00543 }else{ 00544 LOAD_EVEN_MATRIX( lnkdir, nbr_idx, LINKB); 00545 } 00546 RECONSTRUCT_MATRIX(1, linkb); 00547 00548 if (GOES_FORWARDS(path0)){ 00549 COPY_SU3_MATRIX(linkb, linka); 00550 COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(path0, new_mem_idx); 00551 nbr_oddbit = nbr_oddbit^1; 00552 }else{ 00553 SU3_ADJOINT(linkb, linka); 00554 } 00555 00556 for(j=1; j < length[i]; j++){ 00557 00558 int lnkdir; 00559 int pathj = path[j]; 00560 if (GOES_FORWARDS(pathj)){ 00561 lnkdir=pathj; 00562 }else{ 00563 lnkdir=OPP_DIR(pathj); 00564 COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(pathj), new_mem_idx); 00565 nbr_oddbit = nbr_oddbit^1; 00566 00567 } 00568 00569 int nbr_idx = new_mem_idx >>1; 00570 if (nbr_oddbit){ 00571 LOAD_ODD_MATRIX(lnkdir, nbr_idx, LINKB); 00572 }else{ 00573 LOAD_EVEN_MATRIX(lnkdir, nbr_idx, LINKB); 00574 } 00575 RECONSTRUCT_MATRIX(1, linkb); 00576 if (GOES_FORWARDS(pathj)){ 00577 MULT_SU3_NN_TEST(linka, linkb); 00578 00579 COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(pathj, new_mem_idx); 00580 nbr_oddbit = nbr_oddbit^1; 00581 00582 00583 }else{ 00584 MULT_SU3_NA_TEST(linka, linkb); 00585 } 00586 00587 }//j 00588 SCALAR_MULT_ADD_SU3_MATRIX(staple, linka, path_coeff[i], staple); 00589 }//i 00590 00591 00592 //update mom 00593 if (oddBit){ 00594 LOAD_ODD_MATRIX(dir, (X>>1), LINKA); 00595 }else{ 00596 LOAD_EVEN_MATRIX(dir, (X>>1), LINKA); 00597 } 00598 RECONSTRUCT_MATRIX(1, linka); 00599 MULT_SU3_NN_TEST(linka, staple); 00600 LOAD_ANTI_HERMITIAN(mymom, dir, sid, AH); 00601 UNCOMPRESS_ANTI_HERMITIAN(ah, linkb); 00602 SCALAR_MULT_SUB_SU3_MATRIX(linkb, linka, eb3, linka); 00603 MAKE_ANTI_HERMITIAN(linka, ah); 00604 00605 WRITE_ANTI_HERMITIAN(mymom, dir, sid, AH, mom_ga_stride); 00606 00607 return; 00608 } 00609 00610 00611 00612 #undef COMPUTE_NEW_FULL_IDX_PLUS_UPDATE 00613 #undef COMPUTE_NEW_FULL_IDX_MINUS_UPDATE 00614 #undef MULT_SU3_NN_TEST 00615 #undef MULT_SU3_NA_TEST 00616 #undef MULT_SU3_AN_TEST 00617 00618 #undef linka00_re 00619 #undef linka00_im 00620 #undef linka01_re 00621 #undef linka01_im 00622 #undef linka02_re 00623 #undef linka02_im 00624 #undef linka10_re 00625 #undef linka10_im 00626 #undef linka11_re 00627 #undef linka11_im 00628 #undef linka12_re 00629 #undef linka12_im 00630 #undef linka20_re 00631 #undef linka20_im 00632 #undef linka21_re 00633 #undef linka21_im 00634 #undef linka22_re 00635 #undef linka22_im 00636 00637 #undef linkb00_re 00638 #undef linkb00_im 00639 #undef linkb01_re 00640 #undef linkb01_im 00641 #undef linkb02_re 00642 #undef linkb02_im 00643 #undef linkb10_re 00644 #undef linkb10_im 00645 #undef linkb11_re 00646 #undef linkb11_im 00647 #undef linkb12_re 00648 #undef linkb12_im 00649 #undef linkb20_re 00650 #undef linkb20_im 00651 #undef linkb21_re 00652 #undef linkb21_im 00653 #undef linkb22_re 00654 #undef linkb22_im