/*
 * QUDA v0.4.0 — A library for QCD on GPUs
 */
/* Ghost-zone (surface) volumes for each lattice dimension; ghostFace[] is
   provided by the including translation unit. */
#define Vsh_x ghostFace[0]
#define Vsh_y ghostFace[1]
#define Vsh_z ghostFace[2]
#define Vsh_t ghostFace[3]

/* Per-dimension partitioning flags taken from the kernel parameter struct:
   nonzero means that dimension is split across GPUs and has ghost zones. */
#define xcomm kparam.ghostDim[0]
#define ycomm kparam.ghostDim[1]
#define zcomm kparam.ghostDim[2]
#define tcomm kparam.ghostDim[3]
#define dimcomm kparam.ghostDim

/* Local sub-volume dimensions carried in the kernel parameter struct
   (D1h is the half/checkerboard extent of the first dimension). */
#define D1 kparam.D1
#define D2 kparam.D2
#define D3 kparam.D3
#define D4 kparam.D4
#define D1h kparam.D1h

/* With full 18-real reconstruction no staggered phase sign or updated
   coordinates are needed; with 12-real reconstruction the sign and the
   shifted coordinates (new_x*) must be tracked.  Note: new_x3 is commented
   out in DECLARE_NEW_X in the original code — x3 is never shifted by the
   UPDATE_COOR_* macros below. */
#if (RECONSTRUCT == 18)
#define DECLARE_VAR_SIGN
#define DECLARE_NEW_X
#ifdef MULTI_GPU
#define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};
#else
#define DECLARE_X_ARRAY
#endif
#else //RECONSTRUCT == 12
#define DECLARE_VAR_SIGN short sign=1
#define DECLARE_NEW_X short new_x1=x1; short new_x2=x2; \
  /*short new_x3=x3; */short new_x4=x4;
#define DECLARE_X_ARRAY int x[4] = {x1,x2,x3, x4};

#endif

/* Aliases mapping SU(3) matrix elements (row/col, re/im) onto the packed
   register variables.  For single precision with 12-real reconstruction the
   data is packed as float4 (x,y,z,w); otherwise as 2-component (x,y) pairs. */
#if (PRECISION == 1 && RECONSTRUCT == 12)

#define a00_re A0.x
#define a00_im A0.y
#define a01_re A0.z
#define a01_im A0.w
#define a02_re A1.x
#define a02_im A1.y
#define a10_re A1.z
#define a10_im A1.w
#define a11_re A2.x
#define a11_im A2.y
#define a12_re A2.z
#define a12_im A2.w
#define a20_re A3.x
#define a20_im A3.y
#define a21_re A3.z
#define a21_im A3.w
#define a22_re A4.x
#define a22_im A4.y

#define b00_re B0.x
#define b00_im B0.y
#define b01_re B0.z
#define b01_im B0.w
#define b02_re B1.x
#define b02_im B1.y
#define b10_re B1.z
#define b10_im B1.w
#define b11_re B2.x
#define b11_im B2.y
#define b12_re B2.z
#define b12_im B2.w
#define b20_re B3.x
#define b20_im B3.y
#define b21_re B3.z
#define b21_im B3.w
#define b22_re B4.x
#define b22_im B4.y

#define c00_re C0.x
#define c00_im C0.y
#define c01_re C0.z
#define c01_im C0.w
#define c02_re C1.x
#define c02_im C1.y
#define c10_re C1.z
#define c10_im C1.w
#define c11_re C2.x
#define c11_im C2.y
#define c12_re C2.z
#define c12_im C2.w
#define c20_re C3.x
#define c20_im C3.y
#define c21_re C3.z
#define c21_im C3.w
#define c22_re C4.x
#define c22_im C4.y

#else
/* Two-component packing: one complex element per register variable. */
#define a00_re A0.x
#define a00_im A0.y
#define a01_re A1.x
#define a01_im A1.y
#define a02_re A2.x
#define a02_im A2.y
#define a10_re A3.x
#define a10_im A3.y
#define a11_re A4.x
#define a11_im A4.y
#define a12_re A5.x
#define a12_im A5.y

#define a20_re A6.x
#define a20_im A6.y
#define a21_re A7.x
#define a21_im A7.y
#define a22_re A8.x
#define a22_im A8.y

#define b00_re B0.x
#define b00_im B0.y
#define b01_re B1.x
#define b01_im B1.y
#define b02_re B2.x
#define b02_im B2.y
#define b10_re B3.x
#define b10_im B3.y
#define b11_re B4.x
#define b11_im B4.y
#define b12_re B5.x
#define b12_im B5.y
#define b20_re B6.x
#define b20_im B6.y
#define b21_re B7.x
#define b21_im B7.y
#define b22_re B8.x
#define b22_im B8.y

#define c00_re C0.x
#define c00_im C0.y
#define c01_re C1.x
#define c01_im C1.y
#define c02_re C2.x
#define c02_im C2.y
#define c10_re C3.x
#define c10_im C3.y
#define c11_re C4.x
#define c11_im C4.y
#define c12_re C5.x
#define c12_im C5.y
#define c20_re C6.x
#define c20_im C6.y
#define c21_re C7.x
#define c21_im C7.y
#define c22_re C8.x
#define c22_im C8.y

#endif

/* Elements of the "mulink" matrix BB (always 2-component packing). */
#define bb00_re BB0.x
#define bb00_im BB0.y
#define bb01_re BB1.x
#define bb01_im BB1.y
#define bb02_re BB2.x
#define bb02_im BB2.y
#define bb10_re BB3.x
#define bb10_im BB3.y
#define bb11_re BB4.x
#define bb11_im BB4.y
#define bb12_re BB5.x
#define bb12_im BB5.y
#define bb20_re BB6.x
#define bb20_im BB6.y
#define bb21_re BB7.x
#define bb21_im BB7.y
#define bb22_re BB8.x
#define bb22_im BB8.y


/* Hermitian conjugate (conjugate transpose) views of a, b and c:
   element (i,j) of xT is conj(element (j,i) of x). */
#define aT00_re (+a00_re)
#define aT00_im (-a00_im)
#define aT01_re (+a10_re)
#define aT01_im (-a10_im)
#define aT02_re (+a20_re)
#define aT02_im (-a20_im)
#define aT10_re (+a01_re)
#define aT10_im (-a01_im)
#define aT11_re (+a11_re)
#define aT11_im (-a11_im)
#define aT12_re (+a21_re)
#define aT12_im (-a21_im)
#define aT20_re (+a02_re)
#define aT20_im (-a02_im)
#define aT21_re (+a12_re)
#define aT21_im (-a12_im)
#define aT22_re (+a22_re)
#define aT22_im (-a22_im)

#define bT00_re (+b00_re)
#define bT00_im (-b00_im)
#define bT01_re (+b10_re)
#define bT01_im (-b10_im)
#define bT02_re (+b20_re)
#define bT02_im (-b20_im)
#define bT10_re (+b01_re)
#define bT10_im (-b01_im)
#define bT11_re (+b11_re)
#define bT11_im (-b11_im)
#define bT12_re (+b21_re)
#define bT12_im (-b21_im)
#define bT20_re (+b02_re)
#define bT20_im (-b02_im)
#define bT21_re (+b12_re)
#define bT21_im (-b12_im)
#define bT22_re (+b22_re)
#define bT22_im (-b22_im)

#define cT00_re (+c00_re)
#define cT00_im (-c00_im)
#define cT01_re (+c10_re)
#define cT01_im (-c10_im)
#define cT02_re (+c20_re)
#define cT02_im (-c20_im)
#define cT10_re (+c01_re)
#define cT10_im (-c01_im)
#define cT11_re (+c11_re)
#define cT11_im (-c11_im)
#define cT12_re (+c21_re)
#define cT12_im (-c21_im)
#define cT20_re (+c02_re)
#define cT20_im (-c02_im)
#define cT21_re (+c12_re)
#define cT21_im (-c12_im)
#define cT22_re (+c22_re)
#define cT22_im (-c22_im)


/* Elements of the temporary accumulation matrices tempa/tempb. */
#define tempa00_re TEMPA0.x
#define tempa00_im TEMPA0.y
#define tempa01_re TEMPA1.x
#define tempa01_im TEMPA1.y
#define tempa02_re TEMPA2.x
#define tempa02_im TEMPA2.y
#define tempa10_re TEMPA3.x
#define tempa10_im TEMPA3.y
#define tempa11_re TEMPA4.x
#define tempa11_im TEMPA4.y
#define tempa12_re TEMPA5.x
#define tempa12_im TEMPA5.y
#define tempa20_re TEMPA6.x
#define tempa20_im TEMPA6.y
#define tempa21_re TEMPA7.x
#define tempa21_im TEMPA7.y
#define tempa22_re TEMPA8.x
#define tempa22_im TEMPA8.y

#define tempb00_re TEMPB0.x
#define tempb00_im TEMPB0.y
#define tempb01_re TEMPB1.x
#define tempb01_im TEMPB1.y
#define tempb02_re TEMPB2.x
#define tempb02_im TEMPB2.y
#define tempb10_re TEMPB3.x
#define tempb10_im TEMPB3.y
#define tempb11_re TEMPB4.x
#define tempb11_im TEMPB4.y
#define tempb12_re TEMPB5.x
#define tempb12_im TEMPB5.y
#define tempb20_re TEMPB6.x
#define tempb20_im TEMPB6.y
#define tempb21_re TEMPB7.x
#define tempb21_im TEMPB7.y
#define tempb22_re TEMPB8.x
#define tempb22_im TEMPB8.y

/* Elements of the fat-link accumulator matrix. */
#define fat00_re FAT0.x
#define fat00_im FAT0.y
#define fat01_re FAT1.x
#define fat01_im FAT1.y
#define fat02_re FAT2.x
#define fat02_im FAT2.y
#define fat10_re FAT3.x
#define fat10_im FAT3.y
#define fat11_re FAT4.x
#define fat11_im FAT4.y
#define fat12_re FAT5.x
#define fat12_im FAT5.y
#define fat20_re FAT6.x
#define fat20_im FAT6.y
#define fat21_re FAT7.x
#define fat21_im FAT7.y
#define fat22_re FAT8.x
#define fat22_im FAT8.y

/* Number of matrix elements spilled to shared memory per thread (TEMPA0-4). */
#define NUM_FLOATS 5
/* The first NUM_FLOATS (=5) elements of TEMPA live in shared memory to
   reduce register pressure; TEMPA5..TEMPA8 are registers declared inside
   each kernel body. */
#define TEMPA0 sd_data[threadIdx.x + 0*blockDim.x]
#define TEMPA1 sd_data[threadIdx.x + 1*blockDim.x ]
#define TEMPA2 sd_data[threadIdx.x + 2*blockDim.x ]
#define TEMPA3 sd_data[threadIdx.x + 3*blockDim.x ]
#define TEMPA4 sd_data[threadIdx.x + 4*blockDim.x ]


#undef UPDATE_COOR_PLUS
#undef UPDATE_COOR_MINUS
#undef UPDATE_COOR_LOWER_STAPLE
#undef UPDATE_COOR_LOWER_STAPLE_DIAG
#undef UPDATE_COOR_LOWER_STAPLE_EX
#undef COMPUTE_RECONSTRUCT_SIGN
/* With 12-real reconstruction the shifted coordinates (new_x1,new_x2,new_x4)
   must be tracked so the staggered phase sign can be recomputed at the new
   site; x3/new_x3 is intentionally never updated (case 2 is empty — the sign
   formulas below never depend on i3). */
#if (RECONSTRUCT != 18)
/* Coordinates after a +1 step in direction mydir. */
#define UPDATE_COOR_PLUS(mydir, idx) do { \
    new_x1 = x1; new_x2 = x2; new_x4 = x4; \
    switch(mydir){ \
    case 0: \
      new_x1 = x1+1; \
      break; \
    case 1: \
      new_x2 = x2+1; \
      break; \
    case 2: \
      break; \
    case 3: \
      new_x4 = x4+1; \
      break; \
    } \
  }while(0)

/* Coordinates after a -1 step in direction mydir. */
#define UPDATE_COOR_MINUS(mydir, idx) do { \
    new_x1 = x1; new_x2 = x2; new_x4 = x4; \
    switch(mydir){ \
    case 0: \
      new_x1 = x1-1; \
      break; \
    case 1: \
      new_x2 = x2-1; \
      break; \
    case 2: \
      break; \
    case 3: \
      new_x4 = x4-1; \
      break; \
    } \
  }while(0)

/* Coordinates for the lower-staple corner: -1 in mydir1 then +1 in mydir2
   (interior path), or the reverse update order when the -1 step in mydir1
   crosses the partitioned boundary. */
#define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2) do { \
    new_x1 = x1; new_x2 = x2; new_x4 = x4; \
    if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
      switch(mydir1){ \
      case 0: \
        new_x1 = x1 - 1; \
        break; \
      case 1: \
        new_x2 = x2 - 1; \
        break; \
      case 2: \
        break; \
      case 3: \
        new_x4 = x4 - 1; \
        break; \
      } \
      switch(mydir2){ \
      case 0: \
        new_x1 = x1+1; \
        break; \
      case 1: \
        new_x2 = x2+1; \
        break; \
      case 2: \
        break; \
      case 3: \
        new_x4 = x4+1; \
        break; \
      } \
    }else{ \
      /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
      switch(mydir2){ \
      case 0: \
        new_x1 = x1+1; \
        break; \
      case 1: \
        new_x2 = x2+1; \
        break; \
      case 2: \
        break; \
      case 3: \
        new_x4 = x4+1; \
        break; \
      } \
      switch(mydir1){/*mydir1 is 0 here */ \
      case 0: \
        new_x1 = x1-1; \
        break; \
      case 1: \
        new_x2 = x2-1; \
        break; \
      case 2: \
        break; \
      case 3: \
        new_x4 = x4-1; \
        break; \
      } \
    } \
  }while(0)


/* Diagonal (corner) ghost case: site is at -1 in nu and one past the end in
   mu; encoded as new_x[nu] = -1, new_x[mu] = 0. */
#define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
    int new_x[4]; \
    new_x[3] = x4; new_x[1] = x2; new_x[0] = x1; \
    new_x[nu] = -1; \
    new_x[mu] = 0; \
    new_x1 = new_x[0]; \
    new_x2 = new_x[1]; \
    new_x4 = new_x[3]; \
  }while(0)

/* Extended-volume variant: no boundary handling needed, coordinates simply
   step -1 in mydir1 and +1 in mydir2. */
#define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2) do { \
    new_x1 = x1; new_x2 = x2; new_x4 = x4; \
    switch(mydir1){ \
    case 0: \
      new_x1 = x1 - 1; \
      break; \
    case 1: \
      new_x2 = x2 - 1; \
      break; \
    case 2: \
      break; \
    case 3: \
      new_x4 = x4 - 1; \
      break; \
    } \
    switch(mydir2){ \
    case 0: \
      new_x1 = x1+1; \
      break; \
    case 1: \
      new_x2 = x2+1; \
      break; \
    case 2: \
      break; \
    case 3: \
      new_x4 = x4+1; \
      break; \
    } \
  }while(0)

/* Staggered phase sign at coordinates (i1,i2,i3,i4) for link direction dir.
   Note i3 is never used; the T-direction sign also accounts for the
   antiperiodic boundary on the first/last process in the time dimension. */
#define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
    sign =1; \
    switch(dir){ \
    case XUP: \
      if ( (i4 & 1) != 0){ \
        sign = -1; \
      } \
      break; \
    case YUP: \
      if ( ((i4+i1) & 1) != 0){ \
        sign = -1; \
      } \
      break; \
    case ZUP: \
      if ( ((i4+i1+i2) & 1) != 0){ \
        sign = -1; \
      } \
      break; \
    case TUP: \
      if (i4 == X4m1 && last_proc_in_tdim){ \
        sign = -1; \
      }else if(i4 == -1 && first_proc_in_tdim){ \
        sign = -1; \
      } \
      break; \
    } \
  }while(0)


#else

/* RECONSTRUCT == 18: full matrices are stored, so no sign/coordinate
   bookkeeping is needed — all helpers expand to nothing. */
#define UPDATE_COOR_PLUS(mydir, idx)
#define UPDATE_COOR_MINUS(mydir, idx)
#define UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2)
#define UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2)
#define COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4)
#define UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2)
#endif

#ifdef MULTI_GPU

/* Checkerboarded memory index of the +1 neighbor in direction mydir.
   When the site is on the forward boundary of a partitioned dimension the
   index points into that dimension's ghost zone (the *comm/(1-*comm) factors
   select between the ghost-zone and wrap-around forms without branching). */
#define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = (x1==X1m1)? ((Vh+Vsh_x+ spacecon_x)*xcomm+(idx - X1m1)/2*(1-xcomm)):((idx+1)>>1); \
      break; \
    case 1: \
      new_mem_idx = (x2==X2m1)? ((Vh+2*(Vsh_x)+Vsh_y+ spacecon_y)*ycomm+(idx-X2X1mX1)/2*(1-ycomm)):((idx+X1)>>1); \
      break; \
    case 2: \
      new_mem_idx = (x3==X3m1)? ((Vh+2*(Vsh_x+Vsh_y)+Vsh_z+ spacecon_z))*zcomm+(idx-X3X2X1mX2X1)/2*(1-zcomm):((idx+X2X1)>>1); \
      break; \
    case 3: \
      new_mem_idx = ( (x4==X4m1)? ((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t+spacecon_t))*tcomm+(idx-X4X3X2X1mX3X2X1)/2*(1-tcomm): (idx+X3X2X1)>>1); \
      break; \
    } \
    UPDATE_COOR_PLUS(mydir, idx); \
  }while(0)



/* Checkerboarded memory index of the -1 neighbor in direction mydir, with
   the analogous back ghost-zone handling. */
#define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = (x1==0)?( (Vh+spacecon_x)*xcomm+(idx+X1m1)/2*(1-xcomm)):((idx-1) >> 1); \
      break; \
    case 1: \
      new_mem_idx = (x2==0)?( (Vh+2*Vsh_x+spacecon_y)*ycomm+(idx+X2X1mX1)/2*(1-ycomm)):((idx-X1) >> 1); \
      break; \
    case 2: \
      new_mem_idx = (x3==0)?((Vh+2*(Vsh_x+Vsh_y)+spacecon_z)*zcomm+(idx+X3X2X1mX2X1)/2*(1-zcomm)):((idx-X2X1) >> 1); \
      break; \
    case 3: \
      new_mem_idx = (x4==0)?((Vh+2*(Vsh_x+Vsh_y+Vsh_z)+ spacecon_t)*tcomm + (idx+X4X3X2X1mX3X2X1)/2*(1-tcomm)):((idx-X3X2X1) >> 1); \
      break; \
    } \
    UPDATE_COOR_MINUS(mydir, idx); \
  }while(0)


/* Index of the lower-staple corner site: -1 in mydir1 then +1 in mydir2,
   with ghost-zone redirection when either step crosses a partitioned
   boundary (the doubly-out-of-bounds case is handled by the _DIAG macro). */
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
    int local_new_x1=x1; \
    int local_new_x2=x2; \
    int local_new_x3=x3; \
    int local_new_x4=x4; \
    new_mem_idx=X; \
    if(dimcomm[mydir1] == 0 || x[mydir1] > 0){ \
      switch(mydir1){/*mydir1 is not partitioned or x[mydir1]!= 0*/ \
      case 0: \
        new_mem_idx = (x1==0)?(new_mem_idx+X1m1):(new_mem_idx-1); \
        local_new_x1 = (x1==0)?X1m1:(x1 - 1); \
        break; \
      case 1: \
        new_mem_idx = (x2==0)?(new_mem_idx+X2X1mX1):(new_mem_idx-X1); \
        local_new_x2 = (x2==0)?X2m1:(x2 - 1); \
        break; \
      case 2: \
        new_mem_idx = (x3==0)?(new_mem_idx+X3X2X1mX2X1):(new_mem_idx-X2X1); \
        local_new_x3 = (x3==0)?X3m1:(x3 -1); \
        break; \
      case 3: \
        new_mem_idx = (x4==0)?(new_mem_idx+X4X3X2X1mX3X2X1):(new_mem_idx-X3X2X1); \
        local_new_x4 = (x4==0)?X4m1:(x4 - 1); \
        break; \
      } \
      switch(mydir2){ \
      case 0: \
        new_mem_idx = (x1==X1m1)?(2*(Vh+Vsh_x)+((local_new_x4*X3X2+local_new_x3*X2+local_new_x2)))*xcomm+(new_mem_idx-X1m1)*(1-xcomm):(new_mem_idx+1); \
        break; \
      case 1: \
        new_mem_idx = (x2==X2m1)?(2*(Vh+2*(Vsh_x)+Vsh_y)+((local_new_x4*X3X1+local_new_x3*X1+local_new_x1)))*ycomm+(new_mem_idx-X2X1mX1)*(1-ycomm):(new_mem_idx+X1); \
        break; \
      case 2: \
        new_mem_idx = (x3==X3m1)?(2*(Vh+2*(Vsh_x+Vsh_y)+Vsh_z)+((local_new_x4*X2X1+local_new_x2*X1+local_new_x1)))*zcomm+(new_mem_idx-X3X2X1mX2X1)*(1-zcomm):(new_mem_idx+X2X1); \
        break; \
      case 3: \
        new_mem_idx = (x4==X4m1)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z)+Vsh_t)+((local_new_x3*X2X1+local_new_x2*X1+local_new_x1)))*tcomm+(new_mem_idx-X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx+X3X2X1); \
        break; \
      } \
    }else{ \
      /*the case where both dir1/dir2 are out of boundary are dealed with a different macro (_DIAG)*/ \
      switch(mydir2){ /*mydir2 is not partitioned or x[mydir2]!= 0*/ \
      case 0: \
        new_mem_idx = (x1==X1m1)?(new_mem_idx-X1m1):(new_mem_idx+1); \
        local_new_x1 = (x1==X1m1)?0:(x1+1); \
        break; \
      case 1: \
        new_mem_idx = (x2==X2m1)?(new_mem_idx-X2X1mX1):(new_mem_idx+X1); \
        local_new_x2 = (x2==X2m1)?0:(x2+1); \
        break; \
      case 2: \
        new_mem_idx = (x3==X3m1)?(new_mem_idx-X3X2X1mX2X1):(new_mem_idx+X2X1); \
        local_new_x3 = (x3==X3m1)?0:(x3+1); \
        break; \
      case 3: \
        new_mem_idx = (x4==X4m1)?(new_mem_idx-X4X3X2X1mX3X2X1):(new_mem_idx+X3X2X1); \
        local_new_x4 = (x4==X4m1)?0:(x4+1); \
        break; \
      } \
      switch(mydir1){/*mydir1 is 0 here */ \
      case 0: \
        new_mem_idx = (x1==0)?(2*(Vh)+(local_new_x4*X3X2+local_new_x3*X2+local_new_x2))*xcomm+(new_mem_idx+X1m1)*(1-xcomm):(new_mem_idx -1); \
        break; \
      case 1: \
        new_mem_idx = (x2==0)?(2*(Vh+2*Vsh_x)+(local_new_x4*X3X1+local_new_x3*X1+local_new_x1))*ycomm+(new_mem_idx+X2X1mX1)*(1-ycomm):(new_mem_idx-X1); \
        break; \
      case 2: \
        new_mem_idx = (x3==0)?(2*(Vh+2*(Vsh_x+Vsh_y))+(local_new_x4*X2X1+local_new_x2*X1+local_new_x1))*zcomm+(new_mem_idx+X3X2X1mX2X1)*(1-zcomm):(new_mem_idx-X2X1); \
        break; \
      case 3: \
        new_mem_idx = (x4==0)?(2*(Vh+2*(Vsh_x+Vsh_y+Vsh_z))+(local_new_x3*X2X1+local_new_x2*X1+local_new_x1))*tcomm+(new_mem_idx+X4X3X2X1mX3X2X1)*(1-tcomm):(new_mem_idx-X3X2X1); \
        break; \
      } \
    } \
    new_mem_idx = new_mem_idx >> 1; \
    UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
  }while(0)




/* Corner (diagonal) ghost-zone index, stored after all face ghost zones. */
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2) do { \
    new_mem_idx = Vh+2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + mu*Vh_2d_max + ((x[dir2]*Z[dir1] + x[dir1])>>1); \
    UPDATE_COOR_LOWER_STAPLE_DIAG(nu, mu, dir1, dir2); \
  }while(0)


#else

/* Single-GPU versions: plain periodic wrap-around, no ghost zones. */
#define LLFAT_COMPUTE_NEW_IDX_PLUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==X1m1)?idx-X1m1:idx+1)>>1; \
      break; \
    case 1: \
      new_mem_idx = ( (x2==X2m1)?idx-X2X1mX1:idx+X1)>>1; \
      break; \
    case 2: \
      new_mem_idx = ( (x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1)>>1; \
      break; \
    case 3: \
      new_mem_idx = ( (x4==X4m1)?idx-X4X3X2X1mX3X2X1: idx+X3X2X1)>>1; \
      break; \
    } \
    UPDATE_COOR_PLUS(mydir, idx); \
  }while(0)


#define LLFAT_COMPUTE_NEW_IDX_MINUS(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = ( (x1==0)?idx+X1m1:idx-1) >> 1; \
      break; \
    case 1: \
      new_mem_idx = ( (x2==0)?idx+X2X1mX1:idx-X1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = ( (x3==0)?idx+X3X2X1mX2X1:idx-X2X1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = ( (x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1) >> 1; \
      break; \
    } \
    UPDATE_COOR_MINUS(mydir, idx); \
  }while(0)


#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(mydir1, mydir2) do { \
    switch(mydir1){ \
    case 0: \
      new_mem_idx = ( (x1==0)?X+X1m1:X-1); \
      break; \
    case 1: \
      new_mem_idx = ( (x2==0)?X+X2X1mX1:X-X1); \
      break; \
    case 2: \
      new_mem_idx = ( (x3==0)?X+X3X2X1mX2X1:X-X2X1); \
      break; \
    case 3: \
      new_mem_idx = ((x4==0)?X+X4X3X2X1mX3X2X1:X-X3X2X1); \
      break; \
    } \
    switch(mydir2){ \
    case 0: \
      new_mem_idx = ( (x1==X1m1)?new_mem_idx-X1m1:new_mem_idx+1)>> 1; \
      break; \
    case 1: \
      new_mem_idx = ( (x2==X2m1)?new_mem_idx-X2X1mX1:new_mem_idx+X1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = ( (x3==X3m1)?new_mem_idx-X3X2X1mX2X1:new_mem_idx+X2X1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = ( (x4==X4m1)?new_mem_idx-X4X3X2X1mX3X2X1:new_mem_idx+X3X2X1) >> 1; \
      break; \
    } \
    UPDATE_COOR_LOWER_STAPLE(mydir1, mydir2); \
  }while(0)

#endif


/* Extended-volume (+1 neighbor) index: the extended lattice (E1,E2E1,...)
   has a border, so no boundary tests are needed. */
#define LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx =
(idx+1)>>1; \
      break; \
    case 1: \
      new_mem_idx = (idx+E1)>>1; \
      break; \
    case 2: \
      new_mem_idx = (idx+E2E1)>>1; \
      break; \
    case 3: \
      new_mem_idx = (idx+E3E2E1)>>1; \
      break; \
    } \
    UPDATE_COOR_PLUS(mydir, idx); \
  }while(0)

/* Extended-volume (-1 neighbor) index. */
#define LLFAT_COMPUTE_NEW_IDX_MINUS_EX(mydir, idx) do { \
    switch(mydir){ \
    case 0: \
      new_mem_idx = (idx-1) >> 1; \
      break; \
    case 1: \
      new_mem_idx = (idx-E1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = (idx-E2E1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = (idx-E3E2E1) >> 1; \
      break; \
    } \
    UPDATE_COOR_MINUS(mydir, idx); \
  }while(0)


/* Extended-volume lower-staple corner index: -1 in mydir1 then +1 in mydir2. */
#define LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(mydir1, mydir2) do { \
    switch(mydir1){ \
    case 0: \
      new_mem_idx = X-1; \
      break; \
    case 1: \
      new_mem_idx = X-E1; \
      break; \
    case 2: \
      new_mem_idx = X-E2E1; \
      break; \
    case 3: \
      new_mem_idx = X-E3E2E1; \
      break; \
    } \
    switch(mydir2){ \
    case 0: \
      new_mem_idx = (new_mem_idx+1)>> 1; \
      break; \
    case 1: \
      new_mem_idx = (new_mem_idx+E1) >> 1; \
      break; \
    case 2: \
      new_mem_idx = (new_mem_idx+E2E1) >> 1; \
      break; \
    case 3: \
      new_mem_idx = (new_mem_idx+E3E2E1) >> 1; \
      break; \
    } \
    UPDATE_COOR_LOWER_STAPLE_EX(mydir1, mydir2); \
  }while(0)




/* Computes, for each even-parity site, the (mu,nu) staple built from site
   links, optionally accumulates mycoeff*staple into the fat link, and writes
   the staple field.  Name and layout depend on RECONSTRUCT via LLFAT_KERNEL. */
template<int mu, int nu, int odd_bit>
__global__ void
LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
                                                         FloatN* sitelink_even, FloatN* sitelink_odd,
                                                         FloatM* fatlink_even, FloatM* fatlink_odd,
                                                         Float mycoeff, llfat_kernel_param_t kparam)
{
  /* Shared-memory backing store for TEMPA0..TEMPA4 (see NUM_FLOATS). */
  __shared__ FloatM sd_data[NUM_FLOATS*64];

  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
/* Register temporaries (TEMPA0..4 live in shared memory, see macros above). */
  FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
  //FloatM STAPLE6, STAPLE7, STAPLE8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  /* Decode checkerboard index into lattice coordinates (x1,x2,x3,x4). */
  int z1 = mem_idx / X1h;
  short x1h = mem_idx - z1*X1h;
  int z2 = z1 / X2;
  short x2 = z1 - z2*X2;
  short x4 = z2 / X3;
  short x3 = z2 - x4*X3;

  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;
  int X = 2*mem_idx + x1odd;       /* full (non-checkerboarded) site index */

  /* Exterior kernels only process the corresponding boundary slice. */
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;

  int new_mem_idx;
  DECLARE_VAR_SIGN;
  DECLARE_NEW_X;
  DECLARE_X_ARRAY;

  //int x[4] = {x1,x2,x3, x4};
#ifdef MULTI_GPU
  int Z[4] ={X1,X2,X3,X4};
  /* Checkerboarded 3-d "space-cone" offsets into each face's ghost zone. */
  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
#endif

  /* Upper staple */
  /* Computes the staple :
   *                 mu (B)
   *               +-------+
   *       nu	   |       |
   *	     (A)   |       |(C)
   *		   X       X
   *
   */

  {
    /* load matrix A*/
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(sign, a);


    /* load matrix B*/
    LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
    LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, b);


    MULT_SU3_NN(a, b, tempa);

    /* load matrix C*/

    LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, c);

    /* staple = A * B * C^dagger */
    MULT_SU3_NA(tempa, c, staple);
  }

  /***************lower staple****************
   *
   *                 X       X
   *           nu    |       |
   *            (A)  |       | (C)
   *                 +-------+
   *                   mu (B)
   *
   *********************************************/
  {
    /* load matrix A*/
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu,X);

    LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix B (same site as A, i.e. one step back in nu) */
    LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, b);

    /* tempa = A^dagger * B */
    MULT_SU3_AN(a, b, tempa);

    /* load matrix C*/
    //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
#ifdef MULTI_GPU
    /* Both steps leave the local volume: corner (diagonal) ghost zone. */
    if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
      int idx = nu*4+mu;
      LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
    }else{
      LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
    }
#else
    LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
#endif

    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);

    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, c);


    /* staple += A^dagger * B * C  (b reused as scratch) */
    MULT_SU3_NN(tempa, c, b);
    LLFAT_ADD_SU3_MATRIX(b, staple, staple);
  }

  /* Interior kernel also folds mycoeff*staple into the fat link. */
  if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
    LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
    WRITE_FAT_MATRIX(fatlink_even,mu, mem_idx);
  }
  WRITE_STAPLE_MATRIX(staple_even, mem_idx);

  return;
}

/* Like do_siteComputeGenStapleParity, but the "horizontal" link of the
   staple comes from a previously computed mulink field (BB) rather than
   from the site links; optionally writes the resulting staple
   (save_staple) and/or accumulates it into the fat link. */
template<int mu, int nu, int odd_bit, int save_staple>
__global__ void
LLFAT_KERNEL(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
                                                         FloatN* sitelink_even, FloatN* sitelink_odd,
                                                         FloatM* fatlink_even, FloatM* fatlink_odd,
                                                         FloatM* mulink_even, FloatM* mulink_odd,
                                                         Float mycoeff, llfat_kernel_param_t kparam)
{
  /* Shared-memory backing store for TEMPA0..TEMPA4 (see NUM_FLOATS). */
  __shared__ FloatM sd_data[NUM_FLOATS*64];
  //FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM TEMPB0, TEMPB1, TEMPB2, TEMPB3, TEMPB4, TEMPB5, TEMPB6, TEMPB7, TEMPB8;
  FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;
  //FloatM STAPLE6, STAPLE7, STAPLE8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  /* Decode checkerboard index into lattice coordinates. */
  int z1 = mem_idx / X1h;
  int x1h = mem_idx - z1*X1h;
  int z2 = z1 / X2;
  int x2 = z1 - z2*X2;
  int x4 = z2 / X3;
  int x3 = z2 - x4*X3;

  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  int x1 = 2*x1h + x1odd;
  int X = 2*mem_idx + x1odd;

  /* Exterior kernels only process the corresponding boundary slice. */
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_X && x1 != X1m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_X && x1 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Y && x2 != X2m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Y && x2 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_Z && x3 != X3m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_Z && x3 != 0) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_FWD_T && x4 != X4m1) return;
  if(kparam.kernel_type == LLFAT_EXTERIOR_KERNEL_BACK_T && x4 != 0) return;
/* interior of do_computeGenStapleFieldParity continues */
  DECLARE_X_ARRAY;
#ifdef MULTI_GPU
  int Z[4] ={X1,X2,X3,X4};
  /* Checkerboarded 3-d offsets into each face's ghost zone. */
  int spacecon_x = (x4*X3X2+x3*X2+x2)>>1;
  int spacecon_y = (x4*X3X1+x3*X1+x1)>>1;
  int spacecon_z = (x4*X2X1+x2*X1+x1)>>1;
  int spacecon_t = (x3*X2X1+x2*X1+x1)>>1;
#endif

  int new_mem_idx;
  DECLARE_VAR_SIGN;
  DECLARE_NEW_X;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (BB)
   *               +-------+
   *       nu	   |       |
   *	     (A)   |       |(C)
   *		   X       X
   *
   */
  {
    /* load matrix A*/
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix BB (mulink field — already a full matrix, no
       reconstruction needed) */
    LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_NN(a, bb, tempa);

    /* load matrix C*/
    LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, c);
    /* upper staple goes to 'staple' when it will be written out, otherwise
       accumulates in tempb */
    if (save_staple){
      MULT_SU3_NA(tempa, c, staple);
    }else{
      MULT_SU3_NA(tempa, c, tempb);
    }
  }

  /***************lower staple****************
   *
   *                 X       X
   *           nu    |       |
   *            (A)  |       | (C)
   *                 +-------+
   *                   mu (B)
   *
   *********************************************/


  {
    /* load matrix A*/
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);

    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, a);

    /* load matrix B (mulink at the site one step back in nu) */
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_AN(a, bb, tempa);

    /* load matrix C*/
    //if(x[nu] == 0 && x[mu] == Z[mu] - 1){
#ifdef MULTI_GPU
    /* Both steps leave the local volume: corner (diagonal) ghost zone. */
    if(dimcomm[nu] && dimcomm[mu] && x[nu] == 0 && x[mu] == Z[mu] - 1){
      int idx = nu*4+mu;
      LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_DIAG(nu, mu, dir1_array[idx], dir2_array[idx]);
    }else{
      LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
    }
#else
    LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
#endif

    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(sign, c);

    /* lower staple contribution, a reused as scratch */
    MULT_SU3_NN(tempa, c, a);
    if(save_staple){
      LLFAT_ADD_SU3_MATRIX(staple, a, staple);
    }else{
      LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
    }
  }

  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
  if(save_staple){
    if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
      SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
    }
    WRITE_STAPLE_MATRIX(staple_even, mem_idx);
  }else{
    if(kparam.kernel_type == LLFAT_INTERIOR_KERNEL){
      SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat);
    }else{
      //The code should never be here
      //because it makes no sense to split kernels when no staple is stored
      //print error?
/* close out do_computeGenStapleFieldParity */
    }
  }

  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);

  return;
}

/* Initializes the fat link with the one-link term:
   fat = (coeff0 - 6*coeff5) * sitelink, for all four directions and both
   parities (one thread per site over the full volume 2*Vh). */
__global__ void
LLFAT_KERNEL(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
                                        FloatM* fatlink_even, FloatM* fatlink_odd,
                                        Float coeff0, Float coeff5)
{
  FloatN* my_sitelink;
  FloatM* my_fatlink;
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  int mem_idx = sid;

#if (RECONSTRUCT != 18)
  int odd_bit= 0;
#endif

  /* Threads in the upper half of the grid handle the odd parity. */
  my_sitelink = sitelink_even;
  my_fatlink = fatlink_even;
  if (mem_idx >= Vh){
#if (RECONSTRUCT != 18)
    odd_bit=1;
#endif
    mem_idx = mem_idx - Vh;
    my_sitelink = sitelink_odd;
    my_fatlink = fatlink_odd;
  }

#if (RECONSTRUCT != 18)
  /* Coordinates are only needed to recompute the staggered phase sign. */
  int z1 = mem_idx / X1h;
  int x1h = mem_idx - z1*X1h;
  int z2 = z1 / X2;
  int x2 = z1 - z2*X2;
  int x4 = z2 / X3;
  int x3 = z2 - x4*X3;
  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  int x1 = 2*x1h + x1odd;
  DECLARE_VAR_SIGN;
#endif

  for(int dir=0;dir < 4; dir++){
    LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(sign, a);

    LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);

    SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);

    WRITE_FAT_MATRIX(my_fatlink,dir, mem_idx);
  }

  return;
}




/* Extended-volume (_EX) variant of do_siteComputeGenStapleParity: operates
   on a lattice padded by a 2-site border, so neighbor indexing needs no
   boundary tests; fat-link accumulation is skipped on the outermost border
   layer. */
template<int mu, int nu, int odd_bit>
__global__ void
LLFAT_KERNEL_EX(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
                                                            FloatN* sitelink_even, FloatN* sitelink_odd,
                                                            FloatM* fatlink_even, FloatM* fatlink_odd,
                                                            Float mycoeff, llfat_kernel_param_t kparam)
{
#if 1
  extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu


  //FloatM TEMPA0,
TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8; 01120 FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8; 01121 FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8; 01122 01123 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x; 01124 if(mem_idx >= kparam.threads) return; 01125 01126 int z1 = mem_idx/D1h; 01127 short x1h = mem_idx - z1*D1h; 01128 int z2 = z1/D2; 01129 short x2 = z1 - z2*D2; 01130 short x4 = z2/D3; 01131 short x3 = z2 - x4*D3; 01132 01133 short x1odd = (x2 + x3 + x4 + odd_bit) & 1; 01134 short x1 = 2*x1h + x1odd; 01135 01136 x1 += kparam.base_idx; 01137 x2 += kparam.base_idx; 01138 x3 += kparam.base_idx; 01139 x4 += kparam.base_idx; 01140 int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1; 01141 mem_idx = X/2; 01142 01143 int new_mem_idx; 01144 DECLARE_VAR_SIGN; 01145 DECLARE_NEW_X; 01146 01147 /* Upper staple */ 01148 /* Computes the staple : 01149 * mu (B) 01150 * +-------+ 01151 * nu | | 01152 * (A) | |(C) 01153 * X X 01154 * 01155 */ 01156 01157 { 01158 /* load matrix A*/ 01159 LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A); 01160 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2)); 01161 RECONSTRUCT_SITE_LINK(sign, a); 01162 01163 01164 01165 /* load matrix B*/ 01166 LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, X); 01167 LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B); 01168 COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01169 RECONSTRUCT_SITE_LINK(sign, b); 01170 01171 01172 MULT_SU3_NN(a, b, tempa); 01173 01174 /* load matrix C*/ 01175 01176 LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mu, X); 01177 LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C); 01178 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01179 RECONSTRUCT_SITE_LINK(sign, c); 01180 01181 MULT_SU3_NA(tempa, c, staple); 01182 01183 } 01184 01185 /***************lower staple**************** 01186 * 01187 * X X 01188 * nu | | 01189 * (A) | | (C) 01190 * +-------+ 01191 * mu (B) 01192 * 01193 
*********************************************/ 01194 { 01195 /* load matrix A*/ 01196 LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu,X); 01197 01198 LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A); 01199 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01200 RECONSTRUCT_SITE_LINK(sign, a); 01201 01202 /* load matrix B*/ 01203 LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B); 01204 COMPUTE_RECONSTRUCT_SIGN(sign, mu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01205 RECONSTRUCT_SITE_LINK(sign, b); 01206 01207 MULT_SU3_AN(a, b, tempa); 01208 01209 01210 01211 /* load matrix C*/ 01212 LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(nu, mu); 01213 LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C); 01214 01215 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01216 RECONSTRUCT_SITE_LINK(sign, c); 01217 01218 01219 MULT_SU3_NN(tempa, c, b); 01220 LLFAT_ADD_SU3_MATRIX(b, staple, staple); 01221 01222 } 01223 01224 01225 if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2 01226 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){ 01227 int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1; 01228 01229 LOAD_EVEN_FAT_MATRIX(mu, orig_idx); 01230 SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat); 01231 WRITE_FAT_MATRIX(fatlink_even,mu, orig_idx); 01232 } 01233 WRITE_STAPLE_MATRIX(staple_even, mem_idx); 01234 01235 #endif 01236 01237 return; 01238 } 01239 01240 template<int mu, int nu, int odd_bit, int save_staple> 01241 __global__ void 01242 LLFAT_KERNEL_EX(do_computeGenStapleFieldParity,RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd, 01243 FloatN* sitelink_even, FloatN* sitelink_odd, 01244 FloatM* fatlink_even, FloatM* fatlink_odd, 01245 FloatM* mulink_even, FloatM* mulink_odd, 01246 Float mycoeff, llfat_kernel_param_t kparam) 01247 { 01248 #if 1 01249 //__shared__ FloatM sd_data[NUM_FLOATS*64]; 01250 extern __shared__ FloatM sd_data[]; //sd_data is a macro name defined in llfat_quda.cu 01251 //FloatM TEMPA0, 
TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8; 01252 FloatM TEMPA5, TEMPA6, TEMPA7, TEMPA8; 01253 FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8; 01254 01255 int mem_idx = blockIdx.x*blockDim.x + threadIdx.x; 01256 if(mem_idx >= kparam.threads) return; 01257 01258 int z1 = mem_idx/D1h; 01259 short x1h = mem_idx - z1*D1h; 01260 int z2 = z1/D2; 01261 short x2 = z1 - z2*D2; 01262 short x4 = z2/D3; 01263 short x3 = z2 - x4*D3; 01264 01265 short x1odd = (x2 + x3 + x4 + odd_bit) & 1; 01266 short x1 = 2*x1h + x1odd; 01267 01268 x1 += kparam.base_idx; 01269 x2 += kparam.base_idx; 01270 x3 += kparam.base_idx; 01271 x4 += kparam.base_idx; 01272 int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1; 01273 mem_idx = X/2; 01274 01275 int new_mem_idx; 01276 DECLARE_VAR_SIGN; 01277 DECLARE_NEW_X; 01278 01279 /* Upper staple */ 01280 /* Computes the staple : 01281 * mu (BB) 01282 * +-------+ 01283 * nu | | 01284 * (A) | |(C) 01285 * X X 01286 * 01287 */ 01288 { 01289 /* load matrix A*/ 01290 LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A); 01291 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (x1-2), (x2-2), (x3-2), (x4-2)); 01292 RECONSTRUCT_SITE_LINK(sign, a); 01293 01294 /* load matrix BB*/ 01295 LLFAT_COMPUTE_NEW_IDX_PLUS_EX(nu, X); 01296 LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB); 01297 MULT_SU3_NN(a, bb, tempa); 01298 01299 /* load matrix C*/ 01300 LLFAT_COMPUTE_NEW_IDX_PLUS_EX(mu, X); 01301 LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C); 01302 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01303 RECONSTRUCT_SITE_LINK(sign, c); 01304 01305 MULT_SU3_NA(tempa, c, staple); 01306 } 01307 01308 /***************lower staple**************** 01309 * 01310 * X X 01311 * nu | | 01312 * (A) | | (C) 01313 * +-------+ 01314 * mu (B) 01315 * 01316 *********************************************/ 01317 01318 { 01319 /* load matrix A*/ 01320 LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu, X); 01321 01322 LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A); 01323 
COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01324 RECONSTRUCT_SITE_LINK(sign, a); 01325 01326 /* load matrix B*/ 01327 LLFAT_COMPUTE_NEW_IDX_MINUS_EX(nu, X); 01328 LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB); 01329 01330 MULT_SU3_AN(a, bb, tempa); 01331 01332 /* load matrix C*/ 01333 01334 LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE_EX(nu, mu); 01335 01336 LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C); 01337 COMPUTE_RECONSTRUCT_SIGN(sign, nu, (new_x1-2), (new_x2-2), (new_x3-2), (new_x4-2)); 01338 RECONSTRUCT_SITE_LINK(sign, c); 01339 01340 MULT_SU3_NN(tempa, c, a); 01341 01342 LLFAT_ADD_SU3_MATRIX(a, staple, staple); 01343 } 01344 01345 01346 if( !(x1 == 1 || x1 == X1 + 2 || x2 == 1 || x2 == X2 + 2 01347 || x3 == 1 || x3 == X3 + 2 || x4 == 1 || x4 == X4 + 2)){ 01348 int orig_idx = ((x4-2)* X3X2X1 + (x3-2)*X2X1 + (x2-2)*X1 + (x1-2))>>1; 01349 LOAD_EVEN_FAT_MATRIX(mu, orig_idx); 01350 SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat); 01351 WRITE_FAT_MATRIX(fatlink_even, mu, orig_idx); 01352 } 01353 01354 if(save_staple){ 01355 WRITE_STAPLE_MATRIX(staple_even, mem_idx); 01356 } 01357 #endif 01358 01359 return; 01360 } 01361 01362 01363 __global__ void 01364 LLFAT_KERNEL_EX(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd, 01365 FloatM* fatlink_even, FloatM* fatlink_odd, 01366 Float coeff0, Float coeff5, llfat_kernel_param_t kparam) 01367 { 01368 #if 1 01369 01370 FloatN* my_sitelink; 01371 FloatM* my_fatlink; 01372 int sid = blockIdx.x*blockDim.x + threadIdx.x; 01373 int idx = sid; 01374 01375 if(sid >= 2*kparam.threads) return; 01376 01377 short odd_bit= 0; 01378 01379 my_sitelink = sitelink_even; 01380 my_fatlink = fatlink_even; 01381 if (idx >= kparam.threads){ 01382 odd_bit=1; 01383 idx = idx - kparam.threads; 01384 my_sitelink = sitelink_odd; 01385 my_fatlink = fatlink_odd; 01386 } 01387 01388 int z1 = idx/D1h; 01389 short x1h = idx - z1*D1h; 01390 int z2 = z1/D2; 01391 short x2 = z1 - z2*D2; 01392 int x4 = 
z2/D3; 01393 short x3 = z2 - x4*D3; 01394 short x1odd = (x2 + x3 + x4 + odd_bit) & 1; 01395 short x1 = 2*x1h + x1odd; 01396 DECLARE_VAR_SIGN; 01397 01398 x1 += kparam.base_idx; 01399 x2 += kparam.base_idx; 01400 x3 += kparam.base_idx; 01401 x4 += kparam.base_idx; 01402 int X = x4*E3E2E1 + x3*E2E1 + x2*E1 + x1; 01403 int mem_idx = X/2; 01404 01405 for(int dir=0;dir < 4; dir++){ 01406 LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A); 01407 COMPUTE_RECONSTRUCT_SIGN(sign, dir, (x1-2), (x2-2), (x3-2), (x4-2)); 01408 RECONSTRUCT_SITE_LINK(sign, a); 01409 01410 LOAD_FAT_MATRIX(my_fatlink, dir, idx); 01411 01412 SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat); 01413 01414 WRITE_FAT_MATRIX(my_fatlink,dir, idx); 01415 } 01416 #endif 01417 01418 return; 01419 } 01420 01421 01422 01423 #undef DECLARE_VAR_SIGN 01424 #undef DECLARE_NEW_X 01425 #undef DECLARE_X_ARRAY 01426 01427 #undef a00_re 01428 #undef a00_im 01429 #undef a01_re 01430 #undef a01_im 01431 #undef a02_re 01432 #undef a02_im 01433 #undef a10_re 01434 #undef a10_im 01435 #undef a11_re 01436 #undef a11_im 01437 #undef a12_re 01438 #undef a12_im 01439 #undef a20_re 01440 #undef a20_im 01441 #undef a21_re 01442 #undef a21_im 01443 #undef a22_re 01444 #undef a22_im 01445 01446 #undef b00_re 01447 #undef b00_im 01448 #undef b01_re 01449 #undef b01_im 01450 #undef b02_re 01451 #undef b02_im 01452 #undef b10_re 01453 #undef b10_im 01454 #undef b11_re 01455 #undef b11_im 01456 #undef b12_re 01457 #undef b12_im 01458 #undef b20_re 01459 #undef b20_im 01460 #undef b21_re 01461 #undef b21_im 01462 #undef b22_re 01463 #undef b22_im 01464 01465 #undef bb00_re 01466 #undef bb00_im 01467 #undef bb01_re 01468 #undef bb01_im 01469 #undef bb02_re 01470 #undef bb02_im 01471 #undef bb10_re 01472 #undef bb10_im 01473 #undef bb11_re 01474 #undef bb11_im 01475 #undef bb12_re 01476 #undef bb12_im 01477 #undef bb20_re 01478 #undef bb20_im 01479 #undef bb21_re 01480 #undef bb21_im 01481 #undef bb22_re 01482 #undef bb22_im 01483 01484 
#undef c00_re 01485 #undef c00_im 01486 #undef c01_re 01487 #undef c01_im 01488 #undef c02_re 01489 #undef c02_im 01490 #undef c10_re 01491 #undef c10_im 01492 #undef c11_re 01493 #undef c11_im 01494 #undef c12_re 01495 #undef c12_im 01496 #undef c20_re 01497 #undef c20_im 01498 #undef c21_re 01499 #undef c21_im 01500 #undef c22_re 01501 #undef c22_im 01502 01503 #undef aT00_re 01504 #undef aT00_im 01505 #undef aT01_re 01506 #undef aT01_im 01507 #undef aT02_re 01508 #undef aT02_im 01509 #undef aT10_re 01510 #undef aT10_im 01511 #undef aT11_re 01512 #undef aT11_im 01513 #undef aT12_re 01514 #undef aT12_im 01515 #undef aT20_re 01516 #undef aT20_im 01517 #undef aT21_re 01518 #undef aT21_im 01519 #undef aT22_re 01520 #undef aT22_im 01521 01522 #undef bT00_re 01523 #undef bT00_im 01524 #undef bT01_re 01525 #undef bT01_im 01526 #undef bT02_re 01527 #undef bT02_im 01528 #undef bT10_re 01529 #undef bT10_im 01530 #undef bT11_re 01531 #undef bT11_im 01532 #undef bT12_re 01533 #undef bT12_im 01534 #undef bT20_re 01535 #undef bT20_im 01536 #undef bT21_re 01537 #undef bT21_im 01538 #undef bT22_re 01539 #undef bT22_im 01540 01541 #undef cT00_re 01542 #undef cT00_im 01543 #undef cT01_re 01544 #undef cT01_im 01545 #undef cT02_re 01546 #undef cT02_im 01547 #undef cT10_re 01548 #undef cT10_im 01549 #undef cT11_re 01550 #undef cT11_im 01551 #undef cT12_re 01552 #undef cT12_im 01553 #undef cT20_re 01554 #undef cT20_im 01555 #undef cT21_re 01556 #undef cT21_im 01557 #undef cT22_re 01558 #undef cT22_im 01559 01560 01561 #undef tempa00_re 01562 #undef tempa00_im 01563 #undef tempa01_re 01564 #undef tempa01_im 01565 #undef tempa02_re 01566 #undef tempa02_im 01567 #undef tempa10_re 01568 #undef tempa10_im 01569 #undef tempa11_re 01570 #undef tempa11_im 01571 #undef tempa12_re 01572 #undef tempa12_im 01573 #undef tempa20_re 01574 #undef tempa20_im 01575 #undef tempa21_re 01576 #undef tempa21_im 01577 #undef tempa22_re 01578 #undef tempa22_im 01579 01580 #undef tempb00_re 01581 #undef 
tempb00_im 01582 #undef tempb01_re 01583 #undef tempb01_im 01584 #undef tempb02_re 01585 #undef tempb02_im 01586 #undef tempb10_re 01587 #undef tempb10_im 01588 #undef tempb11_re 01589 #undef tempb11_im 01590 #undef tempb12_re 01591 #undef tempb12_im 01592 #undef tempb20_re 01593 #undef tempb20_im 01594 #undef tempb21_re 01595 #undef tempb21_im 01596 #undef tempb22_re 01597 #undef tempb22_im 01598 01599 #undef fat00_re 01600 #undef fat00_im 01601 #undef fat01_re 01602 #undef fat01_im 01603 #undef fat02_re 01604 #undef fat02_im 01605 #undef fat10_re 01606 #undef fat10_im 01607 #undef fat11_re 01608 #undef fat11_im 01609 #undef fat12_re 01610 #undef fat12_im 01611 #undef fat20_re 01612 #undef fat20_im 01613 #undef fat21_re 01614 #undef fat21_im 01615 #undef fat22_re 01616 #undef fat22_im