/*
  QUDA v0.3.2
  A library for QCD on GPUs
*/
/*
  Fat-link (llfat) staple-computation kernel core.

  Naming conventions used throughout:
    aNM_re / aNM_im  : real/imag part of element (row N, col M) of SU(3) matrix "a"
    xT..             : Hermitian conjugate (dagger) view of matrix x
    FloatN / FloatM  : vector types the site links / staples are packed into
  The accessor macros below map those logical matrix elements onto the packed
  register variables (A0..A8, B0.., C0.., BB0.., TEMPA0.., TEMPB0.., FAT0..)
  that the LOAD_*/MULT_*/WRITE_* helper macros declare and use.
*/

#if ((PRECISION == 1) && (RECONSTRUCT == 12 || RECONSTRUCT == 8))

/* Single precision with a compressed (12- or 8-real) link: loads are float4,
   so each register holds two complex numbers (four reals). */
#define a00_re A0.x
#define a00_im A0.y
#define a01_re A0.z
#define a01_im A0.w
#define a02_re A1.x
#define a02_im A1.y
#define a10_re A1.z
#define a10_im A1.w
#define a11_re A2.x
#define a11_im A2.y
#define a12_re A2.z
#define a12_im A2.w
#define a20_re A3.x
#define a20_im A3.y
#define a21_re A3.z
#define a21_im A3.w
#define a22_re A4.x
#define a22_im A4.y

#define b00_re B0.x
#define b00_im B0.y
#define b01_re B0.z
#define b01_im B0.w
#define b02_re B1.x
#define b02_im B1.y
#define b10_re B1.z
#define b10_im B1.w
#define b11_re B2.x
#define b11_im B2.y
#define b12_re B2.z
#define b12_im B2.w
#define b20_re B3.x
#define b20_im B3.y
#define b21_re B3.z
#define b21_im B3.w
#define b22_re B4.x
#define b22_im B4.y

#define c00_re C0.x
#define c00_im C0.y
#define c01_re C0.z
#define c01_im C0.w
#define c02_re C1.x
#define c02_im C1.y
#define c10_re C1.z
#define c10_im C1.w
#define c11_re C2.x
#define c11_im C2.y
#define c12_re C2.z
#define c12_im C2.w
#define c20_re C3.x
#define c20_im C3.y
#define c21_re C3.z
#define c21_im C3.w
#define c22_re C4.x
#define c22_im C4.y

#else

/* Uncompressed storage: one register (.x = re, .y = im) per complex element. */
#define a00_re A0.x
#define a00_im A0.y
#define a01_re A1.x
#define a01_im A1.y
#define a02_re A2.x
#define a02_im A2.y
#define a10_re A3.x
#define a10_im A3.y
#define a11_re A4.x
#define a11_im A4.y
#define a12_re A5.x
#define a12_im A5.y
#define a20_re A6.x
#define a20_im A6.y
#define a21_re A7.x
#define a21_im A7.y
#define a22_re A8.x
#define a22_im A8.y

#define b00_re B0.x
#define b00_im B0.y
#define b01_re B1.x
#define b01_im B1.y
#define b02_re B2.x
#define b02_im B2.y
#define b10_re B3.x
#define b10_im B3.y
#define b11_re B4.x
#define b11_im B4.y
#define b12_re B5.x
#define b12_im B5.y
#define b20_re B6.x
#define b20_im B6.y
#define b21_re B7.x
#define b21_im B7.y
#define b22_re B8.x
#define b22_im B8.y

#define c00_re C0.x
#define c00_im C0.y
#define c01_re C1.x
#define c01_im C1.y
#define c02_re C2.x
#define c02_im C2.y
#define c10_re C3.x
#define c10_im C3.y
#define c11_re C4.x
#define c11_im C4.y
#define c12_re C5.x
#define c12_im C5.y
#define c20_re C6.x
#define c20_im C6.y
#define c21_re C7.x
#define c21_im C7.y
#define c22_re C8.x
#define c22_im C8.y

#endif


/* The "mulink" (previously accumulated multi-link) field is always stored
   uncompressed, so its accessors do not depend on RECONSTRUCT. */
#define bb00_re BB0.x
#define bb00_im BB0.y
#define bb01_re BB1.x
#define bb01_im BB1.y
#define bb02_re BB2.x
#define bb02_im BB2.y
#define bb10_re BB3.x
#define bb10_im BB3.y
#define bb11_re BB4.x
#define bb11_im BB4.y
#define bb12_re BB5.x
#define bb12_im BB5.y
#define bb20_re BB6.x
#define bb20_im BB6.y
#define bb21_re BB7.x
#define bb21_im BB7.y
#define bb22_re BB8.x
#define bb22_im BB8.y


/* Hermitian-conjugate (dagger) views: element (i,j) of xT is the complex
   conjugate of element (j,i) of x. */
#define aT00_re (+a00_re)
#define aT00_im (-a00_im)
#define aT01_re (+a10_re)
#define aT01_im (-a10_im)
#define aT02_re (+a20_re)
#define aT02_im (-a20_im)
#define aT10_re (+a01_re)
#define aT10_im (-a01_im)
#define aT11_re (+a11_re)
#define aT11_im (-a11_im)
#define aT12_re (+a21_re)
#define aT12_im (-a21_im)
#define aT20_re (+a02_re)
#define aT20_im (-a02_im)
#define aT21_re (+a12_re)
#define aT21_im (-a12_im)
#define aT22_re (+a22_re)
#define aT22_im (-a22_im)

#define bT00_re (+b00_re)
#define bT00_im (-b00_im)
#define bT01_re (+b10_re)
#define bT01_im (-b10_im)
#define bT02_re (+b20_re)
#define bT02_im (-b20_im)
#define bT10_re (+b01_re)
#define bT10_im (-b01_im)
#define bT11_re (+b11_re)
#define bT11_im (-b11_im)
#define bT12_re (+b21_re)
#define bT12_im (-b21_im)
#define bT20_re (+b02_re)
#define bT20_im (-b02_im)
#define bT21_re (+b12_re)
#define bT21_im (-b12_im)
#define bT22_re (+b22_re)
#define bT22_im (-b22_im)

#define cT00_re (+c00_re)
#define cT00_im (-c00_im)
#define cT01_re (+c10_re)
#define cT01_im (-c10_im)
#define cT02_re (+c20_re)
#define cT02_im (-c20_im)
#define cT10_re (+c01_re)
#define cT10_im (-c01_im)
#define cT11_re (+c11_re)
#define cT11_im (-c11_im)
#define cT12_re (+c21_re)
#define cT12_im (-c21_im)
#define cT20_re (+c02_re)
#define cT20_im (-c02_im)
#define cT21_re (+c12_re)
#define cT21_im (-c12_im)
#define cT22_re (+c22_re)
#define cT22_im (-c22_im)


/* Intermediate staple products held in registers. */
#define tempa00_re TEMPA0.x
#define tempa00_im TEMPA0.y
#define tempa01_re TEMPA1.x
#define tempa01_im TEMPA1.y
#define tempa02_re TEMPA2.x
#define tempa02_im TEMPA2.y
#define tempa10_re TEMPA3.x
#define tempa10_im TEMPA3.y
#define tempa11_re TEMPA4.x
#define tempa11_im TEMPA4.y
#define tempa12_re TEMPA5.x
#define tempa12_im TEMPA5.y
#define tempa20_re TEMPA6.x
#define tempa20_im TEMPA6.y
#define tempa21_re TEMPA7.x
#define tempa21_im TEMPA7.y
#define tempa22_re TEMPA8.x
#define tempa22_im TEMPA8.y

#define tempb00_re TEMPB0.x
#define tempb00_im TEMPB0.y
#define tempb01_re TEMPB1.x
#define tempb01_im TEMPB1.y
#define tempb02_re TEMPB2.x
#define tempb02_im TEMPB2.y
#define tempb10_re TEMPB3.x
#define tempb10_im TEMPB3.y
#define tempb11_re TEMPB4.x
#define tempb11_im TEMPB4.y
#define tempb12_re TEMPB5.x
#define tempb12_im TEMPB5.y
#define tempb20_re TEMPB6.x
#define tempb20_im TEMPB6.y
#define tempb21_re TEMPB7.x
#define tempb21_im TEMPB7.y
#define tempb22_re TEMPB8.x
#define tempb22_im TEMPB8.y


//fat link is not compressible
#define fat00_re FAT0.x
#define fat00_im FAT0.y
#define fat01_re FAT1.x
#define fat01_im FAT1.y
#define fat02_re FAT2.x
#define fat02_im FAT2.y
#define fat10_re FAT3.x
#define fat10_im FAT3.y
#define fat11_re FAT4.x
#define fat11_im FAT4.y
#define fat12_re FAT5.x
#define fat12_im FAT5.y
#define fat20_re FAT6.x
#define fat20_im FAT6.y
#define fat21_re FAT7.x
#define fat21_im FAT7.y
#define fat22_re FAT8.x
#define fat22_im FAT8.y

/*
  Computes, for one parity of sites, the (mu,nu) staple from the bare site
  links, accumulates mycoeff * staple into the fat link, and saves the staple
  so later kernels can reuse it.

  In:  sitelink_even/odd   bare SU(3) site links (possibly compressed)
       mycoeff             accumulation coefficient
  Out: fatlink_even/odd    fat link, updated in place
       staple_even/odd     computed staple, written for reuse
  Template parameters select the direction pair (mu, nu) and the parity.

  NOTE(review): helper macros (LLFAT_COMPUTE_NEW_IDX_*, LOAD_*, RECONSTRUCT_*)
  implicitly read/write the locals X, new_mem_idx, new_x1..new_x4 and sign by
  name, so those variable names must not change.
*/
template<int mu, int nu, int odd_bit>
__global__ void
LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
							 FloatN* sitelink_even, FloatN* sitelink_odd,
							 FloatM* fatlink_even, FloatM* fatlink_odd,
							 Float mycoeff)
{
  FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  /* Decode the linear even/odd index into 4-d lattice coordinates
     (x1 runs fastest; x1h is the half-coordinate in the checkerboard). */
  int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
  short x1h = mem_idx - z1*X1h;
  int z2 = FAST_INT_DIVIDE(z1, X2);
  short x2 = z1 - z2*X2;
  short x4 = FAST_INT_DIVIDE(z2, X3);
  short x3 = z2 - x4*X3;
  short x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  short x1 = 2*x1h + x1odd;
  int X = 2*mem_idx + x1odd;          /* full lattice index */
  float sign = 1;
  int new_mem_idx;
  int new_x1 = x1;
  int new_x2 = x2;
  int new_x3 = x3;
  int new_x4 = x4;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (B)
   *               +-------+
   *       nu	   |	   |
   *	     (A)   |	   |(C)
   *		   X	   X
   */
  {
    /* load matrix A */
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);

    /* load matrix B: link in direction mu at the site displaced by +nu */
    LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
    LOAD_ODD_SITE_MATRIX(mu, new_mem_idx, B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(mu, new_mem_idx, sign, b);

    MULT_SU3_NN(a, b, tempa);           /* tempa = A * B */

    /* load matrix C: link in direction nu at the site displaced by +mu */
    LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NA(tempa, c, staple);      /* staple = A * B * C^dagger */
  }

  /***************lower staple****************
   *
   *		   X	   X
   *       nu	   |	   |
   *	     (A)   |	   | (C)
   *		   +-------+
   *		     mu (B)
   *
   *********************************************/
  {
    /* load matrix A: link in direction nu at the site displaced by -nu */
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);

    LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, (new_mem_idx), sign, a);

    /* load matrix B: link in direction mu at the same displaced site */
    LOAD_ODD_SITE_MATRIX(mu, (new_mem_idx), B);
    COMPUTE_RECONSTRUCT_SIGN(sign, mu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(mu, (new_mem_idx), sign, b);

    MULT_SU3_AN(a, b, tempa);           /* tempa = A^dagger * B */

    /* load matrix C: link in direction nu at the site displaced by -nu+mu */
    LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NN(tempa, c, b);           /* b = A^dagger * B * C (reuses b regs) */
    LLFAT_ADD_SU3_MATRIX(b, staple, staple);
  }

  /* fat += mycoeff * staple, then persist both fat link and staple. */
  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);
  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);

  WRITE_STAPLE_MATRIX(staple_even, mem_idx);

  return;
}


/*
  Same (mu,nu) staple construction as above, but the "horizontal" rung of the
  staple is taken from a previously computed multi-link field (mulink, BB)
  instead of a bare site link.  Accumulates mycoeff * staple into the fat
  link; the staple itself is NOT saved.
*/
template<int mu, int nu, int odd_bit>
__global__ void
LLFAT_KERNEL(do_computeGenStapleFieldParity, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
							  FloatM* fatlink_even, FloatM* fatlink_odd,
							  FloatM* mulink_even, FloatM* mulink_odd,
							  Float mycoeff)
{
  FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM TEMPB0, TEMPB1, TEMPB2, TEMPB3, TEMPB4, TEMPB5, TEMPB6, TEMPB7, TEMPB8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  /* Decode linear even/odd index into 4-d lattice coordinates. */
  int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
  int x1h = mem_idx - z1*X1h;
  int z2 = FAST_INT_DIVIDE(z1, X2);
  int x2 = z1 - z2*X2;
  int x4 = FAST_INT_DIVIDE(z2, X3);
  int x3 = z2 - x4*X3;
  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  int x1 = 2*x1h + x1odd;
  int X = 2*mem_idx + x1odd;

  int sign = 1;

  int new_mem_idx;
  int new_x1 = x1;
  int new_x2 = x2;
  int new_x3 = x3;
  int new_x4 = x4;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (BB)
   *               +-------+
   *       nu	   |	   |
   *	     (A)   |	   |(C)
   *		   X	   X
   */
  {
    /* load matrix A */
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);

    /* load matrix BB: mulink at the site displaced by +nu (uncompressed,
       so no reconstruction step is needed) */
    LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_NN(a, bb, tempa);          /* tempa = A * BB */

    /* load matrix C */
    LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NA(tempa, c, tempb);       /* tempb = A * BB * C^dagger */
  }

  /***************lower staple****************
   *
   *		   X	   X
   *       nu	   |	   |
   *	     (A)   |	   | (C)
   *		   +-------+
   *		     mu (B)
   *
   *********************************************/
  {
    /* load matrix A */
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);

    LOAD_ODD_SITE_MATRIX(nu, (new_mem_idx), A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, (new_mem_idx), sign, a);

    /* load matrix B (mulink) */
    LOAD_ODD_MULINK_MATRIX(0, (new_mem_idx), BB);

    MULT_SU3_AN(a, bb, tempa);          /* tempa = A^dagger * BB */

    /* load matrix C */
    LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);
    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NN(tempa, c, a);           /* a = A^dagger * BB * C (reuses a regs) */

    LLFAT_ADD_SU3_MATRIX(a, tempb, tempb);
  }

  /* fat += mycoeff * (upper + lower staple). */
  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
  SCALAR_MULT_ADD_SU3_MATRIX(fat, tempb, mycoeff, fat);

  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);

  return;
}

/*
  Same as do_computeGenStapleFieldParity, but additionally writes the computed
  staple back to staple_even/odd so it can feed further smearing levels.
*/
template<int mu, int nu, int odd_bit>
__global__ void
LLFAT_KERNEL(do_computeGenStapleFieldSaveParity, RECONSTRUCT)(FloatM* staple_even, FloatM* staple_odd,
							      FloatN* sitelink_even, FloatN* sitelink_odd,
							      FloatM* fatlink_even, FloatM* fatlink_odd,
							      FloatM* mulink_even, FloatM* mulink_odd,
							      Float mycoeff)
{
  FloatM TEMPA0, TEMPA1, TEMPA2, TEMPA3, TEMPA4, TEMPA5, TEMPA6, TEMPA7, TEMPA8;
  FloatM STAPLE0, STAPLE1, STAPLE2, STAPLE3, STAPLE4, STAPLE5, STAPLE6, STAPLE7, STAPLE8;

  int mem_idx = blockIdx.x*blockDim.x + threadIdx.x;

  /* Decode linear even/odd index into 4-d lattice coordinates. */
  int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
  int x1h = mem_idx - z1*X1h;
  int z2 = FAST_INT_DIVIDE(z1, X2);
  int x2 = z1 - z2*X2;
  int x4 = FAST_INT_DIVIDE(z2, X3);
  int x3 = z2 - x4*X3;
  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  int x1 = 2*x1h + x1odd;
  int X = 2*mem_idx + x1odd;

  int sign = 1;

  int new_mem_idx;
  int new_x1 = x1;
  int new_x2 = x2;
  int new_x3 = x3;
  int new_x4 = x4;

  /* Upper staple */
  /* Computes the staple :
   *                 mu (BB)
   *               +-------+
   *       nu	   |	   |
   *	     (A)   |	   |(C)
   *		   X	   X
   */
  {
    /* load matrix A */
    LOAD_EVEN_SITE_MATRIX(nu, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(nu, mem_idx, sign, a);

    /* load matrix BB (mulink; uncompressed, no reconstruction needed) */
    LLFAT_COMPUTE_NEW_IDX_PLUS(nu, X);
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_NN(a, bb, tempa);          /* tempa = A * BB */

    /* load matrix C */
    LLFAT_COMPUTE_NEW_IDX_PLUS(mu, X);
    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NA(tempa, c, staple);      /* staple = A * BB * C^dagger */
  }

  /***************lower staple****************
   *
   *		   X	   X
   *       nu	   |	   |
   *	     (A)   |	   | (C)
   *		   +-------+
   *		     mu (B)
   *
   *********************************************/
  {
    /* load matrix A */
    LLFAT_COMPUTE_NEW_IDX_MINUS(nu, X);

    LOAD_ODD_SITE_MATRIX(nu, new_mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, a);

    /* load matrix B (mulink) */
    LOAD_ODD_MULINK_MATRIX(0, new_mem_idx, BB);

    MULT_SU3_AN(a, bb, tempa);          /* tempa = A^dagger * BB */

    /* load matrix C */
    LLFAT_COMPUTE_NEW_IDX_LOWER_STAPLE(nu, mu);

    LOAD_EVEN_SITE_MATRIX(nu, new_mem_idx, C);
    COMPUTE_RECONSTRUCT_SIGN(sign, nu, new_x1, new_x2, new_x3, new_x4);
    RECONSTRUCT_SITE_LINK(nu, new_mem_idx, sign, c);

    MULT_SU3_NN(tempa, c, a);           /* a = A^dagger * BB * C (reuses a regs) */
    LLFAT_ADD_SU3_MATRIX(staple, a, staple);
  }

  /* fat += mycoeff * staple; persist both the fat link and the staple. */
  LOAD_EVEN_FAT_MATRIX(mu, mem_idx);
  SCALAR_MULT_ADD_SU3_MATRIX(fat, staple, mycoeff, fat);

  WRITE_FAT_MATRIX(fatlink_even, mu, mem_idx);
  WRITE_STAPLE_MATRIX(staple_even, mem_idx);

  return;
}

/*
  One-link term of the fat-link construction: for every direction,
  fat = (coeff0 - 6*coeff5) * sitelink.  Each thread handles one site of
  either parity (the grid covers both checkerboards: indices >= Vh are odd).
*/
__global__ void
LLFAT_KERNEL(llfatOneLink, RECONSTRUCT)(FloatN* sitelink_even, FloatN* sitelink_odd,
					FloatM* fatlink_even, FloatM* fatlink_odd,
					Float coeff0, Float coeff5)
{

  FloatN* my_sitelink;
  FloatM* my_fatlink;
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  int mem_idx = sid;

  /* Select the parity this thread works on. */
  int odd_bit = 0;
  my_sitelink = sitelink_even;
  my_fatlink = fatlink_even;
  if (mem_idx >= Vh){
    odd_bit = 1;
    mem_idx = mem_idx - Vh;
    my_sitelink = sitelink_odd;
    my_fatlink = fatlink_odd;
  }

  /* Decode linear index into lattice coordinates (needed for the
     staggered-phase sign of the reconstruction). */
  int z1 = FAST_INT_DIVIDE(mem_idx, X1h);
  int x1h = mem_idx - z1*X1h;
  int z2 = FAST_INT_DIVIDE(z1, X2);
  int x2 = z1 - z2*X2;
  int x4 = FAST_INT_DIVIDE(z2, X3);
  int x3 = z2 - x4*X3;
  int x1odd = (x2 + x3 + x4 + odd_bit) & 1;
  int x1 = 2*x1h + x1odd;
  int sign = 1;

  for(int dir = 0; dir < 4; dir++){
    LOAD_SITE_MATRIX(my_sitelink, dir, mem_idx, A);
    COMPUTE_RECONSTRUCT_SIGN(sign, dir, x1, x2, x3, x4);
    RECONSTRUCT_SITE_LINK(dir, mem_idx, sign, a);

    LOAD_FAT_MATRIX(my_fatlink, dir, mem_idx);

    /* The 6*coeff5 subtraction compensates for the Lepage/5-link terms
       added by the staple kernels. */
    SCALAR_MULT_SU3_MATRIX((coeff0 - 6.0*coeff5), a, fat);

    WRITE_FAT_MATRIX(my_fatlink, dir, mem_idx);
  }

  return;
}


/* Clean up all accessor macros so this header can be re-included with a
   different PRECISION/RECONSTRUCT configuration. */
#undef a00_re
#undef a00_im
#undef a01_re
#undef a01_im
#undef a02_re
#undef a02_im
#undef a10_re
#undef a10_im
#undef a11_re
#undef a11_im
#undef a12_re
#undef a12_im
#undef a20_re
#undef a20_im
#undef a21_re
#undef a21_im
#undef a22_re
#undef a22_im

#undef b00_re
#undef b00_im
#undef b01_re
#undef b01_im
#undef b02_re
#undef b02_im
#undef b10_re
#undef b10_im
#undef b11_re
#undef b11_im
#undef b12_re
#undef b12_im
#undef b20_re
#undef b20_im
#undef b21_re
#undef b21_im
#undef b22_re
#undef b22_im

#undef bb00_re
#undef bb00_im
#undef bb01_re
#undef bb01_im
#undef bb02_re
#undef bb02_im
#undef bb10_re
#undef bb10_im
#undef bb11_re
#undef bb11_im
#undef bb12_re
#undef bb12_im
#undef bb20_re
#undef bb20_im
#undef bb21_re
#undef bb21_im
#undef bb22_re
#undef bb22_im

#undef c00_re
#undef c00_im
#undef c01_re
#undef c01_im
#undef c02_re
#undef c02_im
#undef c10_re
#undef c10_im
#undef c11_re
#undef c11_im
#undef c12_re
#undef c12_im
#undef c20_re
#undef c20_im
#undef c21_re
#undef c21_im
#undef c22_re
#undef c22_im

#undef aT00_re
#undef aT00_im
#undef aT01_re
#undef aT01_im
#undef aT02_re
#undef aT02_im
#undef aT10_re
#undef aT10_im
#undef aT11_re
#undef aT11_im
#undef aT12_re
#undef aT12_im
#undef aT20_re
#undef aT20_im
#undef aT21_re
#undef aT21_im
#undef aT22_re
#undef aT22_im

#undef bT00_re
#undef bT00_im
#undef bT01_re
#undef bT01_im
#undef bT02_re
#undef bT02_im
#undef bT10_re
#undef bT10_im
#undef bT11_re
#undef bT11_im
#undef bT12_re
#undef bT12_im
#undef bT20_re
#undef bT20_im
#undef bT21_re
#undef bT21_im
#undef bT22_re
#undef bT22_im

#undef cT00_re
#undef cT00_im
#undef cT01_re
#undef cT01_im
#undef cT02_re
#undef cT02_im
#undef cT10_re
#undef cT10_im
#undef cT11_re
#undef cT11_im
#undef cT12_re
#undef cT12_im
#undef cT20_re
#undef cT20_im
#undef cT21_re
#undef cT21_im
#undef cT22_re
#undef cT22_im


#undef tempa00_re
#undef tempa00_im
#undef tempa01_re
#undef tempa01_im
#undef tempa02_re
#undef tempa02_im
#undef tempa10_re
#undef tempa10_im
#undef tempa11_re
#undef tempa11_im
#undef tempa12_re
#undef tempa12_im
#undef tempa20_re
#undef tempa20_im
#undef tempa21_re
#undef tempa21_im
#undef tempa22_re
#undef tempa22_im

#undef tempb00_re
#undef tempb00_im
#undef tempb01_re
#undef tempb01_im
#undef tempb02_re
#undef tempb02_im
#undef tempb10_re
#undef tempb10_im
#undef tempb11_re
#undef tempb11_im
#undef tempb12_re
#undef tempb12_im
#undef tempb20_re
#undef tempb20_im
#undef tempb21_re
#undef tempb21_im
#undef tempb22_re
#undef tempb22_im

#undef fat00_re
#undef fat00_im
#undef fat01_re
#undef fat01_im
#undef fat02_re
#undef fat02_im
#undef fat10_re
#undef fat10_im
#undef fat11_re
#undef fat11_im
#undef fat12_re
#undef fat12_im
#undef fat20_re
#undef fat20_im
#undef fat21_re
#undef fat21_im
#undef fat22_re
#undef fat22_im
// (extraction artifact: documentation-generator version tag 1.7.3)