|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 // *** CUDA DSLASH *** 00002 00003 #define SHARED_FLOATS_PER_THREAD 8 00004 00005 // input spinor 00006 #ifdef SPINOR_DOUBLE 00007 #define spinorFloat double 00008 #define i00_re I0.x 00009 #define i00_im I0.y 00010 #define i01_re I1.x 00011 #define i01_im I1.y 00012 #define i02_re I2.x 00013 #define i02_im I2.y 00014 #define i10_re I3.x 00015 #define i10_im I3.y 00016 #define i11_re I4.x 00017 #define i11_im I4.y 00018 #define i12_re I5.x 00019 #define i12_im I5.y 00020 #define i20_re I6.x 00021 #define i20_im I6.y 00022 #define i21_re I7.x 00023 #define i21_im I7.y 00024 #define i22_re I8.x 00025 #define i22_im I8.y 00026 #define i30_re I9.x 00027 #define i30_im I9.y 00028 #define i31_re I10.x 00029 #define i31_im I10.y 00030 #define i32_re I11.x 00031 #define i32_im I11.y 00032 00033 #else 00034 #define spinorFloat float 00035 #define i00_re I0.x 00036 #define i00_im I0.y 00037 #define i01_re I0.z 00038 #define i01_im I0.w 00039 #define i02_re I1.x 00040 #define i02_im I1.y 00041 #define i10_re I1.z 00042 #define i10_im I1.w 00043 #define i11_re I2.x 00044 #define i11_im I2.y 00045 #define i12_re I2.z 00046 #define i12_im I2.w 00047 #define i20_re I3.x 00048 #define i20_im I3.y 00049 #define i21_re I3.z 00050 #define i21_im I3.w 00051 #define i22_re I4.x 00052 #define i22_im I4.y 00053 #define i30_re I4.z 00054 #define i30_im I4.w 00055 #define i31_re I5.x 00056 #define i31_im I5.y 00057 #define i32_re I5.z 00058 #define i32_im I5.w 00059 #endif // SPINOR_DOUBLE 00060 00061 // gauge link 00062 #ifdef GAUGE_FLOAT2 00063 #define g00_re G0.x 00064 #define g00_im G0.y 00065 #define g01_re G1.x 00066 #define g01_im G1.y 00067 #define g02_re G2.x 00068 #define g02_im G2.y 00069 #define g10_re G3.x 00070 #define g10_im G3.y 00071 #define g11_re G4.x 00072 #define g11_im G4.y 00073 #define g12_re G5.x 00074 #define g12_im G5.y 00075 #define g20_re G6.x 00076 #define g20_im G6.y 00077 #define g21_re G7.x 00078 #define g21_im G7.y 00079 #define g22_re G8.x 00080 #define g22_im G8.y 00081 // temporaries 00082 #define A_re G9.x 00083 #define A_im G9.y 00084 00085 #else 00086 #define g00_re G0.x 00087 #define g00_im G0.y 00088 #define g01_re G0.z 00089 #define g01_im G0.w 00090 #define g02_re G1.x 00091 #define g02_im G1.y 00092 #define g10_re G1.z 00093 #define g10_im G1.w 00094 #define g11_re G2.x 00095 #define g11_im G2.y 00096 #define g12_re G2.z 00097 #define g12_im G2.w 00098 #define g20_re G3.x 00099 #define g20_im G3.y 00100 #define g21_re G3.z 00101 #define g21_im G3.w 00102 #define g22_re G4.x 00103 #define g22_im G4.y 00104 // temporaries 00105 #define A_re G4.z 00106 #define A_im G4.w 00107 00108 #endif // GAUGE_DOUBLE 00109 00110 // conjugated gauge link 00111 #define gT00_re (+g00_re) 00112 #define gT00_im (-g00_im) 00113 #define gT01_re (+g10_re) 00114 #define gT01_im (-g10_im) 00115 #define gT02_re (+g20_re) 00116 #define gT02_im (-g20_im) 00117 #define gT10_re (+g01_re) 00118 #define gT10_im (-g01_im) 00119 #define gT11_re (+g11_re) 00120 #define gT11_im (-g11_im) 00121 #define gT12_re (+g21_re) 00122 #define gT12_im (-g21_im) 00123 #define gT20_re (+g02_re) 00124 #define gT20_im (-g02_im) 00125 #define gT21_re (+g12_re) 00126 #define gT21_im (-g12_im) 00127 #define gT22_re (+g22_re) 00128 #define gT22_im (-g22_im) 00129 00130 // output spinor 00131 #define o00_re s[0*SHARED_STRIDE] 00132 #define o00_im s[1*SHARED_STRIDE] 00133 #define o01_re s[2*SHARED_STRIDE] 00134 #define o01_im s[3*SHARED_STRIDE] 00135 #define o02_re s[4*SHARED_STRIDE] 00136 #define o02_im s[5*SHARED_STRIDE] 00137 #define o10_re s[6*SHARED_STRIDE] 00138 #define o10_im s[7*SHARED_STRIDE] 00139 volatile spinorFloat o11_re; 00140 volatile spinorFloat o11_im; 00141 volatile spinorFloat o12_re; 00142 volatile spinorFloat o12_im; 00143 volatile spinorFloat o20_re; 00144 volatile spinorFloat o20_im; 00145 volatile spinorFloat o21_re; 00146 volatile spinorFloat o21_im; 00147 volatile spinorFloat o22_re; 00148 volatile spinorFloat o22_im; 00149 volatile spinorFloat o30_re; 00150 volatile spinorFloat o30_im; 00151 volatile spinorFloat o31_re; 00152 volatile spinorFloat o31_im; 00153 volatile spinorFloat o32_re; 00154 volatile spinorFloat o32_im; 00155 00156 00157 00158 #include "read_gauge.h" 00159 #include "read_clover.h" 00160 #include "io_spinor.h" 00161 00162 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00163 int z1 = FAST_INT_DIVIDE(sid, X1h); 00164 int x1h = sid - z1*X1h; 00165 int z2 = FAST_INT_DIVIDE(z1, X2); 00166 int x2 = z1 - z2*X2; 00167 int x4 = FAST_INT_DIVIDE(z2, X3); 00168 int x3 = z2 - x4*X3; 00169 int x1odd = (x2 + x3 + x4 + oddBit) & 1; 00170 int x1 = 2*x1h + x1odd; 00171 int X = 2*sid + x1odd; 00172 00173 #ifdef SPINOR_DOUBLE 00174 #if (__CUDA_ARCH__ >= 200) 00175 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 00176 #else 00177 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 00178 #endif 00179 extern __shared__ spinorFloat sd_data[]; 00180 volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 00181 + (threadIdx.x % SHARED_STRIDE); 00182 #else 00183 #if (__CUDA_ARCH__ >= 200) 00184 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 00185 #else 00186 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 00187 #endif 00188 extern __shared__ spinorFloat ss_data[]; 00189 volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 00190 + (threadIdx.x % SHARED_STRIDE); 00191 #endif 00192 00193 o00_re = o00_im = 0; 00194 o01_re = o01_im = 0; 00195 o02_re = o02_im = 0; 00196 o10_re = o10_im = 0; 00197 o11_re = o11_im = 0; 00198 o12_re = o12_im = 0; 00199 o20_re = o20_im = 0; 00200 o21_re = o21_im = 0; 00201 o22_re = o22_im = 0; 00202 o30_re = o30_im = 0; 00203 o31_re = o31_im = 0; 00204 o32_re = o32_im = 0; 00205 00206 { 00207 // Projector P0- 00208 // 1 0 0 -i 00209 // 0 1 -i 0 00210 // 0 i 1 0 00211 // i 0 0 1 00212 00213 int sp_idx = ((x1==X1m1) ? X-X1m1 : X+1) >> 1; 00214 int ga_idx = sid; 00215 00216 // read gauge matrix from device memory 00217 READ_GAUGE_MATRIX(GAUGE0TEX, 0); 00218 00219 // read spinor from device memory 00220 READ_SPINOR(SPINORTEX); 00221 00222 // reconstruct gauge matrix 00223 RECONSTRUCT_GAUGE_MATRIX(0); 00224 00225 // project spinor into half spinors 00226 spinorFloat a0_re = +i00_re+i30_im; 00227 spinorFloat a0_im = +i00_im-i30_re; 00228 spinorFloat a1_re = +i01_re+i31_im; 00229 spinorFloat a1_im = +i01_im-i31_re; 00230 spinorFloat a2_re = +i02_re+i32_im; 00231 spinorFloat a2_im = +i02_im-i32_re; 00232 00233 spinorFloat b0_re = +i10_re+i20_im; 00234 spinorFloat b0_im = +i10_im-i20_re; 00235 spinorFloat b1_re = +i11_re+i21_im; 00236 spinorFloat b1_im = +i11_im-i21_re; 00237 spinorFloat b2_re = +i12_re+i22_im; 00238 spinorFloat b2_im = +i12_im-i22_re; 00239 00240 // multiply row 0 00241 spinorFloat A0_re = 0; 00242 A0_re += g00_re * a0_re; 00243 A0_re -= g00_im * a0_im; 00244 A0_re += g01_re * a1_re; 00245 A0_re -= g01_im * a1_im; 00246 A0_re += g02_re * a2_re; 00247 A0_re -= g02_im * a2_im; 00248 spinorFloat A0_im = 0; 00249 A0_im += g00_re * a0_im; 00250 A0_im += g00_im * a0_re; 00251 A0_im += g01_re * a1_im; 00252 A0_im += g01_im * a1_re; 00253 A0_im += g02_re * a2_im; 00254 A0_im += g02_im * a2_re; 00255 spinorFloat B0_re = 0; 00256 B0_re += g00_re * b0_re; 00257 B0_re -= g00_im * b0_im; 00258 B0_re += g01_re * b1_re; 00259 B0_re -= g01_im * b1_im; 00260 B0_re += g02_re * b2_re; 00261 B0_re -= g02_im * b2_im; 00262 spinorFloat B0_im = 0; 00263 B0_im += g00_re * b0_im; 00264 B0_im += g00_im * b0_re; 00265 B0_im += g01_re * b1_im; 00266 B0_im += g01_im * b1_re; 00267 B0_im += g02_re * b2_im; 00268 B0_im += g02_im * b2_re; 00269 00270 // multiply row 1 00271 spinorFloat A1_re = 0; 00272 A1_re += g10_re * a0_re; 00273 A1_re -= g10_im * a0_im; 00274 A1_re += g11_re * a1_re; 00275 A1_re -= g11_im * a1_im; 00276 A1_re += g12_re * a2_re; 00277 A1_re -= g12_im * a2_im; 00278 spinorFloat A1_im = 0; 00279 A1_im += g10_re * a0_im; 00280 A1_im += g10_im * a0_re; 00281 A1_im += g11_re * a1_im; 00282 A1_im += g11_im * a1_re; 00283 A1_im += g12_re * a2_im; 00284 A1_im += g12_im * a2_re; 00285 spinorFloat B1_re = 0; 00286 B1_re += g10_re * b0_re; 00287 B1_re -= g10_im * b0_im; 00288 B1_re += g11_re * b1_re; 00289 B1_re -= g11_im * b1_im; 00290 B1_re += g12_re * b2_re; 00291 B1_re -= g12_im * b2_im; 00292 spinorFloat B1_im = 0; 00293 B1_im += g10_re * b0_im; 00294 B1_im += g10_im * b0_re; 00295 B1_im += g11_re * b1_im; 00296 B1_im += g11_im * b1_re; 00297 B1_im += g12_re * b2_im; 00298 B1_im += g12_im * b2_re; 00299 00300 // multiply row 2 00301 spinorFloat A2_re = 0; 00302 A2_re += g20_re * a0_re; 00303 A2_re -= g20_im * a0_im; 00304 A2_re += g21_re * a1_re; 00305 A2_re -= g21_im * a1_im; 00306 A2_re += g22_re * a2_re; 00307 A2_re -= g22_im * a2_im; 00308 spinorFloat A2_im = 0; 00309 A2_im += g20_re * a0_im; 00310 A2_im += g20_im * a0_re; 00311 A2_im += g21_re * a1_im; 00312 A2_im += g21_im * a1_re; 00313 A2_im += g22_re * a2_im; 00314 A2_im += g22_im * a2_re; 00315 spinorFloat B2_re = 0; 00316 B2_re += g20_re * b0_re; 00317 B2_re -= g20_im * b0_im; 00318 B2_re += g21_re * b1_re; 00319 B2_re -= g21_im * b1_im; 00320 B2_re += g22_re * b2_re; 00321 B2_re -= g22_im * b2_im; 00322 spinorFloat B2_im = 0; 00323 B2_im += g20_re * b0_im; 00324 B2_im += g20_im * b0_re; 00325 B2_im += g21_re * b1_im; 00326 B2_im += g21_im * b1_re; 00327 B2_im += g22_re * b2_im; 00328 B2_im += g22_im * b2_re; 00329 00330 o00_re += A0_re; 00331 o00_im += A0_im; 00332 o10_re += B0_re; 00333 o10_im += B0_im; 00334 o20_re -= B0_im; 00335 o20_im += B0_re; 00336 o30_re -= A0_im; 00337 o30_im += A0_re; 00338 00339 o01_re += A1_re; 00340 o01_im += A1_im; 00341 o11_re += B1_re; 00342 o11_im += B1_im; 00343 o21_re -= B1_im; 00344 o21_im += B1_re; 00345 o31_re -= A1_im; 00346 o31_im += A1_re; 00347 00348 o02_re += A2_re; 00349 o02_im += A2_im; 00350 o12_re += B2_re; 00351 o12_im += B2_im; 00352 o22_re -= B2_im; 00353 o22_im += B2_re; 00354 o32_re -= A2_im; 00355 o32_im += A2_re; 00356 00357 } 00358 00359 { 00360 // Projector P0+ 00361 // 1 0 0 i 00362 // 0 1 i 0 00363 // 0 -i 1 0 00364 // -i 0 0 1 00365 00366 int sp_idx = ((x1==0) ? X+X1m1 : X-1) >> 1; 00367 int ga_idx = sp_idx; 00368 00369 // read gauge matrix from device memory 00370 READ_GAUGE_MATRIX(GAUGE1TEX, 1); 00371 00372 // read spinor from device memory 00373 READ_SPINOR(SPINORTEX); 00374 00375 // reconstruct gauge matrix 00376 RECONSTRUCT_GAUGE_MATRIX(1); 00377 00378 // project spinor into half spinors 00379 spinorFloat a0_re = +i00_re-i30_im; 00380 spinorFloat a0_im = +i00_im+i30_re; 00381 spinorFloat a1_re = +i01_re-i31_im; 00382 spinorFloat a1_im = +i01_im+i31_re; 00383 spinorFloat a2_re = +i02_re-i32_im; 00384 spinorFloat a2_im = +i02_im+i32_re; 00385 00386 spinorFloat b0_re = +i10_re-i20_im; 00387 spinorFloat b0_im = +i10_im+i20_re; 00388 spinorFloat b1_re = +i11_re-i21_im; 00389 spinorFloat b1_im = +i11_im+i21_re; 00390 spinorFloat b2_re = +i12_re-i22_im; 00391 spinorFloat b2_im = +i12_im+i22_re; 00392 00393 // multiply row 0 00394 spinorFloat A0_re = 0; 00395 A0_re += gT00_re * a0_re; 00396 A0_re -= gT00_im * a0_im; 00397 A0_re += gT01_re * a1_re; 00398 A0_re -= gT01_im * a1_im; 00399 A0_re += gT02_re * a2_re; 00400 A0_re -= gT02_im * a2_im; 00401 spinorFloat A0_im = 0; 00402 A0_im += gT00_re * a0_im; 00403 A0_im += gT00_im * a0_re; 00404 A0_im += gT01_re * a1_im; 00405 A0_im += gT01_im * a1_re; 00406 A0_im += gT02_re * a2_im; 00407 A0_im += gT02_im * a2_re; 00408 spinorFloat B0_re = 0; 00409 B0_re += gT00_re * b0_re; 00410 B0_re -= gT00_im * b0_im; 00411 B0_re += gT01_re * b1_re; 00412 B0_re -= gT01_im * b1_im; 00413 B0_re += gT02_re * b2_re; 00414 B0_re -= gT02_im * b2_im; 00415 spinorFloat B0_im = 0; 00416 B0_im += gT00_re * b0_im; 00417 B0_im += gT00_im * b0_re; 00418 B0_im += gT01_re * b1_im; 00419 B0_im += gT01_im * b1_re; 00420 B0_im += gT02_re * b2_im; 00421 B0_im += gT02_im * b2_re; 00422 00423 // multiply row 1 00424 spinorFloat A1_re = 0; 00425 A1_re += gT10_re * a0_re; 00426 A1_re -= gT10_im * a0_im; 00427 A1_re += gT11_re * a1_re; 00428 A1_re -= gT11_im * a1_im; 00429 A1_re += gT12_re * a2_re; 00430 A1_re -= gT12_im * a2_im; 00431 spinorFloat A1_im = 0; 00432 A1_im += gT10_re * a0_im; 00433 A1_im += gT10_im * a0_re; 00434 A1_im += gT11_re * a1_im; 00435 A1_im += gT11_im * a1_re; 00436 A1_im += gT12_re * a2_im; 00437 A1_im += gT12_im * a2_re; 00438 spinorFloat B1_re = 0; 00439 B1_re += gT10_re * b0_re; 00440 B1_re -= gT10_im * b0_im; 00441 B1_re += gT11_re * b1_re; 00442 B1_re -= gT11_im * b1_im; 00443 B1_re += gT12_re * b2_re; 00444 B1_re -= gT12_im * b2_im; 00445 spinorFloat B1_im = 0; 00446 B1_im += gT10_re * b0_im; 00447 B1_im += gT10_im * b0_re; 00448 B1_im += gT11_re * b1_im; 00449 B1_im += gT11_im * b1_re; 00450 B1_im += gT12_re * b2_im; 00451 B1_im += gT12_im * b2_re; 00452 00453 // multiply row 2 00454 spinorFloat A2_re = 0; 00455 A2_re += gT20_re * a0_re; 00456 A2_re -= gT20_im * a0_im; 00457 A2_re += gT21_re * a1_re; 00458 A2_re -= gT21_im * a1_im; 00459 A2_re += gT22_re * a2_re; 00460 A2_re -= gT22_im * a2_im; 00461 spinorFloat A2_im = 0; 00462 A2_im += gT20_re * a0_im; 00463 A2_im += gT20_im * a0_re; 00464 A2_im += gT21_re * a1_im; 00465 A2_im += gT21_im * a1_re; 00466 A2_im += gT22_re * a2_im; 00467 A2_im += gT22_im * a2_re; 00468 spinorFloat B2_re = 0; 00469 B2_re += gT20_re * b0_re; 00470 B2_re -= gT20_im * b0_im; 00471 B2_re += gT21_re * b1_re; 00472 B2_re -= gT21_im * b1_im; 00473 B2_re += gT22_re * b2_re; 00474 B2_re -= gT22_im * b2_im; 00475 spinorFloat B2_im = 0; 00476 B2_im += gT20_re * b0_im; 00477 B2_im += gT20_im * b0_re; 00478 B2_im += gT21_re * b1_im; 00479 B2_im += gT21_im * b1_re; 00480 B2_im += gT22_re * b2_im; 00481 B2_im += gT22_im * b2_re; 00482 00483 o00_re += A0_re; 00484 o00_im += A0_im; 00485 o10_re += B0_re; 00486 o10_im += B0_im; 00487 o20_re += B0_im; 00488 o20_im -= B0_re; 00489 o30_re += A0_im; 00490 o30_im -= A0_re; 00491 00492 o01_re += A1_re; 00493 o01_im += A1_im; 00494 o11_re += B1_re; 00495 o11_im += B1_im; 00496 o21_re += B1_im; 00497 o21_im -= B1_re; 00498 o31_re += A1_im; 00499 o31_im -= A1_re; 00500 00501 o02_re += A2_re; 00502 o02_im += A2_im; 00503 o12_re += B2_re; 00504 o12_im += B2_im; 00505 o22_re += B2_im; 00506 o22_im -= B2_re; 00507 o32_re += A2_im; 00508 o32_im -= A2_re; 00509 00510 } 00511 00512 { 00513 // Projector P1- 00514 // 1 0 0 -1 00515 // 0 1 1 0 00516 // 0 1 1 0 00517 // -1 0 0 1 00518 00519 int sp_idx = ((x2==X2m1) ? X-X2X1mX1 : X+X1) >> 1; 00520 int ga_idx = sid; 00521 00522 // read gauge matrix from device memory 00523 READ_GAUGE_MATRIX(GAUGE0TEX, 2); 00524 00525 // read spinor from device memory 00526 READ_SPINOR(SPINORTEX); 00527 00528 // reconstruct gauge matrix 00529 RECONSTRUCT_GAUGE_MATRIX(2); 00530 00531 // project spinor into half spinors 00532 spinorFloat a0_re = +i00_re-i30_re; 00533 spinorFloat a0_im = +i00_im-i30_im; 00534 spinorFloat a1_re = +i01_re-i31_re; 00535 spinorFloat a1_im = +i01_im-i31_im; 00536 spinorFloat a2_re = +i02_re-i32_re; 00537 spinorFloat a2_im = +i02_im-i32_im; 00538 00539 spinorFloat b0_re = +i10_re+i20_re; 00540 spinorFloat b0_im = +i10_im+i20_im; 00541 spinorFloat b1_re = +i11_re+i21_re; 00542 spinorFloat b1_im = +i11_im+i21_im; 00543 spinorFloat b2_re = +i12_re+i22_re; 00544 spinorFloat b2_im = +i12_im+i22_im; 00545 00546 // multiply row 0 00547 spinorFloat A0_re = 0; 00548 A0_re += g00_re * a0_re; 00549 A0_re -= g00_im * a0_im; 00550 A0_re += g01_re * a1_re; 00551 A0_re -= g01_im * a1_im; 00552 A0_re += g02_re * a2_re; 00553 A0_re -= g02_im * a2_im; 00554 spinorFloat A0_im = 0; 00555 A0_im += g00_re * a0_im; 00556 A0_im += g00_im * a0_re; 00557 A0_im += g01_re * a1_im; 00558 A0_im += g01_im * a1_re; 00559 A0_im += g02_re * a2_im; 00560 A0_im += g02_im * a2_re; 00561 spinorFloat B0_re = 0; 00562 B0_re += g00_re * b0_re; 00563 B0_re -= g00_im * b0_im; 00564 B0_re += g01_re * b1_re; 00565 B0_re -= g01_im * b1_im; 00566 B0_re += g02_re * b2_re; 00567 B0_re -= g02_im * b2_im; 00568 spinorFloat B0_im = 0; 00569 B0_im += g00_re * b0_im; 00570 B0_im += g00_im * b0_re; 00571 B0_im += g01_re * b1_im; 00572 B0_im += g01_im * b1_re; 00573 B0_im += g02_re * b2_im; 00574 B0_im += g02_im * b2_re; 00575 00576 // multiply row 1 00577 spinorFloat A1_re = 0; 00578 A1_re += g10_re * a0_re; 00579 A1_re -= g10_im * a0_im; 00580 A1_re += g11_re * a1_re; 00581 A1_re -= g11_im * a1_im; 00582 A1_re += g12_re * a2_re; 00583 A1_re -= g12_im * a2_im; 00584 spinorFloat A1_im = 0; 00585 A1_im += g10_re * a0_im; 00586 A1_im += g10_im * a0_re; 00587 A1_im += g11_re * a1_im; 00588 A1_im += g11_im * a1_re; 00589 A1_im += g12_re * a2_im; 00590 A1_im += g12_im * a2_re; 00591 spinorFloat B1_re = 0; 00592 B1_re += g10_re * b0_re; 00593 B1_re -= g10_im * b0_im; 00594 B1_re += g11_re * b1_re; 00595 B1_re -= g11_im * b1_im; 00596 B1_re += g12_re * b2_re; 00597 B1_re -= g12_im * b2_im; 00598 spinorFloat B1_im = 0; 00599 B1_im += g10_re * b0_im; 00600 B1_im += g10_im * b0_re; 00601 B1_im += g11_re * b1_im; 00602 B1_im += g11_im * b1_re; 00603 B1_im += g12_re * b2_im; 00604 B1_im += g12_im * b2_re; 00605 00606 // multiply row 2 00607 spinorFloat A2_re = 0; 00608 A2_re += g20_re * a0_re; 00609 A2_re -= g20_im * a0_im; 00610 A2_re += g21_re * a1_re; 00611 A2_re -= g21_im * a1_im; 00612 A2_re += g22_re * a2_re; 00613 A2_re -= g22_im * a2_im; 00614 spinorFloat A2_im = 0; 00615 A2_im += g20_re * a0_im; 00616 A2_im += g20_im * a0_re; 00617 A2_im += g21_re * a1_im; 00618 A2_im += g21_im * a1_re; 00619 A2_im += g22_re * a2_im; 00620 A2_im += g22_im * a2_re; 00621 spinorFloat B2_re = 0; 00622 B2_re += g20_re * b0_re; 00623 B2_re -= g20_im * b0_im; 00624 B2_re += g21_re * b1_re; 00625 B2_re -= g21_im * b1_im; 00626 B2_re += g22_re * b2_re; 00627 B2_re -= g22_im * b2_im; 00628 spinorFloat B2_im = 0; 00629 B2_im += g20_re * b0_im; 00630 B2_im += g20_im * b0_re; 00631 B2_im += g21_re * b1_im; 00632 B2_im += g21_im * b1_re; 00633 B2_im += g22_re * b2_im; 00634 B2_im += g22_im * b2_re; 00635 00636 o00_re += A0_re; 00637 o00_im += A0_im; 00638 o10_re += B0_re; 00639 o10_im += B0_im; 00640 o20_re += B0_re; 00641 o20_im += B0_im; 00642 o30_re -= A0_re; 00643 o30_im -= A0_im; 00644 00645 o01_re += A1_re; 00646 o01_im += A1_im; 00647 o11_re += B1_re; 00648 o11_im += B1_im; 00649 o21_re += B1_re; 00650 o21_im += B1_im; 00651 o31_re -= A1_re; 00652 o31_im -= A1_im; 00653 00654 o02_re += A2_re; 00655 o02_im += A2_im; 00656 o12_re += B2_re; 00657 o12_im += B2_im; 00658 o22_re += B2_re; 00659 o22_im += B2_im; 00660 o32_re -= A2_re; 00661 o32_im -= A2_im; 00662 00663 } 00664 00665 { 00666 // Projector P1+ 00667 // 1 0 0 1 00668 // 0 1 -1 0 00669 // 0 -1 1 0 00670 // 1 0 0 1 00671 00672 int sp_idx = ((x2==0) ? X+X2X1mX1 : X-X1) >> 1; 00673 int ga_idx = sp_idx; 00674 00675 // read gauge matrix from device memory 00676 READ_GAUGE_MATRIX(GAUGE1TEX, 3); 00677 00678 // read spinor from device memory 00679 READ_SPINOR(SPINORTEX); 00680 00681 // reconstruct gauge matrix 00682 RECONSTRUCT_GAUGE_MATRIX(3); 00683 00684 // project spinor into half spinors 00685 spinorFloat a0_re = +i00_re+i30_re; 00686 spinorFloat a0_im = +i00_im+i30_im; 00687 spinorFloat a1_re = +i01_re+i31_re; 00688 spinorFloat a1_im = +i01_im+i31_im; 00689 spinorFloat a2_re = +i02_re+i32_re; 00690 spinorFloat a2_im = +i02_im+i32_im; 00691 00692 spinorFloat b0_re = +i10_re-i20_re; 00693 spinorFloat b0_im = +i10_im-i20_im; 00694 spinorFloat b1_re = +i11_re-i21_re; 00695 spinorFloat b1_im = +i11_im-i21_im; 00696 spinorFloat b2_re = +i12_re-i22_re; 00697 spinorFloat b2_im = +i12_im-i22_im; 00698 00699 // multiply row 0 00700 spinorFloat A0_re = 0; 00701 A0_re += gT00_re * a0_re; 00702 A0_re -= gT00_im * a0_im; 00703 A0_re += gT01_re * a1_re; 00704 A0_re -= gT01_im * a1_im; 00705 A0_re += gT02_re * a2_re; 00706 A0_re -= gT02_im * a2_im; 00707 spinorFloat A0_im = 0; 00708 A0_im += gT00_re * a0_im; 00709 A0_im += gT00_im * a0_re; 00710 A0_im += gT01_re * a1_im; 00711 A0_im += gT01_im * a1_re; 00712 A0_im += gT02_re * a2_im; 00713 A0_im += gT02_im * a2_re; 00714 spinorFloat B0_re = 0; 00715 B0_re += gT00_re * b0_re; 00716 B0_re -= gT00_im * b0_im; 00717 B0_re += gT01_re * b1_re; 00718 B0_re -= gT01_im * b1_im; 00719 B0_re += gT02_re * b2_re; 00720 B0_re -= gT02_im * b2_im; 00721 spinorFloat B0_im = 0; 00722 B0_im += gT00_re * b0_im; 00723 B0_im += gT00_im * b0_re; 00724 B0_im += gT01_re * b1_im; 00725 B0_im += gT01_im * b1_re; 00726 B0_im += gT02_re * b2_im; 00727 B0_im += gT02_im * b2_re; 00728 00729 // multiply row 1 00730 spinorFloat A1_re = 0; 00731 A1_re += gT10_re * a0_re; 00732 A1_re -= gT10_im * a0_im; 00733 A1_re += gT11_re * a1_re; 00734 A1_re -= gT11_im * a1_im; 00735 A1_re += gT12_re * a2_re; 00736 A1_re -= gT12_im * a2_im; 00737 spinorFloat A1_im = 0; 00738 A1_im += gT10_re * a0_im; 00739 A1_im += gT10_im * a0_re; 00740 A1_im += gT11_re * a1_im; 00741 A1_im += gT11_im * a1_re; 00742 A1_im += gT12_re * a2_im; 00743 A1_im += gT12_im * a2_re; 00744 spinorFloat B1_re = 0; 00745 B1_re += gT10_re * b0_re; 00746 B1_re -= gT10_im * b0_im; 00747 B1_re += gT11_re * b1_re; 00748 B1_re -= gT11_im * b1_im; 00749 B1_re += gT12_re * b2_re; 00750 B1_re -= gT12_im * b2_im; 00751 spinorFloat B1_im = 0; 00752 B1_im += gT10_re * b0_im; 00753 B1_im += gT10_im * b0_re; 00754 B1_im += gT11_re * b1_im; 00755 B1_im += gT11_im * b1_re; 00756 B1_im += gT12_re * b2_im; 00757 B1_im += gT12_im * b2_re; 00758 00759 // multiply row 2 00760 spinorFloat A2_re = 0; 00761 A2_re += gT20_re * a0_re; 00762 A2_re -= gT20_im * a0_im; 00763 A2_re += gT21_re * a1_re; 00764 A2_re -= gT21_im * a1_im; 00765 A2_re += gT22_re * a2_re; 00766 A2_re -= gT22_im * a2_im; 00767 spinorFloat A2_im = 0; 00768 A2_im += gT20_re * a0_im; 00769 A2_im += gT20_im * a0_re; 00770 A2_im += gT21_re * a1_im; 00771 A2_im += gT21_im * a1_re; 00772 A2_im += gT22_re * a2_im; 00773 A2_im += gT22_im * a2_re; 00774 spinorFloat B2_re = 0; 00775 B2_re += gT20_re * b0_re; 00776 B2_re -= gT20_im * b0_im; 00777 B2_re += gT21_re * b1_re; 00778 B2_re -= gT21_im * b1_im; 00779 B2_re += gT22_re * b2_re; 00780 B2_re -= gT22_im * b2_im; 00781 spinorFloat B2_im = 0; 00782 B2_im += gT20_re * b0_im; 00783 B2_im += gT20_im * b0_re; 00784 B2_im += gT21_re * b1_im; 00785 B2_im += gT21_im * b1_re; 00786 B2_im += gT22_re * b2_im; 00787 B2_im += gT22_im * b2_re; 00788 00789 o00_re += A0_re; 00790 o00_im += A0_im; 00791 o10_re += B0_re; 00792 o10_im += B0_im; 00793 o20_re -= B0_re; 00794 o20_im -= B0_im; 00795 o30_re += A0_re; 00796 o30_im += A0_im; 00797 00798 o01_re += A1_re; 00799 o01_im += A1_im; 00800 o11_re += B1_re; 00801 o11_im += B1_im; 00802 o21_re -= B1_re; 00803 o21_im -= B1_im; 00804 o31_re += A1_re; 00805 o31_im += A1_im; 00806 00807 o02_re += A2_re; 00808 o02_im += A2_im; 00809 o12_re += B2_re; 00810 o12_im += B2_im; 00811 o22_re -= B2_re; 00812 o22_im -= B2_im; 00813 o32_re += A2_re; 00814 o32_im += A2_im; 00815 00816 } 00817 00818 { 00819 // Projector P2- 00820 // 1 0 -i 0 00821 // 0 1 0 i 00822 // i 0 1 0 00823 // 0 -i 0 1 00824 00825 int sp_idx = ((x3==X3m1) ? X-X3X2X1mX2X1 : X+X2X1) >> 1; 00826 int ga_idx = sid; 00827 00828 // read gauge matrix from device memory 00829 READ_GAUGE_MATRIX(GAUGE0TEX, 4); 00830 00831 // read spinor from device memory 00832 READ_SPINOR(SPINORTEX); 00833 00834 // reconstruct gauge matrix 00835 RECONSTRUCT_GAUGE_MATRIX(4); 00836 00837 // project spinor into half spinors 00838 spinorFloat a0_re = +i00_re+i20_im; 00839 spinorFloat a0_im = +i00_im-i20_re; 00840 spinorFloat a1_re = +i01_re+i21_im; 00841 spinorFloat a1_im = +i01_im-i21_re; 00842 spinorFloat a2_re = +i02_re+i22_im; 00843 spinorFloat a2_im = +i02_im-i22_re; 00844 00845 spinorFloat b0_re = +i10_re-i30_im; 00846 spinorFloat b0_im = +i10_im+i30_re; 00847 spinorFloat b1_re = +i11_re-i31_im; 00848 spinorFloat b1_im = +i11_im+i31_re; 00849 spinorFloat b2_re = +i12_re-i32_im; 00850 spinorFloat b2_im = +i12_im+i32_re; 00851 00852 // multiply row 0 00853 spinorFloat A0_re = 0; 00854 A0_re += g00_re * a0_re; 00855 A0_re -= g00_im * a0_im; 00856 A0_re += g01_re * a1_re; 00857 A0_re -= g01_im * a1_im; 00858 A0_re += g02_re * a2_re; 00859 A0_re -= g02_im * a2_im; 00860 spinorFloat A0_im = 0; 00861 A0_im += g00_re * a0_im; 00862 A0_im += g00_im * a0_re; 00863 A0_im += g01_re * a1_im; 00864 A0_im += g01_im * a1_re; 00865 A0_im += g02_re * a2_im; 00866 A0_im += g02_im * a2_re; 00867 spinorFloat B0_re = 0; 00868 B0_re += g00_re * b0_re; 00869 B0_re -= g00_im * b0_im; 00870 B0_re += g01_re * b1_re; 00871 B0_re -= g01_im * b1_im; 00872 B0_re += g02_re * b2_re; 00873 B0_re -= g02_im * b2_im; 00874 spinorFloat B0_im = 0; 00875 B0_im += g00_re * b0_im; 00876 B0_im += g00_im * b0_re; 00877 B0_im += g01_re * b1_im; 00878 B0_im += g01_im * b1_re; 00879 B0_im += g02_re * b2_im; 00880 B0_im += g02_im * b2_re; 00881 00882 // multiply row 1 00883 spinorFloat A1_re = 0; 00884 A1_re += g10_re * a0_re; 00885 A1_re -= g10_im * a0_im; 00886 A1_re += g11_re * a1_re; 00887 A1_re -= g11_im * a1_im; 00888 A1_re += g12_re * a2_re; 00889 A1_re -= g12_im * a2_im; 00890 spinorFloat A1_im = 0; 00891 A1_im += g10_re * a0_im; 00892 A1_im += g10_im * a0_re; 00893 A1_im += g11_re * a1_im; 00894 A1_im += g11_im * a1_re; 00895 A1_im += g12_re * a2_im; 00896 A1_im += g12_im * a2_re; 00897 spinorFloat B1_re = 0; 00898 B1_re += g10_re * b0_re; 00899 B1_re -= g10_im * b0_im; 00900 B1_re += g11_re * b1_re; 00901 B1_re -= g11_im * b1_im; 00902 B1_re += g12_re * b2_re; 00903 B1_re -= g12_im * b2_im; 00904 spinorFloat B1_im = 0; 00905 B1_im += g10_re * b0_im; 00906 B1_im += g10_im * b0_re; 00907 B1_im += g11_re * b1_im; 00908 B1_im += g11_im * b1_re; 00909 B1_im += g12_re * b2_im; 00910 B1_im += g12_im * b2_re; 00911 00912 // multiply row 2 00913 spinorFloat A2_re = 0; 00914 A2_re += g20_re * a0_re; 00915 A2_re -= g20_im * a0_im; 00916 A2_re += g21_re * a1_re; 00917 A2_re -= g21_im * a1_im; 00918 A2_re += g22_re * a2_re; 00919 A2_re -= g22_im * a2_im; 00920 spinorFloat A2_im = 0; 00921 A2_im += g20_re * a0_im; 00922 A2_im += g20_im * a0_re; 00923 A2_im += g21_re * a1_im; 00924 A2_im += g21_im * a1_re; 00925 A2_im += g22_re * a2_im; 00926 A2_im += g22_im * a2_re; 00927 spinorFloat B2_re = 0; 00928 B2_re += g20_re * b0_re; 00929 B2_re -= g20_im * b0_im; 00930 B2_re += g21_re * b1_re; 00931 B2_re -= g21_im * b1_im; 00932 B2_re += g22_re * b2_re; 00933 B2_re -= g22_im * b2_im; 00934 spinorFloat B2_im = 0; 00935 B2_im += g20_re * b0_im; 00936 B2_im += g20_im * b0_re; 00937 B2_im += g21_re * b1_im; 00938 B2_im += g21_im * b1_re; 00939 B2_im += g22_re * b2_im; 00940 B2_im += g22_im * b2_re; 00941 00942 o00_re += A0_re; 00943 o00_im += A0_im; 00944 o10_re += B0_re; 00945 o10_im += B0_im; 00946 o20_re -= A0_im; 00947 o20_im += A0_re; 00948 o30_re += B0_im; 00949 o30_im -= B0_re; 00950 00951 o01_re += A1_re; 00952 o01_im += A1_im; 00953 o11_re += B1_re; 00954 o11_im += B1_im; 00955 o21_re -= A1_im; 00956 o21_im += A1_re; 00957 o31_re += B1_im; 00958 o31_im -= B1_re; 00959 00960 o02_re += A2_re; 00961 o02_im += A2_im; 00962 o12_re += B2_re; 00963 o12_im += B2_im; 00964 o22_re -= A2_im; 00965 o22_im += A2_re; 00966 o32_re += B2_im; 00967 o32_im -= B2_re; 00968 00969 } 00970 00971 { 00972 // Projector P2+ 00973 // 1 0 i 0 00974 // 0 1 0 -i 00975 // -i 0 1 0 00976 // 0 i 0 1 00977 00978 int sp_idx = ((x3==0) ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 00979 int ga_idx = sp_idx; 00980 00981 // read gauge matrix from device memory 00982 READ_GAUGE_MATRIX(GAUGE1TEX, 5); 00983 00984 // read spinor from device memory 00985 READ_SPINOR(SPINORTEX); 00986 00987 // reconstruct gauge matrix 00988 RECONSTRUCT_GAUGE_MATRIX(5); 00989 00990 // project spinor into half spinors 00991 spinorFloat a0_re = +i00_re-i20_im; 00992 spinorFloat a0_im = +i00_im+i20_re; 00993 spinorFloat a1_re = +i01_re-i21_im; 00994 spinorFloat a1_im = +i01_im+i21_re; 00995 spinorFloat a2_re = +i02_re-i22_im; 00996 spinorFloat a2_im = +i02_im+i22_re; 00997 00998 spinorFloat b0_re = +i10_re+i30_im; 00999 spinorFloat b0_im = +i10_im-i30_re; 01000 spinorFloat b1_re = +i11_re+i31_im; 01001 spinorFloat b1_im = +i11_im-i31_re; 01002 spinorFloat b2_re = +i12_re+i32_im; 01003 spinorFloat b2_im = +i12_im-i32_re; 01004 01005 // multiply row 0 01006 spinorFloat A0_re = 0; 01007 A0_re += gT00_re * a0_re; 01008 A0_re -= gT00_im * a0_im; 01009 A0_re += gT01_re * a1_re; 01010 A0_re -= gT01_im * a1_im; 01011 A0_re += gT02_re * a2_re; 01012 A0_re -= gT02_im * a2_im; 01013 spinorFloat A0_im = 0; 01014 A0_im += gT00_re * a0_im; 01015 A0_im += gT00_im * a0_re; 01016 A0_im += gT01_re * a1_im; 01017 A0_im += gT01_im * a1_re; 01018 A0_im += gT02_re * a2_im; 01019 A0_im += gT02_im * a2_re; 01020 spinorFloat B0_re = 0; 01021 B0_re += gT00_re * b0_re; 01022 B0_re -= gT00_im * b0_im; 01023 B0_re += gT01_re * b1_re; 01024 B0_re -= gT01_im * b1_im; 01025 B0_re += gT02_re * b2_re; 01026 B0_re -= gT02_im * b2_im; 01027 spinorFloat B0_im = 0; 01028 B0_im += gT00_re * b0_im; 01029 B0_im += gT00_im * b0_re; 01030 B0_im += gT01_re * b1_im; 01031 B0_im += gT01_im * b1_re; 01032 B0_im += gT02_re * b2_im; 01033 B0_im += gT02_im * b2_re; 01034 01035 // multiply row 1 01036 spinorFloat A1_re = 0; 01037 A1_re += gT10_re * a0_re; 01038 A1_re -= gT10_im * a0_im; 01039 A1_re += gT11_re * a1_re; 01040 A1_re -= gT11_im * a1_im; 01041 A1_re += gT12_re * a2_re; 01042 A1_re -= gT12_im * a2_im; 01043 spinorFloat A1_im = 0; 01044 A1_im += gT10_re * a0_im; 01045 A1_im += gT10_im * a0_re; 01046 A1_im += gT11_re * a1_im; 01047 A1_im += gT11_im * a1_re; 01048 A1_im += gT12_re * a2_im; 01049 A1_im += gT12_im * a2_re; 01050 spinorFloat B1_re = 0; 01051 B1_re += gT10_re * b0_re; 01052 B1_re -= gT10_im * b0_im; 01053 B1_re += gT11_re * b1_re; 01054 B1_re -= gT11_im * b1_im; 01055 B1_re += gT12_re * b2_re; 01056 B1_re -= gT12_im * b2_im; 01057 spinorFloat B1_im = 0; 01058 B1_im += gT10_re * b0_im; 01059 B1_im += gT10_im * b0_re; 01060 B1_im += gT11_re * b1_im; 01061 B1_im += gT11_im * b1_re; 01062 B1_im += gT12_re * b2_im; 01063 B1_im += gT12_im * b2_re; 01064 01065 // multiply row 2 01066 spinorFloat A2_re = 0; 01067 A2_re += gT20_re * a0_re; 01068 A2_re -= gT20_im * a0_im; 01069 A2_re += gT21_re * a1_re; 01070 A2_re -= gT21_im * a1_im; 01071 A2_re += gT22_re * a2_re; 01072 A2_re -= gT22_im * a2_im; 01073 spinorFloat A2_im = 0; 01074 A2_im += gT20_re * a0_im; 01075 A2_im += gT20_im * a0_re; 01076 A2_im += gT21_re * a1_im; 01077 A2_im += gT21_im * a1_re; 01078 A2_im += gT22_re * a2_im; 01079 A2_im += gT22_im * a2_re; 01080 spinorFloat B2_re = 0; 01081 B2_re += gT20_re * b0_re; 01082 B2_re -= gT20_im * b0_im; 01083 B2_re += gT21_re * b1_re; 01084 B2_re -= gT21_im * b1_im; 01085 B2_re += gT22_re * b2_re; 01086 B2_re -= gT22_im * b2_im; 01087 spinorFloat B2_im = 0; 01088 B2_im += gT20_re * b0_im; 01089 B2_im += gT20_im * b0_re; 01090 B2_im += gT21_re * b1_im; 01091 B2_im += gT21_im * b1_re; 01092 B2_im += gT22_re * b2_im; 01093 B2_im += gT22_im * b2_re; 01094 01095 o00_re += A0_re; 01096 o00_im += A0_im; 01097 o10_re += B0_re; 01098 o10_im += B0_im; 01099 o20_re += A0_im; 01100 o20_im -= A0_re; 01101 o30_re -= B0_im; 01102 o30_im += B0_re; 01103 01104 o01_re += A1_re; 01105 o01_im += A1_im; 01106 o11_re += B1_re; 01107 o11_im += B1_im; 01108 o21_re += A1_im; 01109 o21_im -= A1_re; 01110 o31_re -= B1_im; 01111 o31_im += B1_re; 01112 01113 o02_re += A2_re; 01114 o02_im += A2_im; 01115 o12_re += B2_re; 01116 o12_im += B2_im; 01117 o22_re += A2_im; 01118 o22_im -= A2_re; 01119 o32_re -= B2_im; 01120 o32_im += B2_re; 01121 01122 } 01123 01124 { 01125 // Projector P3- 01126 // 0 0 0 0 01127 // 0 0 0 0 01128 // 0 0 2 0 01129 // 0 0 0 2 01130 01131 int sp_idx = ((x4==X4m1) ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01132 int ga_idx = sid; 01133 01134 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) { 01135 // read spinor from device memory 01136 READ_SPINOR_DOWN(SPINORTEX); 01137 01138 // project spinor into half spinors 01139 spinorFloat a0_re = +2*i20_re; 01140 spinorFloat a0_im = +2*i20_im; 01141 spinorFloat a1_re = +2*i21_re; 01142 spinorFloat a1_im = +2*i21_im; 01143 spinorFloat a2_re = +2*i22_re; 01144 spinorFloat a2_im = +2*i22_im; 01145 01146 spinorFloat b0_re = +2*i30_re; 01147 spinorFloat b0_im = +2*i30_im; 01148 spinorFloat b1_re = +2*i31_re; 01149 spinorFloat b1_im = +2*i31_im; 01150 spinorFloat b2_re = +2*i32_re; 01151 spinorFloat b2_im = +2*i32_im; 01152 01153 // identity gauge matrix 01154 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01155 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01156 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01157 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01158 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01159 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01160 01161 o20_re += A0_re; 01162 o20_im += A0_im; 01163 o30_re += B0_re; 01164 o30_im += B0_im; 01165 01166 o21_re += A1_re; 01167 o21_im += A1_im; 01168 o31_re += B1_re; 01169 o31_im += B1_im; 01170 01171 o22_re += A2_re; 01172 o22_im += A2_im; 01173 o32_re += B2_re; 01174 o32_im += B2_im; 01175 01176 } 01177 else { 01178 // read gauge matrix from device memory 01179 READ_GAUGE_MATRIX(GAUGE0TEX, 6); 01180 01181 // read spinor from device memory 01182 READ_SPINOR_DOWN(SPINORTEX); 01183 01184 // reconstruct gauge matrix 01185 RECONSTRUCT_GAUGE_MATRIX(6); 01186 01187 // project spinor into half spinors 01188 spinorFloat a0_re = +2*i20_re; 01189 spinorFloat a0_im = +2*i20_im; 01190 spinorFloat a1_re = +2*i21_re; 01191 spinorFloat a1_im = +2*i21_im; 01192 spinorFloat a2_re = +2*i22_re; 01193 spinorFloat a2_im = +2*i22_im; 01194 01195 spinorFloat b0_re = +2*i30_re; 01196 spinorFloat b0_im = +2*i30_im; 01197 spinorFloat b1_re = +2*i31_re; 01198 spinorFloat b1_im = +2*i31_im; 01199 spinorFloat b2_re = +2*i32_re; 01200 spinorFloat b2_im = +2*i32_im; 01201 01202 // multiply row 0 01203 spinorFloat A0_re = 0; 01204 A0_re += g00_re * a0_re; 01205 A0_re -= g00_im * a0_im; 01206 A0_re += g01_re * a1_re; 01207 A0_re -= g01_im * a1_im; 01208 A0_re += g02_re * a2_re; 01209 A0_re -= g02_im * a2_im; 01210 spinorFloat A0_im = 0; 01211 A0_im += g00_re * a0_im; 01212 A0_im += g00_im * a0_re; 01213 A0_im += g01_re * a1_im; 01214 A0_im += g01_im * a1_re; 01215 A0_im += g02_re * a2_im; 01216 A0_im += g02_im * a2_re; 01217 spinorFloat B0_re = 0; 01218 B0_re += g00_re * b0_re; 01219 B0_re -= g00_im * b0_im; 01220 B0_re += g01_re * b1_re; 01221 B0_re -= g01_im * b1_im; 01222 B0_re += g02_re * b2_re; 01223 B0_re -= g02_im * b2_im; 01224 spinorFloat B0_im = 0; 01225 B0_im += g00_re * b0_im; 01226 B0_im += g00_im * b0_re; 01227 B0_im += g01_re * b1_im; 01228 B0_im += g01_im * b1_re; 01229 B0_im += g02_re * b2_im; 01230 B0_im += g02_im * b2_re; 01231 01232 // multiply row 1 01233 spinorFloat A1_re = 0; 01234 A1_re += g10_re * a0_re; 01235 A1_re -= g10_im * a0_im; 01236 A1_re += g11_re * a1_re; 01237 A1_re -= g11_im * a1_im; 01238 A1_re += g12_re * a2_re; 01239 A1_re -= g12_im * a2_im; 01240 spinorFloat A1_im = 0; 01241 A1_im += g10_re * a0_im; 01242 A1_im += g10_im * a0_re; 01243 A1_im += g11_re * a1_im; 01244 A1_im += g11_im * a1_re; 01245 A1_im += g12_re * a2_im; 01246 A1_im += g12_im * a2_re; 01247 spinorFloat B1_re = 0; 01248 B1_re += g10_re * b0_re; 01249 B1_re -= g10_im * b0_im; 01250 B1_re += g11_re * b1_re; 01251 B1_re -= g11_im * b1_im; 01252 B1_re += g12_re * b2_re; 01253 B1_re -= g12_im * b2_im; 01254 spinorFloat B1_im = 0; 01255 B1_im += g10_re * b0_im; 01256 B1_im += g10_im * b0_re; 01257 B1_im += g11_re * b1_im; 01258 B1_im += g11_im * b1_re; 01259 B1_im += g12_re * b2_im; 01260 B1_im += g12_im * b2_re; 01261 01262 // multiply row 2 01263 spinorFloat A2_re = 0; 01264 A2_re += g20_re * a0_re; 01265 A2_re -= g20_im * a0_im; 01266 A2_re += g21_re * a1_re; 01267 A2_re -= g21_im * a1_im; 01268 A2_re += g22_re * a2_re; 01269 A2_re -= g22_im * a2_im; 01270 spinorFloat A2_im = 0; 01271 A2_im += g20_re * a0_im; 01272 A2_im += g20_im * a0_re; 01273 A2_im += g21_re * a1_im; 01274 A2_im += g21_im * a1_re; 01275 A2_im += g22_re * a2_im; 01276 A2_im += g22_im * a2_re; 01277 spinorFloat B2_re = 0; 01278 B2_re += g20_re * b0_re; 01279 B2_re -= g20_im * b0_im; 01280 B2_re += g21_re * b1_re; 01281 B2_re -= g21_im * b1_im; 01282 B2_re += g22_re * b2_re; 01283 B2_re -= g22_im * b2_im; 01284 spinorFloat B2_im = 0; 01285 B2_im += g20_re * b0_im; 01286 B2_im += g20_im * b0_re; 01287 B2_im += g21_re * b1_im; 01288 B2_im += g21_im * b1_re; 01289 B2_im += g22_re * b2_im; 01290 B2_im += g22_im * b2_re; 01291 01292 o20_re += A0_re; 01293 o20_im += A0_im; 01294 o30_re += B0_re; 01295 o30_im += B0_im; 01296 01297 o21_re += A1_re; 01298 o21_im += A1_im; 01299 o31_re += B1_re; 01300 o31_im += B1_im; 01301 01302 o22_re += A2_re; 01303 o22_im += A2_im; 01304 o32_re += B2_re; 01305 o32_im += B2_im; 01306 01307 } 01308 } 01309 01310 { 01311 // Projector P3+ 01312 // 2 0 0 0 01313 // 0 2 0 0 01314 // 0 0 0 0 01315 // 0 0 0 0 01316 01317 int sp_idx = ((x4==0) ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01318 int ga_idx = sp_idx; 01319 01320 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) { 01321 // read spinor from device memory 01322 READ_SPINOR_UP(SPINORTEX); 01323 01324 // project spinor into half spinors 01325 spinorFloat a0_re = +2*i00_re; 01326 spinorFloat a0_im = +2*i00_im; 01327 spinorFloat a1_re = +2*i01_re; 01328 spinorFloat a1_im = +2*i01_im; 01329 spinorFloat a2_re = +2*i02_re; 01330 spinorFloat a2_im = +2*i02_im; 01331 01332 spinorFloat b0_re = +2*i10_re; 01333 spinorFloat b0_im = +2*i10_im; 01334 spinorFloat b1_re = +2*i11_re; 01335 spinorFloat b1_im = +2*i11_im; 01336 spinorFloat b2_re = +2*i12_re; 01337 spinorFloat b2_im = +2*i12_im; 01338 01339 // identity gauge matrix 01340 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01341 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01342 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01343 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01344 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01345 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01346 01347 o00_re += A0_re; 01348 o00_im += A0_im; 01349 o10_re += B0_re; 01350 o10_im += B0_im; 01351 01352 o01_re += A1_re; 01353 o01_im += A1_im; 01354 o11_re += B1_re; 01355 o11_im += B1_im; 01356 01357 o02_re += A2_re; 01358 o02_im += A2_im; 01359 o12_re += B2_re; 01360 o12_im += B2_im; 01361 01362 } 01363 else { 01364 // read gauge matrix from device memory 01365 READ_GAUGE_MATRIX(GAUGE1TEX, 7); 01366 01367 // read spinor from device memory 01368 READ_SPINOR_UP(SPINORTEX); 01369 01370 // reconstruct gauge matrix 01371 RECONSTRUCT_GAUGE_MATRIX(7); 01372 01373 // project spinor into half spinors 01374 spinorFloat a0_re = +2*i00_re; 01375 spinorFloat a0_im = +2*i00_im; 01376 spinorFloat a1_re = +2*i01_re; 01377 spinorFloat a1_im = +2*i01_im; 01378 spinorFloat a2_re = +2*i02_re; 01379 spinorFloat a2_im = +2*i02_im; 01380 01381 spinorFloat b0_re = +2*i10_re; 01382 spinorFloat b0_im = +2*i10_im; 01383 spinorFloat b1_re = +2*i11_re; 01384 spinorFloat b1_im = +2*i11_im; 01385 spinorFloat b2_re = +2*i12_re; 01386 spinorFloat b2_im = +2*i12_im; 01387 01388 // multiply row 0 01389 spinorFloat A0_re = 0; 01390 A0_re += gT00_re * a0_re; 01391 A0_re -= gT00_im * a0_im; 01392 A0_re += gT01_re * a1_re; 01393 A0_re -= gT01_im * a1_im; 01394 A0_re += gT02_re * a2_re; 01395 A0_re -= gT02_im * a2_im; 01396 spinorFloat A0_im = 0; 01397 A0_im += gT00_re * a0_im; 01398 A0_im += gT00_im * a0_re; 01399 A0_im += gT01_re * a1_im; 01400 A0_im += gT01_im * a1_re; 01401 A0_im += gT02_re * a2_im; 01402 A0_im += gT02_im * a2_re; 01403 spinorFloat B0_re = 0; 01404 B0_re += gT00_re * b0_re; 01405 B0_re -= gT00_im * b0_im; 01406 B0_re += gT01_re * b1_re; 01407 B0_re -= gT01_im * b1_im; 01408 B0_re += gT02_re * b2_re; 01409 B0_re -= gT02_im * b2_im; 01410 spinorFloat B0_im = 0; 01411 B0_im += gT00_re * b0_im; 01412 B0_im += gT00_im * b0_re; 01413 B0_im += gT01_re * b1_im; 01414 B0_im += gT01_im * b1_re; 01415 B0_im += gT02_re * b2_im; 01416 B0_im += gT02_im * b2_re; 01417 01418 // multiply row 1 01419 spinorFloat A1_re = 0; 01420 A1_re += gT10_re * a0_re; 01421 A1_re -= gT10_im * a0_im; 01422 A1_re += gT11_re * a1_re; 01423 A1_re -= gT11_im * a1_im; 01424 A1_re += gT12_re * a2_re; 01425 A1_re -= gT12_im * a2_im; 01426 spinorFloat A1_im = 0; 01427 A1_im += gT10_re * a0_im; 01428 A1_im += gT10_im * a0_re; 01429 A1_im += gT11_re * a1_im; 01430 A1_im += gT11_im * a1_re; 01431 A1_im += gT12_re * a2_im; 01432 A1_im += gT12_im * a2_re; 01433 spinorFloat B1_re = 0; 01434 B1_re += gT10_re * b0_re; 01435 B1_re -= gT10_im * b0_im; 01436 B1_re += gT11_re * b1_re; 01437 B1_re -= gT11_im * b1_im; 01438 B1_re += gT12_re * b2_re; 01439 B1_re -= gT12_im * b2_im; 01440 spinorFloat B1_im = 0; 01441 B1_im += gT10_re * b0_im; 01442 B1_im += gT10_im * b0_re; 01443 B1_im += gT11_re * b1_im; 01444 B1_im += gT11_im * b1_re; 01445 B1_im += gT12_re * b2_im; 01446 B1_im += gT12_im * b2_re; 01447 01448 // multiply row 2 01449 spinorFloat A2_re = 0; 01450 A2_re += gT20_re * a0_re; 01451 A2_re -= gT20_im * a0_im; 01452 A2_re += gT21_re * a1_re; 01453 A2_re -= gT21_im * a1_im; 01454 A2_re += gT22_re * a2_re; 01455 A2_re -= gT22_im * a2_im; 01456 spinorFloat A2_im = 0; 01457 A2_im += gT20_re * a0_im; 01458 A2_im += gT20_im * a0_re; 01459 A2_im += gT21_re * a1_im; 01460 A2_im += gT21_im * a1_re; 01461 A2_im += gT22_re * a2_im; 01462 A2_im += gT22_im * a2_re; 01463 spinorFloat B2_re = 0; 01464 B2_re += gT20_re * b0_re; 01465 B2_re -= gT20_im * b0_im; 01466 B2_re += gT21_re * b1_re; 01467 B2_re -= gT21_im * b1_im; 01468 B2_re += gT22_re * b2_re; 01469 B2_re -= gT22_im * b2_im; 01470 spinorFloat B2_im = 0; 01471 B2_im += gT20_re * b0_im; 01472 B2_im += gT20_im * b0_re; 01473 B2_im += gT21_re * b1_im; 01474 B2_im += gT21_im * b1_re; 01475 B2_im += gT22_re * b2_im; 01476 B2_im += gT22_im * b2_re; 01477 01478 o00_re += A0_re; 01479 o00_im += A0_im; 01480 o10_re += B0_re; 01481 o10_im += B0_im; 01482 01483 o01_re += A1_re; 01484 o01_im += A1_im; 01485 o11_re += B1_re; 01486 o11_im += B1_im; 01487 01488 o02_re += A2_re; 01489 o02_im += A2_im; 01490 o12_re += B2_re; 01491 o12_im += B2_im; 01492 01493 } 01494 } 01495 01496 { 01497 // apply twisted mass rotation 01498 volatile spinorFloat tmp00_re = +o00_re-o20_im*a; 01499 volatile spinorFloat tmp00_im = +o00_im+o20_re*a; 01500 volatile spinorFloat tmp01_re = +o01_re-o21_im*a; 01501 volatile spinorFloat tmp01_im = +o01_im+o21_re*a; 01502 volatile spinorFloat tmp02_re = +o02_re-o22_im*a; 01503 volatile spinorFloat tmp02_im = +o02_im+o22_re*a; 01504 01505 volatile spinorFloat tmp10_re = +o10_re-o30_im*a; 01506 volatile spinorFloat tmp10_im = +o10_im+o30_re*a; 01507 volatile spinorFloat tmp11_re = +o11_re-o31_im*a; 01508 volatile spinorFloat tmp11_im = +o11_im+o31_re*a; 01509 volatile spinorFloat tmp12_re = +o12_re-o32_im*a; 01510 volatile spinorFloat tmp12_im = +o12_im+o32_re*a; 01511 01512 volatile spinorFloat tmp20_re = -o00_im*a+o20_re; 01513 volatile spinorFloat tmp20_im = +o00_re*a+o20_im; 01514 volatile spinorFloat tmp21_re = -o01_im*a+o21_re; 01515 volatile spinorFloat tmp21_im = +o01_re*a+o21_im; 01516 volatile spinorFloat tmp22_re = -o02_im*a+o22_re; 01517 volatile spinorFloat tmp22_im = +o02_re*a+o22_im; 01518 01519 volatile spinorFloat tmp30_re = -o10_im*a+o30_re; 01520 volatile spinorFloat tmp30_im = +o10_re*a+o30_im; 01521 volatile spinorFloat tmp31_re = -o11_im*a+o31_re; 01522 volatile spinorFloat tmp31_im = +o11_re*a+o31_im; 01523 volatile spinorFloat tmp32_re = -o12_im*a+o32_re; 01524 volatile spinorFloat tmp32_im = +o12_re*a+o32_im; 01525 01526 01527 #ifndef DSLASH_XPAY 01528 //scale by b = 1/(1 + a*a) 01529 o00_re = b*tmp00_re; 01530 o00_im = b*tmp00_im; 01531 o01_re = b*tmp01_re; 01532 o01_im = b*tmp01_im; 01533 o02_re = b*tmp02_re; 01534 o02_im = b*tmp02_im; 01535 o10_re = b*tmp10_re; 01536 o10_im = b*tmp10_im; 01537 o11_re = b*tmp11_re; 01538 o11_im = b*tmp11_im; 01539 o12_re = b*tmp12_re; 01540 o12_im = b*tmp12_im; 01541 o20_re = b*tmp20_re; 01542 o20_im = b*tmp20_im; 01543 o21_re = b*tmp21_re; 01544 o21_im = b*tmp21_im; 01545 o22_re = b*tmp22_re; 01546 o22_im = b*tmp22_im; 01547 o30_re = b*tmp30_re; 01548 o30_im = b*tmp30_im; 01549 o31_re = b*tmp31_re; 01550 o31_im = b*tmp31_im; 01551 o32_re = b*tmp32_re; 01552 o32_im = b*tmp32_im; 01553 #else 01554 o00_re = tmp00_re; 01555 o00_im = tmp00_im; 01556 o01_re = tmp01_re; 01557 o01_im = tmp01_im; 01558 o02_re = tmp02_re; 01559 o02_im = tmp02_im; 01560 o10_re = tmp10_re; 01561 o10_im = tmp10_im; 01562 o11_re = tmp11_re; 01563 o11_im = tmp11_im; 01564 o12_re = tmp12_re; 01565 o12_im = tmp12_im; 01566 o20_re = tmp20_re; 01567 o20_im = tmp20_im; 01568 o21_re = tmp21_re; 01569 o21_im = tmp21_im; 01570 o22_re = tmp22_re; 01571 o22_im = tmp22_im; 01572 o30_re = tmp30_re; 01573 o30_im = tmp30_im; 01574 o31_re = tmp31_re; 01575 o31_im = tmp31_im; 01576 o32_re = tmp32_re; 01577 o32_im = tmp32_im; 01578 #endif // DSLASH_XPAY 01579 01580 } 01581 01582 01583 #ifdef DSLASH_XPAY 01584 READ_ACCUM(ACCUMTEX) 01585 #ifdef SPINOR_DOUBLE 01586 o00_re = b*o00_re + accum0.x; 01587 o00_im = b*o00_im + accum0.y; 01588 o01_re = b*o01_re + accum1.x; 01589 o01_im = b*o01_im + accum1.y; 01590 o02_re = b*o02_re + accum2.x; 01591 o02_im = b*o02_im + accum2.y; 01592 o10_re = b*o10_re + accum3.x; 01593 o10_im = b*o10_im + accum3.y; 01594 o11_re = b*o11_re + accum4.x; 01595 o11_im = b*o11_im + accum4.y; 01596 o12_re = b*o12_re + accum5.x; 01597 o12_im = b*o12_im + accum5.y; 01598 o20_re = b*o20_re + accum6.x; 01599 o20_im = b*o20_im + accum6.y; 01600 o21_re = b*o21_re + accum7.x; 01601 o21_im = b*o21_im + accum7.y; 01602 o22_re = b*o22_re + accum8.x; 01603 o22_im = b*o22_im + accum8.y; 01604 o30_re = b*o30_re + accum9.x; 01605 o30_im = b*o30_im + accum9.y; 01606 o31_re = b*o31_re + accum10.x; 01607 o31_im = b*o31_im + accum10.y; 01608 o32_re = b*o32_re + accum11.x; 01609 o32_im = b*o32_im + accum11.y; 01610 #else 01611 o00_re = b*o00_re + accum0.x; 01612 o00_im = b*o00_im + accum0.y; 01613 o01_re = b*o01_re + accum0.z; 01614 o01_im = b*o01_im + accum0.w; 01615 o02_re = b*o02_re + accum1.x; 01616 o02_im = b*o02_im + accum1.y; 01617 o10_re = b*o10_re + accum1.z; 01618 o10_im = b*o10_im + accum1.w; 01619 o11_re = b*o11_re + accum2.x; 01620 o11_im = b*o11_im + accum2.y; 01621 o12_re = b*o12_re + accum2.z; 01622 o12_im = b*o12_im + accum2.w; 01623 o20_re = b*o20_re + accum3.x; 01624 o20_im = b*o20_im + accum3.y; 01625 o21_re = b*o21_re + accum3.z; 01626 o21_im = b*o21_im + accum3.w; 01627 o22_re = b*o22_re + accum4.x; 01628 o22_im = b*o22_im + accum4.y; 01629 o30_re = b*o30_re + accum4.z; 01630 o30_im = b*o30_im + accum4.w; 01631 o31_re = b*o31_re + accum5.x; 01632 o31_im = b*o31_im + accum5.y; 01633 o32_re = b*o32_re + accum5.z; 01634 o32_im = b*o32_im + accum5.w; 01635 #endif // SPINOR_DOUBLE 01636 #endif // DSLASH_XPAY 01637 01638 01639 // write spinor field back to device memory 01640 WRITE_SPINOR(); 01641 01642 // undefine to prevent warning when precision is changed 01643 #undef spinorFloat 01644 #undef SHARED_STRIDE 01645 01646 #undef A_re 01647 #undef A_im 01648 01649 #undef g00_re 01650 #undef g00_im 01651 #undef g01_re 01652 #undef g01_im 01653 #undef g02_re 01654 #undef g02_im 01655 #undef g10_re 01656 #undef g10_im 01657 #undef g11_re 01658 #undef g11_im 01659 #undef g12_re 01660 #undef g12_im 01661 #undef g20_re 01662 #undef g20_im 01663 #undef g21_re 01664 #undef g21_im 01665 #undef g22_re 01666 #undef g22_im 01667 01668 #undef i00_re 01669 #undef i00_im 01670 #undef i01_re 01671 #undef i01_im 01672 #undef i02_re 01673 #undef i02_im 01674 #undef i10_re 01675 #undef i10_im 01676 #undef i11_re 01677 #undef i11_im 01678 #undef i12_re 01679 #undef i12_im 01680 #undef i20_re 01681 #undef i20_im 01682 #undef i21_re 01683 #undef i21_im 01684 #undef i22_re 01685 #undef i22_im 01686 #undef i30_re 01687 #undef i30_im 01688 #undef i31_re 01689 #undef i31_im 01690 #undef i32_re 01691 #undef i32_im 01692 01693 #undef o00_re 01694 #undef o00_im 01695 #undef o01_re 01696 #undef o01_im 01697 #undef o02_re 01698 #undef o02_im 01699 #undef o10_re 01700 #undef o10_im 01701
1.7.3