QUDA v0.4.0
A library for QCD on GPUs
|
00001 // *** CUDA DSLASH DAGGER *** 00002 00003 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 00004 00005 00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 00007 #define VOLATILE 00008 #else // Open64 compiler 00009 #define VOLATILE volatile 00010 #endif 00011 // input spinor 00012 #ifdef SPINOR_DOUBLE 00013 #define spinorFloat double 00014 #define i00_re I0.x 00015 #define i00_im I0.y 00016 #define i01_re I1.x 00017 #define i01_im I1.y 00018 #define i02_re I2.x 00019 #define i02_im I2.y 00020 #define i10_re I3.x 00021 #define i10_im I3.y 00022 #define i11_re I4.x 00023 #define i11_im I4.y 00024 #define i12_re I5.x 00025 #define i12_im I5.y 00026 #define i20_re I6.x 00027 #define i20_im I6.y 00028 #define i21_re I7.x 00029 #define i21_im I7.y 00030 #define i22_re I8.x 00031 #define i22_im I8.y 00032 #define i30_re I9.x 00033 #define i30_im I9.y 00034 #define i31_re I10.x 00035 #define i31_im I10.y 00036 #define i32_re I11.x 00037 #define i32_im I11.y 00038 #else 00039 #define spinorFloat float 00040 #define i00_re I0.x 00041 #define i00_im I0.y 00042 #define i01_re I0.z 00043 #define i01_im I0.w 00044 #define i02_re I1.x 00045 #define i02_im I1.y 00046 #define i10_re I1.z 00047 #define i10_im I1.w 00048 #define i11_re I2.x 00049 #define i11_im I2.y 00050 #define i12_re I2.z 00051 #define i12_im I2.w 00052 #define i20_re I3.x 00053 #define i20_im I3.y 00054 #define i21_re I3.z 00055 #define i21_im I3.w 00056 #define i22_re I4.x 00057 #define i22_im I4.y 00058 #define i30_re I4.z 00059 #define i30_im I4.w 00060 #define i31_re I5.x 00061 #define i31_im I5.y 00062 #define i32_re I5.z 00063 #define i32_im I5.w 00064 #endif // SPINOR_DOUBLE 00065 00066 // gauge link 00067 #ifdef GAUGE_FLOAT2 00068 #define g00_re G0.x 00069 #define g00_im G0.y 00070 #define g01_re G1.x 00071 #define g01_im G1.y 00072 #define g02_re G2.x 00073 #define g02_im G2.y 00074 #define g10_re G3.x 00075 #define g10_im G3.y 00076 #define g11_re G4.x 00077 #define g11_im G4.y 00078 #define g12_re G5.x 00079 #define g12_im G5.y 00080 #define g20_re G6.x 00081 #define g20_im G6.y 00082 #define g21_re G7.x 00083 #define g21_im G7.y 00084 #define g22_re G8.x 00085 #define g22_im G8.y 00086 // temporaries 00087 #define A_re G9.x 00088 #define A_im G9.y 00089 00090 #else 00091 #define g00_re G0.x 00092 #define g00_im G0.y 00093 #define g01_re G0.z 00094 #define g01_im G0.w 00095 #define g02_re G1.x 00096 #define g02_im G1.y 00097 #define g10_re G1.z 00098 #define g10_im G1.w 00099 #define g11_re G2.x 00100 #define g11_im G2.y 00101 #define g12_re G2.z 00102 #define g12_im G2.w 00103 #define g20_re G3.x 00104 #define g20_im G3.y 00105 #define g21_re G3.z 00106 #define g21_im G3.w 00107 #define g22_re G4.x 00108 #define g22_im G4.y 00109 // temporaries 00110 #define A_re G4.z 00111 #define A_im G4.w 00112 00113 #endif // GAUGE_DOUBLE 00114 00115 // conjugated gauge link 00116 #define gT00_re (+g00_re) 00117 #define gT00_im (-g00_im) 00118 #define gT01_re (+g10_re) 00119 #define gT01_im (-g10_im) 00120 #define gT02_re (+g20_re) 00121 #define gT02_im (-g20_im) 00122 #define gT10_re (+g01_re) 00123 #define gT10_im (-g01_im) 00124 #define gT11_re (+g11_re) 00125 #define gT11_im (-g11_im) 00126 #define gT12_re (+g21_re) 00127 #define gT12_im (-g21_im) 00128 #define gT20_re (+g02_re) 00129 #define gT20_im (-g02_im) 00130 #define gT21_re (+g12_re) 00131 #define gT21_im (-g12_im) 00132 #define gT22_re (+g22_re) 00133 #define gT22_im (-g22_im) 00134 00135 // output spinor 00136 VOLATILE spinorFloat o00_re; 00137 VOLATILE spinorFloat o00_im; 00138 VOLATILE spinorFloat o01_re; 00139 VOLATILE spinorFloat o01_im; 00140 VOLATILE spinorFloat o02_re; 00141 VOLATILE spinorFloat o02_im; 00142 VOLATILE spinorFloat o10_re; 00143 VOLATILE spinorFloat o10_im; 00144 VOLATILE spinorFloat o11_re; 00145 VOLATILE spinorFloat o11_im; 00146 VOLATILE spinorFloat o12_re; 00147 VOLATILE spinorFloat o12_im; 00148 VOLATILE spinorFloat o20_re; 00149 VOLATILE spinorFloat o20_im; 00150 VOLATILE spinorFloat o21_re; 00151 VOLATILE spinorFloat o21_im; 00152 VOLATILE spinorFloat o22_re; 00153 VOLATILE spinorFloat o22_im; 00154 VOLATILE spinorFloat o30_re; 00155 VOLATILE spinorFloat o30_im; 00156 VOLATILE spinorFloat o31_re; 00157 VOLATILE spinorFloat o31_im; 00158 VOLATILE spinorFloat o32_re; 00159 VOLATILE spinorFloat o32_im; 00160 00161 #include "read_gauge.h" 00162 #include "read_clover.h" 00163 #include "io_spinor.h" 00164 00165 int x1, x2, x3, x4; 00166 int X; 00167 00168 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision 00169 int sp_norm_idx; 00170 #endif // MULTI_GPU half precision 00171 00172 int sid; 00173 00174 #ifdef MULTI_GPU 00175 int face_idx; 00176 if (kernel_type == INTERIOR_KERNEL) { 00177 #endif 00178 00179 sid = blockIdx.x*blockDim.x + threadIdx.x; 00180 if (sid >= param.threads) return; 00181 00182 // Inline by hand for the moment and assume even dimensions 00183 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity); 00184 00185 X = 2*sid; 00186 int aux1 = X / X1; 00187 x1 = X - aux1 * X1; 00188 int aux2 = aux1 / X2; 00189 x2 = aux1 - aux2 * X2; 00190 x4 = aux2 / X3; 00191 x3 = aux2 - x4 * X3; 00192 aux1 = (param.parity + x4 + x3 + x2) & 1; 00193 x1 += aux1; 00194 X += aux1; 00195 00196 o00_re = 0; o00_im = 0; 00197 o01_re = 0; o01_im = 0; 00198 o02_re = 0; o02_im = 0; 00199 o10_re = 0; o10_im = 0; 00200 o11_re = 0; o11_im = 0; 00201 o12_re = 0; o12_im = 0; 00202 o20_re = 0; o20_im = 0; 00203 o21_re = 0; o21_im = 0; 00204 o22_re = 0; o22_im = 0; 00205 o30_re = 0; o30_im = 0; 00206 o31_re = 0; o31_im = 0; 00207 o32_re = 0; o32_im = 0; 00208 00209 #ifdef MULTI_GPU 00210 } else { // exterior kernel 00211 00212 sid = blockIdx.x*blockDim.x + threadIdx.x; 00213 if (sid >= param.threads) return; 00214 00215 const int dim = static_cast<int>(kernel_type); 00216 const int face_volume = (param.threads >> 1); // volume of one face 00217 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 00218 face_idx = sid - face_num*face_volume; // index into the respective face 00219 00220 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 00221 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 00222 //sp_idx = face_idx + param.ghostOffset[dim]; 00223 00224 #if (DD_PREC==2) // half precision 00225 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)]; 00226 #endif 00227 00228 coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity); 00229 00230 READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid); 00231 00232 o00_re = i00_re; o00_im = i00_im; 00233 o01_re = i01_re; o01_im = i01_im; 00234 o02_re = i02_re; o02_im = i02_im; 00235 o10_re = i10_re; o10_im = i10_im; 00236 o11_re = i11_re; o11_im = i11_im; 00237 o12_re = i12_re; o12_im = i12_im; 00238 o20_re = i20_re; o20_im = i20_im; 00239 o21_re = i21_re; o21_im = i21_im; 00240 o22_re = i22_re; o22_im = i22_im; 00241 o30_re = i30_re; o30_im = i30_im; 00242 o31_re = i31_re; o31_im = i31_im; 00243 o32_re = i32_re; o32_im = i32_im; 00244 } 00245 #endif // MULTI_GPU 00246 00247 00248 #ifdef MULTI_GPU 00249 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) || 00250 (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) ) 00251 #endif 00252 { 00253 // Projector P0+ 00254 // 1 0 0 i 00255 // 0 1 i 0 00256 // 0 -i 1 0 00257 // -i 0 0 1 00258 00259 #ifdef MULTI_GPU 00260 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 : 00261 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00262 #else 00263 const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1; 00264 #endif 00265 00266 const int ga_idx = sid; 00267 00268 spinorFloat a0_re, a0_im; 00269 spinorFloat a1_re, a1_im; 00270 spinorFloat a2_re, a2_im; 00271 spinorFloat b0_re, b0_im; 00272 spinorFloat b1_re, b1_im; 00273 spinorFloat b2_re, b2_im; 00274 00275 #ifdef MULTI_GPU 00276 if (kernel_type == INTERIOR_KERNEL) { 00277 #endif 00278 00279 // read spinor from device memory 00280 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00281 00282 // project spinor into half spinors 00283 a0_re = +i00_re-i30_im; 00284 a0_im = +i00_im+i30_re; 00285 a1_re = +i01_re-i31_im; 00286 a1_im = +i01_im+i31_re; 00287 a2_re = +i02_re-i32_im; 00288 a2_im = +i02_im+i32_re; 00289 b0_re = +i10_re-i20_im; 00290 b0_im = +i10_im+i20_re; 00291 b1_re = +i11_re-i21_im; 00292 b1_im = +i11_im+i21_re; 00293 b2_re = +i12_re-i22_im; 00294 b2_im = +i12_im+i22_re; 00295 00296 #ifdef MULTI_GPU 00297 } else { 00298 00299 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00300 00301 // read half spinor from device memory 00302 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00303 00304 a0_re = i00_re; a0_im = i00_im; 00305 a1_re = i01_re; a1_im = i01_im; 00306 a2_re = i02_re; a2_im = i02_im; 00307 b0_re = i10_re; b0_im = i10_im; 00308 b1_re = i11_re; b1_im = i11_im; 00309 b2_re = i12_re; b2_im = i12_im; 00310 00311 } 00312 #endif // MULTI_GPU 00313 00314 // read gauge matrix from device memory 00315 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00316 00317 // reconstruct gauge matrix 00318 RECONSTRUCT_GAUGE_MATRIX(0); 00319 00320 // multiply row 0 00321 spinorFloat A0_re = 0; 00322 A0_re += g00_re * a0_re; 00323 A0_re -= g00_im * a0_im; 00324 A0_re += g01_re * a1_re; 00325 A0_re -= g01_im * a1_im; 00326 A0_re += g02_re * a2_re; 00327 A0_re -= g02_im * a2_im; 00328 spinorFloat A0_im = 0; 00329 A0_im += g00_re * a0_im; 00330 A0_im += g00_im * a0_re; 00331 A0_im += g01_re * a1_im; 00332 A0_im += g01_im * a1_re; 00333 A0_im += g02_re * a2_im; 00334 A0_im += g02_im * a2_re; 00335 spinorFloat B0_re = 0; 00336 B0_re += g00_re * b0_re; 00337 B0_re -= g00_im * b0_im; 00338 B0_re += g01_re * b1_re; 00339 B0_re -= g01_im * b1_im; 00340 B0_re += g02_re * b2_re; 00341 B0_re -= g02_im * b2_im; 00342 spinorFloat B0_im = 0; 00343 B0_im += g00_re * b0_im; 00344 B0_im += g00_im * b0_re; 00345 B0_im += g01_re * b1_im; 00346 B0_im += g01_im * b1_re; 00347 B0_im += g02_re * b2_im; 00348 B0_im += g02_im * b2_re; 00349 00350 // multiply row 1 00351 spinorFloat A1_re = 0; 00352 A1_re += g10_re * a0_re; 00353 A1_re -= g10_im * a0_im; 00354 A1_re += g11_re * a1_re; 00355 A1_re -= g11_im * a1_im; 00356 A1_re += g12_re * a2_re; 00357 A1_re -= g12_im * a2_im; 00358 spinorFloat A1_im = 0; 00359 A1_im += g10_re * a0_im; 00360 A1_im += g10_im * a0_re; 00361 A1_im += g11_re * a1_im; 00362 A1_im += g11_im * a1_re; 00363 A1_im += g12_re * a2_im; 00364 A1_im += g12_im * a2_re; 00365 spinorFloat B1_re = 0; 00366 B1_re += g10_re * b0_re; 00367 B1_re -= g10_im * b0_im; 00368 B1_re += g11_re * b1_re; 00369 B1_re -= g11_im * b1_im; 00370 B1_re += g12_re * b2_re; 00371 B1_re -= g12_im * b2_im; 00372 spinorFloat B1_im = 0; 00373 B1_im += g10_re * b0_im; 00374 B1_im += g10_im * b0_re; 00375 B1_im += g11_re * b1_im; 00376 B1_im += g11_im * b1_re; 00377 B1_im += g12_re * b2_im; 00378 B1_im += g12_im * b2_re; 00379 00380 // multiply row 2 00381 spinorFloat A2_re = 0; 00382 A2_re += g20_re * a0_re; 00383 A2_re -= g20_im * a0_im; 00384 A2_re += g21_re * a1_re; 00385 A2_re -= g21_im * a1_im; 00386 A2_re += g22_re * a2_re; 00387 A2_re -= g22_im * a2_im; 00388 spinorFloat A2_im = 0; 00389 A2_im += g20_re * a0_im; 00390 A2_im += g20_im * a0_re; 00391 A2_im += g21_re * a1_im; 00392 A2_im += g21_im * a1_re; 00393 A2_im += g22_re * a2_im; 00394 A2_im += g22_im * a2_re; 00395 spinorFloat B2_re = 0; 00396 B2_re += g20_re * b0_re; 00397 B2_re -= g20_im * b0_im; 00398 B2_re += g21_re * b1_re; 00399 B2_re -= g21_im * b1_im; 00400 B2_re += g22_re * b2_re; 00401 B2_re -= g22_im * b2_im; 00402 spinorFloat B2_im = 0; 00403 B2_im += g20_re * b0_im; 00404 B2_im += g20_im * b0_re; 00405 B2_im += g21_re * b1_im; 00406 B2_im += g21_im * b1_re; 00407 B2_im += g22_re * b2_im; 00408 B2_im += g22_im * b2_re; 00409 00410 o00_re += A0_re; 00411 o00_im += A0_im; 00412 o10_re += B0_re; 00413 o10_im += B0_im; 00414 o20_re += B0_im; 00415 o20_im -= B0_re; 00416 o30_re += A0_im; 00417 o30_im -= A0_re; 00418 00419 o01_re += A1_re; 00420 o01_im += A1_im; 00421 o11_re += B1_re; 00422 o11_im += B1_im; 00423 o21_re += B1_im; 00424 o21_im -= B1_re; 00425 o31_re += A1_im; 00426 o31_im -= A1_re; 00427 00428 o02_re += A2_re; 00429 o02_im += A2_im; 00430 o12_re += B2_re; 00431 o12_im += B2_im; 00432 o22_re += B2_im; 00433 o22_im -= B2_re; 00434 o32_re += A2_im; 00435 o32_im -= A2_re; 00436 00437 } 00438 00439 #ifdef MULTI_GPU 00440 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) || 00441 (kernel_type == EXTERIOR_KERNEL_X && x1==0) ) 00442 #endif 00443 { 00444 // Projector P0- 00445 // 1 0 0 -i 00446 // 0 1 -i 0 00447 // 0 i 1 0 00448 // i 0 0 1 00449 00450 #ifdef MULTI_GPU 00451 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 : 00452 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00453 #else 00454 const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1; 00455 #endif 00456 00457 #ifdef MULTI_GPU 00458 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 00459 #else 00460 const int ga_idx = sp_idx; 00461 #endif 00462 00463 spinorFloat a0_re, a0_im; 00464 spinorFloat a1_re, a1_im; 00465 spinorFloat a2_re, a2_im; 00466 spinorFloat b0_re, b0_im; 00467 spinorFloat b1_re, b1_im; 00468 spinorFloat b2_re, b2_im; 00469 00470 #ifdef MULTI_GPU 00471 if (kernel_type == INTERIOR_KERNEL) { 00472 #endif 00473 00474 // read spinor from device memory 00475 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00476 00477 // project spinor into half spinors 00478 a0_re = +i00_re+i30_im; 00479 a0_im = +i00_im-i30_re; 00480 a1_re = +i01_re+i31_im; 00481 a1_im = +i01_im-i31_re; 00482 a2_re = +i02_re+i32_im; 00483 a2_im = +i02_im-i32_re; 00484 b0_re = +i10_re+i20_im; 00485 b0_im = +i10_im-i20_re; 00486 b1_re = +i11_re+i21_im; 00487 b1_im = +i11_im-i21_re; 00488 b2_re = +i12_re+i22_im; 00489 b2_im = +i12_im-i22_re; 00490 00491 #ifdef MULTI_GPU 00492 } else { 00493 00494 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00495 00496 // read half spinor from device memory 00497 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00498 00499 a0_re = i00_re; a0_im = i00_im; 00500 a1_re = i01_re; a1_im = i01_im; 00501 a2_re = i02_re; a2_im = i02_im; 00502 b0_re = i10_re; b0_im = i10_im; 00503 b1_re = i11_re; b1_im = i11_im; 00504 b2_re = i12_re; b2_im = i12_im; 00505 00506 } 00507 #endif // MULTI_GPU 00508 00509 // read gauge matrix from device memory 00510 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00511 00512 // reconstruct gauge matrix 00513 RECONSTRUCT_GAUGE_MATRIX(1); 00514 00515 // multiply row 0 00516 spinorFloat A0_re = 0; 00517 A0_re += gT00_re * a0_re; 00518 A0_re -= gT00_im * a0_im; 00519 A0_re += gT01_re * a1_re; 00520 A0_re -= gT01_im * a1_im; 00521 A0_re += gT02_re * a2_re; 00522 A0_re -= gT02_im * a2_im; 00523 spinorFloat A0_im = 0; 00524 A0_im += gT00_re * a0_im; 00525 A0_im += gT00_im * a0_re; 00526 A0_im += gT01_re * a1_im; 00527 A0_im += gT01_im * a1_re; 00528 A0_im += gT02_re * a2_im; 00529 A0_im += gT02_im * a2_re; 00530 spinorFloat B0_re = 0; 00531 B0_re += gT00_re * b0_re; 00532 B0_re -= gT00_im * b0_im; 00533 B0_re += gT01_re * b1_re; 00534 B0_re -= gT01_im * b1_im; 00535 B0_re += gT02_re * b2_re; 00536 B0_re -= gT02_im * b2_im; 00537 spinorFloat B0_im = 0; 00538 B0_im += gT00_re * b0_im; 00539 B0_im += gT00_im * b0_re; 00540 B0_im += gT01_re * b1_im; 00541 B0_im += gT01_im * b1_re; 00542 B0_im += gT02_re * b2_im; 00543 B0_im += gT02_im * b2_re; 00544 00545 // multiply row 1 00546 spinorFloat A1_re = 0; 00547 A1_re += gT10_re * a0_re; 00548 A1_re -= gT10_im * a0_im; 00549 A1_re += gT11_re * a1_re; 00550 A1_re -= gT11_im * a1_im; 00551 A1_re += gT12_re * a2_re; 00552 A1_re -= gT12_im * a2_im; 00553 spinorFloat A1_im = 0; 00554 A1_im += gT10_re * a0_im; 00555 A1_im += gT10_im * a0_re; 00556 A1_im += gT11_re * a1_im; 00557 A1_im += gT11_im * a1_re; 00558 A1_im += gT12_re * a2_im; 00559 A1_im += gT12_im * a2_re; 00560 spinorFloat B1_re = 0; 00561 B1_re += gT10_re * b0_re; 00562 B1_re -= gT10_im * b0_im; 00563 B1_re += gT11_re * b1_re; 00564 B1_re -= gT11_im * b1_im; 00565 B1_re += gT12_re * b2_re; 00566 B1_re -= gT12_im * b2_im; 00567 spinorFloat B1_im = 0; 00568 B1_im += gT10_re * b0_im; 00569 B1_im += gT10_im * b0_re; 00570 B1_im += gT11_re * b1_im; 00571 B1_im += gT11_im * b1_re; 00572 B1_im += gT12_re * b2_im; 00573 B1_im += gT12_im * b2_re; 00574 00575 // multiply row 2 00576 spinorFloat A2_re = 0; 00577 A2_re += gT20_re * a0_re; 00578 A2_re -= gT20_im * a0_im; 00579 A2_re += gT21_re * a1_re; 00580 A2_re -= gT21_im * a1_im; 00581 A2_re += gT22_re * a2_re; 00582 A2_re -= gT22_im * a2_im; 00583 spinorFloat A2_im = 0; 00584 A2_im += gT20_re * a0_im; 00585 A2_im += gT20_im * a0_re; 00586 A2_im += gT21_re * a1_im; 00587 A2_im += gT21_im * a1_re; 00588 A2_im += gT22_re * a2_im; 00589 A2_im += gT22_im * a2_re; 00590 spinorFloat B2_re = 0; 00591 B2_re += gT20_re * b0_re; 00592 B2_re -= gT20_im * b0_im; 00593 B2_re += gT21_re * b1_re; 00594 B2_re -= gT21_im * b1_im; 00595 B2_re += gT22_re * b2_re; 00596 B2_re -= gT22_im * b2_im; 00597 spinorFloat B2_im = 0; 00598 B2_im += gT20_re * b0_im; 00599 B2_im += gT20_im * b0_re; 00600 B2_im += gT21_re * b1_im; 00601 B2_im += gT21_im * b1_re; 00602 B2_im += gT22_re * b2_im; 00603 B2_im += gT22_im * b2_re; 00604 00605 o00_re += A0_re; 00606 o00_im += A0_im; 00607 o10_re += B0_re; 00608 o10_im += B0_im; 00609 o20_re -= B0_im; 00610 o20_im += B0_re; 00611 o30_re -= A0_im; 00612 o30_im += A0_re; 00613 00614 o01_re += A1_re; 00615 o01_im += A1_im; 00616 o11_re += B1_re; 00617 o11_im += B1_im; 00618 o21_re -= B1_im; 00619 o21_im += B1_re; 00620 o31_re -= A1_im; 00621 o31_im += A1_re; 00622 00623 o02_re += A2_re; 00624 o02_im += A2_im; 00625 o12_re += B2_re; 00626 o12_im += B2_im; 00627 o22_re -= B2_im; 00628 o22_im += B2_re; 00629 o32_re -= A2_im; 00630 o32_im += A2_re; 00631 00632 } 00633 00634 #ifdef MULTI_GPU 00635 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) || 00636 (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) ) 00637 #endif 00638 { 00639 // Projector P1+ 00640 // 1 0 0 1 00641 // 0 1 -1 0 00642 // 0 -1 1 0 00643 // 1 0 0 1 00644 00645 #ifdef MULTI_GPU 00646 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 : 00647 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00648 #else 00649 const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1; 00650 #endif 00651 00652 const int ga_idx = sid; 00653 00654 spinorFloat a0_re, a0_im; 00655 spinorFloat a1_re, a1_im; 00656 spinorFloat a2_re, a2_im; 00657 spinorFloat b0_re, b0_im; 00658 spinorFloat b1_re, b1_im; 00659 spinorFloat b2_re, b2_im; 00660 00661 #ifdef MULTI_GPU 00662 if (kernel_type == INTERIOR_KERNEL) { 00663 #endif 00664 00665 // read spinor from device memory 00666 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00667 00668 // project spinor into half spinors 00669 a0_re = +i00_re+i30_re; 00670 a0_im = +i00_im+i30_im; 00671 a1_re = +i01_re+i31_re; 00672 a1_im = +i01_im+i31_im; 00673 a2_re = +i02_re+i32_re; 00674 a2_im = +i02_im+i32_im; 00675 b0_re = +i10_re-i20_re; 00676 b0_im = +i10_im-i20_im; 00677 b1_re = +i11_re-i21_re; 00678 b1_im = +i11_im-i21_im; 00679 b2_re = +i12_re-i22_re; 00680 b2_im = +i12_im-i22_im; 00681 00682 #ifdef MULTI_GPU 00683 } else { 00684 00685 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00686 00687 // read half spinor from device memory 00688 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00689 00690 a0_re = i00_re; a0_im = i00_im; 00691 a1_re = i01_re; a1_im = i01_im; 00692 a2_re = i02_re; a2_im = i02_im; 00693 b0_re = i10_re; b0_im = i10_im; 00694 b1_re = i11_re; b1_im = i11_im; 00695 b2_re = i12_re; b2_im = i12_im; 00696 00697 } 00698 #endif // MULTI_GPU 00699 00700 // read gauge matrix from device memory 00701 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00702 00703 // reconstruct gauge matrix 00704 RECONSTRUCT_GAUGE_MATRIX(2); 00705 00706 // multiply row 0 00707 spinorFloat A0_re = 0; 00708 A0_re += g00_re * a0_re; 00709 A0_re -= g00_im * a0_im; 00710 A0_re += g01_re * a1_re; 00711 A0_re -= g01_im * a1_im; 00712 A0_re += g02_re * a2_re; 00713 A0_re -= g02_im * a2_im; 00714 spinorFloat A0_im = 0; 00715 A0_im += g00_re * a0_im; 00716 A0_im += g00_im * a0_re; 00717 A0_im += g01_re * a1_im; 00718 A0_im += g01_im * a1_re; 00719 A0_im += g02_re * a2_im; 00720 A0_im += g02_im * a2_re; 00721 spinorFloat B0_re = 0; 00722 B0_re += g00_re * b0_re; 00723 B0_re -= g00_im * b0_im; 00724 B0_re += g01_re * b1_re; 00725 B0_re -= g01_im * b1_im; 00726 B0_re += g02_re * b2_re; 00727 B0_re -= g02_im * b2_im; 00728 spinorFloat B0_im = 0; 00729 B0_im += g00_re * b0_im; 00730 B0_im += g00_im * b0_re; 00731 B0_im += g01_re * b1_im; 00732 B0_im += g01_im * b1_re; 00733 B0_im += g02_re * b2_im; 00734 B0_im += g02_im * b2_re; 00735 00736 // multiply row 1 00737 spinorFloat A1_re = 0; 00738 A1_re += g10_re * a0_re; 00739 A1_re -= g10_im * a0_im; 00740 A1_re += g11_re * a1_re; 00741 A1_re -= g11_im * a1_im; 00742 A1_re += g12_re * a2_re; 00743 A1_re -= g12_im * a2_im; 00744 spinorFloat A1_im = 0; 00745 A1_im += g10_re * a0_im; 00746 A1_im += g10_im * a0_re; 00747 A1_im += g11_re * a1_im; 00748 A1_im += g11_im * a1_re; 00749 A1_im += g12_re * a2_im; 00750 A1_im += g12_im * a2_re; 00751 spinorFloat B1_re = 0; 00752 B1_re += g10_re * b0_re; 00753 B1_re -= g10_im * b0_im; 00754 B1_re += g11_re * b1_re; 00755 B1_re -= g11_im * b1_im; 00756 B1_re += g12_re * b2_re; 00757 B1_re -= g12_im * b2_im; 00758 spinorFloat B1_im = 0; 00759 B1_im += g10_re * b0_im; 00760 B1_im += g10_im * b0_re; 00761 B1_im += g11_re * b1_im; 00762 B1_im += g11_im * b1_re; 00763 B1_im += g12_re * b2_im; 00764 B1_im += g12_im * b2_re; 00765 00766 // multiply row 2 00767 spinorFloat A2_re = 0; 00768 A2_re += g20_re * a0_re; 00769 A2_re -= g20_im * a0_im; 00770 A2_re += g21_re * a1_re; 00771 A2_re -= g21_im * a1_im; 00772 A2_re += g22_re * a2_re; 00773 A2_re -= g22_im * a2_im; 00774 spinorFloat A2_im = 0; 00775 A2_im += g20_re * a0_im; 00776 A2_im += g20_im * a0_re; 00777 A2_im += g21_re * a1_im; 00778 A2_im += g21_im * a1_re; 00779 A2_im += g22_re * a2_im; 00780 A2_im += g22_im * a2_re; 00781 spinorFloat B2_re = 0; 00782 B2_re += g20_re * b0_re; 00783 B2_re -= g20_im * b0_im; 00784 B2_re += g21_re * b1_re; 00785 B2_re -= g21_im * b1_im; 00786 B2_re += g22_re * b2_re; 00787 B2_re -= g22_im * b2_im; 00788 spinorFloat B2_im = 0; 00789 B2_im += g20_re * b0_im; 00790 B2_im += g20_im * b0_re; 00791 B2_im += g21_re * b1_im; 00792 B2_im += g21_im * b1_re; 00793 B2_im += g22_re * b2_im; 00794 B2_im += g22_im * b2_re; 00795 00796 o00_re += A0_re; 00797 o00_im += A0_im; 00798 o10_re += B0_re; 00799 o10_im += B0_im; 00800 o20_re -= B0_re; 00801 o20_im -= B0_im; 00802 o30_re += A0_re; 00803 o30_im += A0_im; 00804 00805 o01_re += A1_re; 00806 o01_im += A1_im; 00807 o11_re += B1_re; 00808 o11_im += B1_im; 00809 o21_re -= B1_re; 00810 o21_im -= B1_im; 00811 o31_re += A1_re; 00812 o31_im += A1_im; 00813 00814 o02_re += A2_re; 00815 o02_im += A2_im; 00816 o12_re += B2_re; 00817 o12_im += B2_im; 00818 o22_re -= B2_re; 00819 o22_im -= B2_im; 00820 o32_re += A2_re; 00821 o32_im += A2_im; 00822 00823 } 00824 00825 #ifdef MULTI_GPU 00826 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) || 00827 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) ) 00828 #endif 00829 { 00830 // Projector P1- 00831 // 1 0 0 -1 00832 // 0 1 1 0 00833 // 0 1 1 0 00834 // -1 0 0 1 00835 00836 #ifdef MULTI_GPU 00837 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 : 00838 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00839 #else 00840 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1; 00841 #endif 00842 00843 #ifdef MULTI_GPU 00844 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 00845 #else 00846 const int ga_idx = sp_idx; 00847 #endif 00848 00849 spinorFloat a0_re, a0_im; 00850 spinorFloat a1_re, a1_im; 00851 spinorFloat a2_re, a2_im; 00852 spinorFloat b0_re, b0_im; 00853 spinorFloat b1_re, b1_im; 00854 spinorFloat b2_re, b2_im; 00855 00856 #ifdef MULTI_GPU 00857 if (kernel_type == INTERIOR_KERNEL) { 00858 #endif 00859 00860 // read spinor from device memory 00861 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00862 00863 // project spinor into half spinors 00864 a0_re = +i00_re-i30_re; 00865 a0_im = +i00_im-i30_im; 00866 a1_re = +i01_re-i31_re; 00867 a1_im = +i01_im-i31_im; 00868 a2_re = +i02_re-i32_re; 00869 a2_im = +i02_im-i32_im; 00870 b0_re = +i10_re+i20_re; 00871 b0_im = +i10_im+i20_im; 00872 b1_re = +i11_re+i21_re; 00873 b1_im = +i11_im+i21_im; 00874 b2_re = +i12_re+i22_re; 00875 b2_im = +i12_im+i22_im; 00876 00877 #ifdef MULTI_GPU 00878 } else { 00879 00880 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00881 00882 // read half spinor from device memory 00883 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00884 00885 a0_re = i00_re; a0_im = i00_im; 00886 a1_re = i01_re; a1_im = i01_im; 00887 a2_re = i02_re; a2_im = i02_im; 00888 b0_re = i10_re; b0_im = i10_im; 00889 b1_re = i11_re; b1_im = i11_im; 00890 b2_re = i12_re; b2_im = i12_im; 00891 00892 } 00893 #endif // MULTI_GPU 00894 00895 // read gauge matrix from device memory 00896 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 00897 00898 // reconstruct gauge matrix 00899 RECONSTRUCT_GAUGE_MATRIX(3); 00900 00901 // multiply row 0 00902 spinorFloat A0_re = 0; 00903 A0_re += gT00_re * a0_re; 00904 A0_re -= gT00_im * a0_im; 00905 A0_re += gT01_re * a1_re; 00906 A0_re -= gT01_im * a1_im; 00907 A0_re += gT02_re * a2_re; 00908 A0_re -= gT02_im * a2_im; 00909 spinorFloat A0_im = 0; 00910 A0_im += gT00_re * a0_im; 00911 A0_im += gT00_im * a0_re; 00912 A0_im += gT01_re * a1_im; 00913 A0_im += gT01_im * a1_re; 00914 A0_im += gT02_re * a2_im; 00915 A0_im += gT02_im * a2_re; 00916 spinorFloat B0_re = 0; 00917 B0_re += gT00_re * b0_re; 00918 B0_re -= gT00_im * b0_im; 00919 B0_re += gT01_re * b1_re; 00920 B0_re -= gT01_im * b1_im; 00921 B0_re += gT02_re * b2_re; 00922 B0_re -= gT02_im * b2_im; 00923 spinorFloat B0_im = 0; 00924 B0_im += gT00_re * b0_im; 00925 B0_im += gT00_im * b0_re; 00926 B0_im += gT01_re * b1_im; 00927 B0_im += gT01_im * b1_re; 00928 B0_im += gT02_re * b2_im; 00929 B0_im += gT02_im * b2_re; 00930 00931 // multiply row 1 00932 spinorFloat A1_re = 0; 00933 A1_re += gT10_re * a0_re; 00934 A1_re -= gT10_im * a0_im; 00935 A1_re += gT11_re * a1_re; 00936 A1_re -= gT11_im * a1_im; 00937 A1_re += gT12_re * a2_re; 00938 A1_re -= gT12_im * a2_im; 00939 spinorFloat A1_im = 0; 00940 A1_im += gT10_re * a0_im; 00941 A1_im += gT10_im * a0_re; 00942 A1_im += gT11_re * a1_im; 00943 A1_im += gT11_im * a1_re; 00944 A1_im += gT12_re * a2_im; 00945 A1_im += gT12_im * a2_re; 00946 spinorFloat B1_re = 0; 00947 B1_re += gT10_re * b0_re; 00948 B1_re -= gT10_im * b0_im; 00949 B1_re += gT11_re * b1_re; 00950 B1_re -= gT11_im * b1_im; 00951 B1_re += gT12_re * b2_re; 00952 B1_re -= gT12_im * b2_im; 00953 spinorFloat B1_im = 0; 00954 B1_im += gT10_re * b0_im; 00955 B1_im += gT10_im * b0_re; 00956 B1_im += gT11_re * b1_im; 00957 B1_im += gT11_im * b1_re; 00958 B1_im += gT12_re * b2_im; 00959 B1_im += gT12_im * b2_re; 00960 00961 // multiply row 2 00962 spinorFloat A2_re = 0; 00963 A2_re += gT20_re * a0_re; 00964 A2_re -= gT20_im * a0_im; 00965 A2_re += gT21_re * a1_re; 00966 A2_re -= gT21_im * a1_im; 00967 A2_re += gT22_re * a2_re; 00968 A2_re -= gT22_im * a2_im; 00969 spinorFloat A2_im = 0; 00970 A2_im += gT20_re * a0_im; 00971 A2_im += gT20_im * a0_re; 00972 A2_im += gT21_re * a1_im; 00973 A2_im += gT21_im * a1_re; 00974 A2_im += gT22_re * a2_im; 00975 A2_im += gT22_im * a2_re; 00976 spinorFloat B2_re = 0; 00977 B2_re += gT20_re * b0_re; 00978 B2_re -= gT20_im * b0_im; 00979 B2_re += gT21_re * b1_re; 00980 B2_re -= gT21_im * b1_im; 00981 B2_re += gT22_re * b2_re; 00982 B2_re -= gT22_im * b2_im; 00983 spinorFloat B2_im = 0; 00984 B2_im += gT20_re * b0_im; 00985 B2_im += gT20_im * b0_re; 00986 B2_im += gT21_re * b1_im; 00987 B2_im += gT21_im * b1_re; 00988 B2_im += gT22_re * b2_im; 00989 B2_im += gT22_im * b2_re; 00990 00991 o00_re += A0_re; 00992 o00_im += A0_im; 00993 o10_re += B0_re; 00994 o10_im += B0_im; 00995 o20_re += B0_re; 00996 o20_im += B0_im; 00997 o30_re -= A0_re; 00998 o30_im -= A0_im; 00999 01000 o01_re += A1_re; 01001 o01_im += A1_im; 01002 o11_re += B1_re; 01003 o11_im += B1_im; 01004 o21_re += B1_re; 01005 o21_im += B1_im; 01006 o31_re -= A1_re; 01007 o31_im -= A1_im; 01008 01009 o02_re += A2_re; 01010 o02_im += A2_im; 01011 o12_re += B2_re; 01012 o12_im += B2_im; 01013 o22_re += B2_re; 01014 o22_im += B2_im; 01015 o32_re -= A2_re; 01016 o32_im -= A2_im; 01017 01018 } 01019 01020 #ifdef MULTI_GPU 01021 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) || 01022 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) ) 01023 #endif 01024 { 01025 // Projector P2+ 01026 // 1 0 i 0 01027 // 0 1 0 -i 01028 // -i 0 1 0 01029 // 0 i 0 1 01030 01031 #ifdef MULTI_GPU 01032 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 : 01033 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01034 #else 01035 const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1; 01036 #endif 01037 01038 const int ga_idx = sid; 01039 01040 spinorFloat a0_re, a0_im; 01041 spinorFloat a1_re, a1_im; 01042 spinorFloat a2_re, a2_im; 01043 spinorFloat b0_re, b0_im; 01044 spinorFloat b1_re, b1_im; 01045 spinorFloat b2_re, b2_im; 01046 01047 #ifdef MULTI_GPU 01048 if (kernel_type == INTERIOR_KERNEL) { 01049 #endif 01050 01051 // read spinor from device memory 01052 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01053 01054 // project spinor into half spinors 01055 a0_re = +i00_re-i20_im; 01056 a0_im = +i00_im+i20_re; 01057 a1_re = +i01_re-i21_im; 01058 a1_im = +i01_im+i21_re; 01059 a2_re = +i02_re-i22_im; 01060 a2_im = +i02_im+i22_re; 01061 b0_re = +i10_re+i30_im; 01062 b0_im = +i10_im-i30_re; 01063 b1_re = +i11_re+i31_im; 01064 b1_im = +i11_im-i31_re; 01065 b2_re = +i12_re+i32_im; 01066 b2_im = +i12_im-i32_re; 01067 01068 #ifdef MULTI_GPU 01069 } else { 01070 01071 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01072 01073 // read half spinor from device memory 01074 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01075 01076 a0_re = i00_re; a0_im = i00_im; 01077 a1_re = i01_re; a1_im = i01_im; 01078 a2_re = i02_re; a2_im = i02_im; 01079 b0_re = i10_re; b0_im = i10_im; 01080 b1_re = i11_re; b1_im = i11_im; 01081 b2_re = i12_re; b2_im = i12_im; 01082 01083 } 01084 #endif // MULTI_GPU 01085 01086 // read gauge matrix from device memory 01087 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 01088 01089 // reconstruct gauge matrix 01090 RECONSTRUCT_GAUGE_MATRIX(4); 01091 01092 // multiply row 0 01093 spinorFloat A0_re = 0; 01094 A0_re += g00_re * a0_re; 01095 A0_re -= g00_im * a0_im; 01096 A0_re += g01_re * a1_re; 01097 A0_re -= g01_im * a1_im; 01098 A0_re += g02_re * a2_re; 01099 A0_re -= g02_im * a2_im; 01100 spinorFloat A0_im = 0; 01101 A0_im += g00_re * a0_im; 01102 A0_im += g00_im * a0_re; 01103 A0_im += g01_re * a1_im; 01104 A0_im += g01_im * a1_re; 01105 A0_im += g02_re * a2_im; 01106 A0_im += g02_im * a2_re; 01107 spinorFloat B0_re = 0; 01108 B0_re += g00_re * b0_re; 01109 B0_re -= g00_im * b0_im; 01110 B0_re += g01_re * b1_re; 01111 B0_re -= g01_im * b1_im; 01112 B0_re += g02_re * b2_re; 01113 B0_re -= g02_im * b2_im; 01114 spinorFloat B0_im = 0; 01115 B0_im += g00_re * b0_im; 01116 B0_im += g00_im * b0_re; 01117 B0_im += g01_re * b1_im; 01118 B0_im += g01_im * b1_re; 01119 B0_im += g02_re * b2_im; 01120 B0_im += g02_im * b2_re; 01121 01122 // multiply row 1 01123 spinorFloat A1_re = 0; 01124 A1_re += g10_re * a0_re; 01125 A1_re -= g10_im * a0_im; 01126 A1_re += g11_re * a1_re; 01127 A1_re -= g11_im * a1_im; 01128 A1_re += g12_re * a2_re; 01129 A1_re -= g12_im * a2_im; 01130 spinorFloat A1_im = 0; 01131 A1_im += g10_re * a0_im; 01132 A1_im += g10_im * a0_re; 01133 A1_im += g11_re * a1_im; 01134 A1_im += g11_im * a1_re; 01135 A1_im += g12_re * a2_im; 01136 A1_im += g12_im * a2_re; 01137 spinorFloat B1_re = 0; 01138 B1_re += g10_re * b0_re; 01139 B1_re -= g10_im * b0_im; 01140 B1_re += g11_re * b1_re; 01141 B1_re -= g11_im * b1_im; 01142 B1_re += g12_re * b2_re; 01143 B1_re -= g12_im * b2_im; 01144 spinorFloat B1_im = 0; 01145 B1_im += g10_re * b0_im; 01146 B1_im += g10_im * b0_re; 01147 B1_im += g11_re * b1_im; 01148 B1_im += g11_im * b1_re; 01149 B1_im += g12_re * b2_im; 01150 B1_im += g12_im * b2_re; 01151 01152 // multiply row 2 01153 spinorFloat A2_re = 0; 01154 A2_re += g20_re * a0_re; 01155 A2_re -= g20_im * a0_im; 01156 A2_re += g21_re * a1_re; 01157 A2_re -= g21_im * a1_im; 01158 A2_re += g22_re * a2_re; 01159 A2_re -= g22_im * a2_im; 01160 spinorFloat A2_im = 0; 01161 A2_im += g20_re * a0_im; 01162 A2_im += g20_im * a0_re; 01163 A2_im += g21_re * a1_im; 01164 A2_im += g21_im * a1_re; 01165 A2_im += g22_re * a2_im; 01166 A2_im += g22_im * a2_re; 01167 spinorFloat B2_re = 0; 01168 B2_re += g20_re * b0_re; 01169 B2_re -= g20_im * b0_im; 01170 B2_re += g21_re * b1_re; 01171 B2_re -= g21_im * b1_im; 01172 B2_re += g22_re * b2_re; 01173 B2_re -= g22_im * b2_im; 01174 spinorFloat B2_im = 0; 01175 B2_im += g20_re * b0_im; 01176 B2_im += g20_im * b0_re; 01177 B2_im += g21_re * b1_im; 01178 B2_im += g21_im * b1_re; 01179 B2_im += g22_re * b2_im; 01180 B2_im += g22_im * b2_re; 01181 01182 o00_re += A0_re; 01183 o00_im += A0_im; 01184 o10_re += B0_re; 01185 o10_im += B0_im; 01186 o20_re += A0_im; 01187 o20_im -= A0_re; 01188 o30_re -= B0_im; 01189 o30_im += B0_re; 01190 01191 o01_re += A1_re; 01192 o01_im += A1_im; 01193 o11_re += B1_re; 01194 o11_im += B1_im; 01195 o21_re += A1_im; 01196 o21_im -= A1_re; 01197 o31_re -= B1_im; 01198 o31_im += B1_re; 01199 01200 o02_re += A2_re; 01201 o02_im += A2_im; 01202 o12_re += B2_re; 01203 o12_im += B2_im; 01204 o22_re += A2_im; 01205 o22_im -= A2_re; 01206 o32_re -= B2_im; 01207 o32_im += B2_re; 01208 01209 } 01210 01211 #ifdef MULTI_GPU 01212 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01213 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01214 #endif 01215 { 01216 // Projector P2- 01217 // 1 0 -i 0 01218 // 0 1 0 i 01219 // i 0 1 0 01220 // 0 -i 0 1 01221 01222 #ifdef MULTI_GPU 01223 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01224 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01225 #else 01226 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01227 #endif 01228 01229 #ifdef MULTI_GPU 01230 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01231 #else 01232 const int ga_idx = sp_idx; 01233 #endif 01234 01235 spinorFloat a0_re, a0_im; 01236 spinorFloat a1_re, a1_im; 01237 spinorFloat a2_re, a2_im; 01238 spinorFloat b0_re, b0_im; 01239 spinorFloat b1_re, b1_im; 01240 spinorFloat b2_re, b2_im; 01241 01242 #ifdef MULTI_GPU 01243 if (kernel_type == INTERIOR_KERNEL) { 01244 #endif 01245 01246 // read spinor from device memory 01247 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01248 01249 // project spinor into half spinors 01250 a0_re = +i00_re+i20_im; 01251 a0_im = +i00_im-i20_re; 01252 a1_re = +i01_re+i21_im; 01253 a1_im = +i01_im-i21_re; 01254 a2_re = +i02_re+i22_im; 01255 a2_im = +i02_im-i22_re; 01256 b0_re = +i10_re-i30_im; 01257 b0_im = +i10_im+i30_re; 01258 b1_re = +i11_re-i31_im; 01259 b1_im = +i11_im+i31_re; 01260 b2_re = +i12_re-i32_im; 01261 b2_im = +i12_im+i32_re; 01262 01263 #ifdef MULTI_GPU 01264 } else { 01265 01266 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01267 01268 // read half spinor from device memory 01269 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01270 01271 a0_re = i00_re; a0_im = i00_im; 01272 a1_re = i01_re; a1_im = i01_im; 01273 a2_re = i02_re; a2_im = i02_im; 01274 b0_re = i10_re; b0_im = i10_im; 01275 b1_re = i11_re; b1_im = i11_im; 01276 b2_re = i12_re; b2_im = i12_im; 01277 01278 } 01279 #endif // MULTI_GPU 01280 01281 // read gauge matrix from device memory 01282 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01283 01284 // reconstruct gauge matrix 01285 RECONSTRUCT_GAUGE_MATRIX(5); 01286 01287 // multiply row 0 01288 spinorFloat A0_re = 0; 01289 A0_re += gT00_re * a0_re; 01290 A0_re -= gT00_im * a0_im; 01291 A0_re += gT01_re * a1_re; 01292 A0_re -= gT01_im * a1_im; 01293 A0_re += gT02_re * a2_re; 01294 A0_re -= gT02_im * a2_im; 01295 spinorFloat A0_im = 0; 01296 A0_im += gT00_re * a0_im; 01297 A0_im += gT00_im * a0_re; 01298 A0_im += gT01_re * a1_im; 01299 A0_im += gT01_im * a1_re; 01300 A0_im += gT02_re * a2_im; 01301 A0_im += gT02_im * a2_re; 01302 spinorFloat B0_re = 0; 01303 B0_re += gT00_re * b0_re; 01304 B0_re -= gT00_im * b0_im; 01305 B0_re += gT01_re * b1_re; 01306 B0_re -= gT01_im * b1_im; 01307 B0_re += gT02_re * b2_re; 01308 B0_re -= gT02_im * b2_im; 01309 spinorFloat B0_im = 0; 01310 B0_im += gT00_re * b0_im; 01311 B0_im += gT00_im * b0_re; 01312 B0_im += gT01_re * b1_im; 01313 B0_im += gT01_im * b1_re; 01314 B0_im += gT02_re * b2_im; 01315 B0_im += gT02_im * b2_re; 01316 01317 // multiply row 1 01318 spinorFloat A1_re = 0; 01319 A1_re += gT10_re * a0_re; 01320 A1_re -= gT10_im * a0_im; 01321 A1_re += gT11_re * a1_re; 01322 A1_re -= gT11_im * a1_im; 01323 A1_re += gT12_re * a2_re; 01324 A1_re -= gT12_im * a2_im; 01325 spinorFloat A1_im = 0; 01326 A1_im += gT10_re * a0_im; 01327 A1_im += gT10_im * a0_re; 01328 A1_im += gT11_re * a1_im; 01329 A1_im += gT11_im * a1_re; 01330 A1_im += gT12_re * a2_im; 01331 A1_im += gT12_im * a2_re; 01332 spinorFloat B1_re = 0; 01333 B1_re += gT10_re * b0_re; 01334 B1_re -= gT10_im * b0_im; 01335 B1_re += gT11_re * b1_re; 01336 B1_re -= gT11_im * b1_im; 01337 B1_re += gT12_re * b2_re; 01338 B1_re -= gT12_im * b2_im; 01339 spinorFloat B1_im = 0; 01340 B1_im += gT10_re * b0_im; 01341 B1_im += gT10_im * b0_re; 01342 B1_im += gT11_re * b1_im; 01343 B1_im += gT11_im * b1_re; 01344 B1_im += gT12_re * b2_im; 01345 B1_im += gT12_im * b2_re; 01346 01347 // multiply row 2 01348 spinorFloat A2_re = 0; 01349 A2_re += gT20_re * a0_re; 01350 A2_re -= gT20_im * a0_im; 01351 A2_re += gT21_re * a1_re; 01352 A2_re -= gT21_im * a1_im; 01353 A2_re += gT22_re * a2_re; 01354 A2_re -= gT22_im * a2_im; 01355 spinorFloat A2_im = 0; 01356 A2_im += gT20_re * a0_im; 01357 A2_im += gT20_im * a0_re; 01358 A2_im += gT21_re * a1_im; 01359 A2_im += gT21_im * a1_re; 01360 A2_im += gT22_re * a2_im; 01361 A2_im += gT22_im * a2_re; 01362 spinorFloat B2_re = 0; 01363 B2_re += gT20_re * b0_re; 01364 B2_re -= gT20_im * b0_im; 01365 B2_re += gT21_re * b1_re; 01366 B2_re -= gT21_im * b1_im; 01367 B2_re += gT22_re * b2_re; 01368 B2_re -= gT22_im * b2_im; 01369 spinorFloat B2_im = 0; 01370 B2_im += gT20_re * b0_im; 01371 B2_im += gT20_im * b0_re; 01372 B2_im += gT21_re * b1_im; 01373 B2_im += gT21_im * b1_re; 01374 B2_im += gT22_re * b2_im; 01375 B2_im += gT22_im * b2_re; 01376 01377 o00_re += A0_re; 01378 o00_im += A0_im; 01379 o10_re += B0_re; 01380 o10_im += B0_im; 01381 o20_re -= A0_im; 01382 o20_im += A0_re; 01383 o30_re += B0_im; 01384 o30_im -= B0_re; 01385 01386 o01_re += A1_re; 01387 o01_im += A1_im; 01388 o11_re += B1_re; 01389 o11_im += B1_im; 01390 o21_re -= A1_im; 01391 o21_im += A1_re; 01392 o31_re += B1_im; 01393 o31_im -= B1_re; 01394 01395 o02_re += A2_re; 01396 o02_im += A2_im; 01397 o12_re += B2_re; 01398 o12_im += B2_im; 01399 o22_re -= A2_im; 01400 o22_im += A2_re; 01401 o32_re += B2_im; 01402 o32_im -= B2_re; 01403 01404 } 01405 01406 #ifdef MULTI_GPU 01407 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01408 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01409 #endif 01410 { 01411 // Projector P3+ 01412 // 2 0 0 0 01413 // 0 2 0 0 01414 // 0 0 0 0 01415 // 0 0 0 0 01416 01417 #ifdef MULTI_GPU 01418 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01419 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01420 #else 01421 const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01422 #endif 01423 01424 const int ga_idx = sid; 01425 01426 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01427 { 01428 spinorFloat a0_re, a0_im; 01429 spinorFloat a1_re, a1_im; 01430 spinorFloat a2_re, a2_im; 01431 spinorFloat b0_re, b0_im; 01432 spinorFloat b1_re, b1_im; 01433 spinorFloat b2_re, b2_im; 01434 01435 #ifdef MULTI_GPU 01436 if (kernel_type == INTERIOR_KERNEL) { 01437 #endif 01438 01439 // read spinor from device memory 01440 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01441 01442 // project spinor into half spinors 01443 a0_re = +2*i00_re; 01444 a0_im = +2*i00_im; 01445 a1_re = +2*i01_re; 01446 a1_im = +2*i01_im; 01447 a2_re = +2*i02_re; 01448 a2_im = +2*i02_im; 01449 b0_re = +2*i10_re; 01450 b0_im = +2*i10_im; 01451 b1_re = +2*i11_re; 01452 b1_im = +2*i11_im; 01453 b2_re = +2*i12_re; 01454 b2_im = +2*i12_im; 01455 01456 #ifdef MULTI_GPU 01457 } else { 01458 01459 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01460 const int t_proj_scale = TPROJSCALE; 01461 01462 // read half spinor from device memory 01463 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01464 01465 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01466 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01467 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01468 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01469 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01470 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01471 01472 } 01473 #endif // MULTI_GPU 01474 01475 // identity gauge matrix 01476 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01477 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01478 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01479 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01480 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01481 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01482 01483 o00_re += A0_re; 01484 o00_im += A0_im; 01485 o10_re += B0_re; 01486 o10_im += B0_im; 01487 01488 o01_re += A1_re; 01489 o01_im += A1_im; 01490 o11_re += B1_re; 01491 o11_im += B1_im; 01492 01493 o02_re += A2_re; 01494 o02_im += A2_im; 01495 o12_re += B2_re; 01496 o12_im += B2_im; 01497 01498 } else { 01499 spinorFloat a0_re, a0_im; 01500 spinorFloat a1_re, a1_im; 01501 spinorFloat a2_re, a2_im; 01502 spinorFloat b0_re, b0_im; 01503 spinorFloat b1_re, b1_im; 01504 spinorFloat b2_re, b2_im; 01505 01506 #ifdef MULTI_GPU 01507 if (kernel_type == INTERIOR_KERNEL) { 01508 #endif 01509 01510 // read spinor from device memory 01511 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01512 01513 // project spinor into half spinors 01514 a0_re = +2*i00_re; 01515 a0_im = +2*i00_im; 01516 a1_re = +2*i01_re; 01517 a1_im = +2*i01_im; 01518 a2_re = +2*i02_re; 01519 a2_im = +2*i02_im; 01520 b0_re = +2*i10_re; 01521 b0_im = +2*i10_im; 01522 b1_re = +2*i11_re; 01523 b1_im = +2*i11_im; 01524 b2_re = +2*i12_re; 01525 b2_im = +2*i12_im; 01526 01527 #ifdef MULTI_GPU 01528 } else { 01529 01530 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01531 const int t_proj_scale = TPROJSCALE; 01532 01533 // read half spinor from device memory 01534 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01535 01536 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01537 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01538 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01539 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01540 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01541 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01542 01543 } 01544 #endif // MULTI_GPU 01545 01546 // read gauge matrix from device memory 01547 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01548 01549 // reconstruct gauge matrix 01550 RECONSTRUCT_GAUGE_MATRIX(6); 01551 01552 // multiply row 0 01553 spinorFloat A0_re = 0; 01554 A0_re += g00_re * a0_re; 01555 A0_re -= g00_im * a0_im; 01556 A0_re += g01_re * a1_re; 01557 A0_re -= g01_im * a1_im; 01558 A0_re += g02_re * a2_re; 01559 A0_re -= g02_im * a2_im; 01560 spinorFloat A0_im = 0; 01561 A0_im += g00_re * a0_im; 01562 A0_im += g00_im * a0_re; 01563 A0_im += g01_re * a1_im; 01564 A0_im += g01_im * a1_re; 01565 A0_im += g02_re * a2_im; 01566 A0_im += g02_im * a2_re; 01567 spinorFloat B0_re = 0; 01568 B0_re += g00_re * b0_re; 01569 B0_re -= g00_im * b0_im; 01570 B0_re += g01_re * b1_re; 01571 B0_re -= g01_im * b1_im; 01572 B0_re += g02_re * b2_re; 01573 B0_re -= g02_im * b2_im; 01574 spinorFloat B0_im = 0; 01575 B0_im += g00_re * b0_im; 01576 B0_im += g00_im * b0_re; 01577 B0_im += g01_re * b1_im; 01578 B0_im += g01_im * b1_re; 01579 B0_im += g02_re * b2_im; 01580 B0_im += g02_im * b2_re; 01581 01582 // multiply row 1 01583 spinorFloat A1_re = 0; 01584 A1_re += g10_re * a0_re; 01585 A1_re -= g10_im * a0_im; 01586 A1_re += g11_re * a1_re; 01587 A1_re -= g11_im * a1_im; 01588 A1_re += g12_re * a2_re; 01589 A1_re -= g12_im * a2_im; 01590 spinorFloat A1_im = 0; 01591 A1_im += g10_re * a0_im; 01592 A1_im += g10_im * a0_re; 01593 A1_im += g11_re * a1_im; 01594 A1_im += g11_im * a1_re; 01595 A1_im += g12_re * a2_im; 01596 A1_im += g12_im * a2_re; 01597 spinorFloat B1_re = 0; 01598 B1_re += g10_re * b0_re; 01599 B1_re -= g10_im * b0_im; 01600 B1_re += g11_re * b1_re; 01601 B1_re -= g11_im * b1_im; 01602 B1_re += g12_re * b2_re; 01603 B1_re -= g12_im * b2_im; 01604 spinorFloat B1_im = 0; 01605 B1_im += g10_re * b0_im; 01606 B1_im += g10_im * b0_re; 01607 B1_im += g11_re * b1_im; 01608 B1_im += g11_im * b1_re; 01609 B1_im += g12_re * b2_im; 01610 B1_im += g12_im * b2_re; 01611 01612 // multiply row 2 01613 spinorFloat A2_re = 0; 01614 A2_re += g20_re * a0_re; 01615 A2_re -= g20_im * a0_im; 01616 A2_re += g21_re * a1_re; 01617 A2_re -= g21_im * a1_im; 01618 A2_re += g22_re * a2_re; 01619 A2_re -= g22_im * a2_im; 01620 spinorFloat A2_im = 0; 01621 A2_im += g20_re * a0_im; 01622 A2_im += g20_im * a0_re; 01623 A2_im += g21_re * a1_im; 01624 A2_im += g21_im * a1_re; 01625 A2_im += g22_re * a2_im; 01626 A2_im += g22_im * a2_re; 01627 spinorFloat B2_re = 0; 01628 B2_re += g20_re * b0_re; 01629 B2_re -= g20_im * b0_im; 01630 B2_re += g21_re * b1_re; 01631 B2_re -= g21_im * b1_im; 01632 B2_re += g22_re * b2_re; 01633 B2_re -= g22_im * b2_im; 01634 spinorFloat B2_im = 0; 01635 B2_im += g20_re * b0_im; 01636 B2_im += g20_im * b0_re; 01637 B2_im += g21_re * b1_im; 01638 B2_im += g21_im * b1_re; 01639 B2_im += g22_re * b2_im; 01640 B2_im += g22_im * b2_re; 01641 01642 o00_re += A0_re; 01643 o00_im += A0_im; 01644 o10_re += B0_re; 01645 o10_im += B0_im; 01646 01647 o01_re += A1_re; 01648 o01_im += A1_im; 01649 o11_re += B1_re; 01650 o11_im += B1_im; 01651 01652 o02_re += A2_re; 01653 o02_im += A2_im; 01654 o12_re += B2_re; 01655 o12_im += B2_im; 01656 01657 } 01658 } 01659 01660 #ifdef MULTI_GPU 01661 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) || 01662 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01663 #endif 01664 { 01665 // Projector P3- 01666 // 0 0 0 0 01667 // 0 0 0 0 01668 // 0 0 2 0 01669 // 0 0 0 2 01670 01671 #ifdef MULTI_GPU 01672 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01673 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01674 #else 01675 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01676 #endif 01677 01678 #ifdef MULTI_GPU 01679 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01680 #else 01681 const int ga_idx = sp_idx; 01682 #endif 01683 01684 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01685 { 01686 spinorFloat a0_re, a0_im; 01687 spinorFloat a1_re, a1_im; 01688 spinorFloat a2_re, a2_im; 01689 spinorFloat b0_re, b0_im; 01690 spinorFloat b1_re, b1_im; 01691 spinorFloat b2_re, b2_im; 01692 01693 #ifdef MULTI_GPU 01694 if (kernel_type == INTERIOR_KERNEL) { 01695 #endif 01696 01697 // read spinor from device memory 01698 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01699 01700 // project spinor into half spinors 01701 a0_re = +2*i20_re; 01702 a0_im = +2*i20_im; 01703 a1_re = +2*i21_re; 01704 a1_im = +2*i21_im; 01705 a2_re = +2*i22_re; 01706 a2_im = +2*i22_im; 01707 b0_re = +2*i30_re; 01708 b0_im = +2*i30_im; 01709 b1_re = +2*i31_re; 01710 b1_im = +2*i31_im; 01711 b2_re = +2*i32_re; 01712 b2_im = +2*i32_im; 01713 01714 #ifdef MULTI_GPU 01715 } else { 01716 01717 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01718 const int t_proj_scale = TPROJSCALE; 01719 01720 // read half spinor from device memory 01721 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01722 01723 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01724 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01725 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01726 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01727 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01728 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01729 01730 } 01731 #endif // MULTI_GPU 01732 01733 // identity gauge matrix 01734 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01735 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01736 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01737 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01738 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01739 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01740 01741 o20_re += A0_re; 01742 o20_im += A0_im; 01743 o30_re += B0_re; 01744 o30_im += B0_im; 01745 01746 o21_re += A1_re; 01747 o21_im += A1_im; 01748 o31_re += B1_re; 01749 o31_im += B1_im; 01750 01751 o22_re += A2_re; 01752 o22_im += A2_im; 01753 o32_re += B2_re; 01754 o32_im += B2_im; 01755 01756 } else { 01757 spinorFloat a0_re, a0_im; 01758 spinorFloat a1_re, a1_im; 01759 spinorFloat a2_re, a2_im; 01760 spinorFloat b0_re, b0_im; 01761 spinorFloat b1_re, b1_im; 01762 spinorFloat b2_re, b2_im; 01763 01764 #ifdef MULTI_GPU 01765 if (kernel_type == INTERIOR_KERNEL) { 01766 #endif 01767 01768 // read spinor from device memory 01769 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01770 01771 // project spinor into half spinors 01772 a0_re = +2*i20_re; 01773 a0_im = +2*i20_im; 01774 a1_re = +2*i21_re; 01775 a1_im = +2*i21_im; 01776 a2_re = +2*i22_re; 01777 a2_im = +2*i22_im; 01778 b0_re = +2*i30_re; 01779 b0_im = +2*i30_im; 01780 b1_re = +2*i31_re; 01781 b1_im = +2*i31_im; 01782 b2_re = +2*i32_re; 01783 b2_im = +2*i32_im; 01784 01785 #ifdef MULTI_GPU 01786 } else { 01787 01788 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01789 const int t_proj_scale = TPROJSCALE; 01790 01791 // read half spinor from device memory 01792 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01793 01794 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01795 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01796 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01797 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01798 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01799 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01800 01801 } 01802 #endif // MULTI_GPU 01803 01804 // read gauge matrix from device memory 01805 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01806 01807 // reconstruct gauge matrix 01808 RECONSTRUCT_GAUGE_MATRIX(7); 01809 01810 // multiply row 0 01811 spinorFloat A0_re = 0; 01812 A0_re += gT00_re * a0_re; 01813 A0_re -= gT00_im * a0_im; 01814 A0_re += gT01_re * a1_re; 01815 A0_re -= gT01_im * a1_im; 01816 A0_re += gT02_re * a2_re; 01817 A0_re -= gT02_im * a2_im; 01818 spinorFloat A0_im = 0; 01819 A0_im += gT00_re * a0_im; 01820 A0_im += gT00_im * a0_re; 01821 A0_im += gT01_re * a1_im; 01822 A0_im += gT01_im * a1_re; 01823 A0_im += gT02_re * a2_im; 01824 A0_im += gT02_im * a2_re; 01825 spinorFloat B0_re = 0; 01826 B0_re += gT00_re * b0_re; 01827 B0_re -= gT00_im * b0_im; 01828 B0_re += gT01_re * b1_re; 01829 B0_re -= gT01_im * b1_im; 01830 B0_re += gT02_re * b2_re; 01831 B0_re -= gT02_im * b2_im; 01832 spinorFloat B0_im = 0; 01833 B0_im += gT00_re * b0_im; 01834 B0_im += gT00_im * b0_re; 01835 B0_im += gT01_re * b1_im; 01836 B0_im += gT01_im * b1_re; 01837 B0_im += gT02_re * b2_im; 01838 B0_im += gT02_im * b2_re; 01839 01840 // multiply row 1 01841 spinorFloat A1_re = 0; 01842 A1_re += gT10_re * a0_re; 01843 A1_re -= gT10_im * a0_im; 01844 A1_re += gT11_re * a1_re; 01845 A1_re -= gT11_im * a1_im; 01846 A1_re += gT12_re * a2_re; 01847 A1_re -= gT12_im * a2_im; 01848 spinorFloat A1_im = 0; 01849 A1_im += gT10_re * a0_im; 01850 A1_im += gT10_im * a0_re; 01851 A1_im += gT11_re * a1_im; 01852 A1_im += gT11_im * a1_re; 01853 A1_im += gT12_re * a2_im; 01854 A1_im += gT12_im * a2_re; 01855 spinorFloat B1_re = 0; 01856 B1_re += gT10_re * b0_re; 01857 B1_re -= gT10_im * b0_im; 01858 B1_re += gT11_re * b1_re; 01859 B1_re -= gT11_im * b1_im; 01860 B1_re += gT12_re * b2_re; 01861 B1_re -= gT12_im * b2_im; 01862 spinorFloat B1_im = 0; 01863 B1_im += gT10_re * b0_im; 01864 B1_im += gT10_im * b0_re; 01865 B1_im += gT11_re * b1_im; 01866 B1_im += gT11_im * b1_re; 01867 B1_im += gT12_re * b2_im; 01868 B1_im += gT12_im * b2_re; 01869 01870 // multiply row 2 01871 spinorFloat A2_re = 0; 01872 A2_re += gT20_re * a0_re; 01873 A2_re -= gT20_im * a0_im; 01874 A2_re += gT21_re * a1_re; 01875 A2_re -= gT21_im * a1_im; 01876 A2_re += gT22_re * a2_re; 01877 A2_re -= gT22_im * a2_im; 01878 spinorFloat A2_im = 0; 01879 A2_im += gT20_re * a0_im; 01880 A2_im += gT20_im * a0_re; 01881 A2_im += gT21_re * a1_im; 01882 A2_im += gT21_im * a1_re; 01883 A2_im += gT22_re * a2_im; 01884 A2_im += gT22_im * a2_re; 01885 spinorFloat B2_re = 0; 01886 B2_re += gT20_re * b0_re; 01887 B2_re -= gT20_im * b0_im; 01888 B2_re += gT21_re * b1_re; 01889 B2_re -= gT21_im * b1_im; 01890 B2_re += gT22_re * b2_re; 01891 B2_re -= gT22_im * b2_im; 01892 spinorFloat B2_im = 0; 01893 B2_im += gT20_re * b0_im; 01894 B2_im += gT20_im * b0_re; 01895 B2_im += gT21_re * b1_im; 01896 B2_im += gT21_im * b1_re; 01897 B2_im += gT22_re * b2_im; 01898 B2_im += gT22_im * b2_re; 01899 01900 o20_re += A0_re; 01901 o20_im += A0_im; 01902 o30_re += B0_re; 01903 o30_im += B0_im; 01904 01905 o21_re += A1_re; 01906 o21_im += A1_im; 01907 o31_re += B1_re; 01908 o31_im += B1_im; 01909 01910 o22_re += A2_re; 01911 o22_im += A2_im; 01912 o32_re += B2_re; 01913 o32_im += B2_im; 01914 01915 } 01916 } 01917 01918 #ifdef MULTI_GPU 01919 01920 int incomplete = 0; // Have all 8 contributions been computed for this site? 01921 01922 switch(kernel_type) { // intentional fall-through 01923 case INTERIOR_KERNEL: 01924 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1)); 01925 case EXTERIOR_KERNEL_T: 01926 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1)); 01927 case EXTERIOR_KERNEL_Z: 01928 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1)); 01929 case EXTERIOR_KERNEL_Y: 01930 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1)); 01931 } 01932 01933 if (!incomplete) 01934 #endif // MULTI_GPU 01935 { 01936 01937 { 01938 // apply twisted mass rotation 01939 VOLATILE spinorFloat tmp00_re = +o00_re-o20_im*a; 01940 VOLATILE spinorFloat tmp00_im = +o00_im+o20_re*a; 01941 VOLATILE spinorFloat tmp01_re = +o01_re-o21_im*a; 01942 VOLATILE spinorFloat tmp01_im = +o01_im+o21_re*a; 01943 VOLATILE spinorFloat tmp02_re = +o02_re-o22_im*a; 01944 VOLATILE spinorFloat tmp02_im = +o02_im+o22_re*a; 01945 01946 VOLATILE spinorFloat tmp10_re = +o10_re-o30_im*a; 01947 VOLATILE spinorFloat tmp10_im = +o10_im+o30_re*a; 01948 VOLATILE spinorFloat tmp11_re = +o11_re-o31_im*a; 01949 VOLATILE spinorFloat tmp11_im = +o11_im+o31_re*a; 01950 VOLATILE spinorFloat tmp12_re = +o12_re-o32_im*a; 01951 VOLATILE spinorFloat tmp12_im = +o12_im+o32_re*a; 01952 01953 VOLATILE spinorFloat tmp20_re = -o00_im*a+o20_re; 01954 VOLATILE spinorFloat tmp20_im = +o00_re*a+o20_im; 01955 VOLATILE spinorFloat tmp21_re = -o01_im*a+o21_re; 01956 VOLATILE spinorFloat tmp21_im = +o01_re*a+o21_im; 01957 VOLATILE spinorFloat tmp22_re = -o02_im*a+o22_re; 01958 VOLATILE spinorFloat tmp22_im = +o02_re*a+o22_im; 01959 01960 VOLATILE spinorFloat tmp30_re = -o10_im*a+o30_re; 01961 VOLATILE spinorFloat tmp30_im = +o10_re*a+o30_im; 01962 VOLATILE spinorFloat tmp31_re = -o11_im*a+o31_re; 01963 VOLATILE spinorFloat tmp31_im = +o11_re*a+o31_im; 01964 VOLATILE spinorFloat tmp32_re = -o12_im*a+o32_re; 01965 VOLATILE spinorFloat tmp32_im = +o12_re*a+o32_im; 01966 01967 01968 #ifndef DSLASH_XPAY 01969 //scale by b = 1/(1 + a*a) 01970 o00_re = b*tmp00_re; 01971 o00_im = b*tmp00_im; 01972 o01_re = b*tmp01_re; 01973 o01_im = b*tmp01_im; 01974 o02_re = b*tmp02_re; 01975 o02_im = b*tmp02_im; 01976 o10_re = b*tmp10_re; 01977 o10_im = b*tmp10_im; 01978 o11_re = b*tmp11_re; 01979 o11_im = b*tmp11_im; 01980 o12_re = b*tmp12_re; 01981 o12_im = b*tmp12_im; 01982 o20_re = b*tmp20_re; 01983 o20_im = b*tmp20_im; 01984 o21_re = b*tmp21_re; 01985 o21_im = b*tmp21_im; 01986 o22_re = b*tmp22_re; 01987 o22_im = b*tmp22_im; 01988 o30_re = b*tmp30_re; 01989 o30_im = b*tmp30_im; 01990 o31_re = b*tmp31_re; 01991 o31_im = b*tmp31_im; 01992 o32_re = b*tmp32_re; 01993 o32_im = b*tmp32_im; 01994 #else 01995 o00_re = tmp00_re; 01996 o00_im = tmp00_im; 01997 o01_re = tmp01_re; 01998 o01_im = tmp01_im; 01999 o02_re = tmp02_re; 02000 o02_im = tmp02_im; 02001 o10_re = tmp10_re; 02002 o10_im = tmp10_im; 02003 o11_re = tmp11_re; 02004 o11_im = tmp11_im; 02005 o12_re = tmp12_re; 02006 o12_im = tmp12_im; 02007 o20_re = tmp20_re; 02008 o20_im = tmp20_im; 02009 o21_re = tmp21_re; 02010 o21_im = tmp21_im; 02011 o22_re = tmp22_re; 02012 o22_im = tmp22_im; 02013 o30_re = tmp30_re; 02014 o30_im = tmp30_im; 02015 o31_re = tmp31_re; 02016 o31_im = tmp31_im; 02017 o32_re = tmp32_re; 02018 o32_im = tmp32_im; 02019 #endif // DSLASH_XPAY 02020 02021 } 02022 #ifdef DSLASH_XPAY 02023 02024 READ_ACCUM(ACCUMTEX, sp_stride) 02025 02026 #ifdef SPINOR_DOUBLE 02027 o00_re = b*o00_re + accum0.x; 02028 o00_im = b*o00_im + accum0.y; 02029 o01_re = b*o01_re + accum1.x; 02030 o01_im = b*o01_im + accum1.y; 02031 o02_re = b*o02_re + accum2.x; 02032 o02_im = b*o02_im + accum2.y; 02033 o10_re = b*o10_re + accum3.x; 02034 o10_im = b*o10_im + accum3.y; 02035 o11_re = b*o11_re + accum4.x; 02036 o11_im = b*o11_im + accum4.y; 02037 o12_re = b*o12_re + accum5.x; 02038 o12_im = b*o12_im + accum5.y; 02039 o20_re = b*o20_re + accum6.x; 02040 o20_im = b*o20_im + accum6.y; 02041 o21_re = b*o21_re + accum7.x; 02042 o21_im = b*o21_im + accum7.y; 02043 o22_re = b*o22_re + accum8.x; 02044 o22_im = b*o22_im + accum8.y; 02045 o30_re = b*o30_re + accum9.x; 02046 o30_im = b*o30_im + accum9.y; 02047 o31_re = b*o31_re + accum10.x; 02048 o31_im = b*o31_im + accum10.y; 02049 o32_re = b*o32_re + accum11.x; 02050 o32_im = b*o32_im + accum11.y; 02051 #else 02052 o00_re = b*o00_re + accum0.x; 02053 o00_im = b*o00_im + accum0.y; 02054 o01_re = b*o01_re + accum0.z; 02055 o01_im = b*o01_im + accum0.w; 02056 o02_re = b*o02_re + accum1.x; 02057 o02_im = b*o02_im + accum1.y; 02058 o10_re = b*o10_re + accum1.z; 02059 o10_im = b*o10_im + accum1.w; 02060 o11_re = b*o11_re + accum2.x; 02061 o11_im = b*o11_im + accum2.y; 02062 o12_re = b*o12_re + accum2.z; 02063 o12_im = b*o12_im + accum2.w; 02064 o20_re = b*o20_re + accum3.x; 02065 o20_im = b*o20_im + accum3.y; 02066 o21_re = b*o21_re + accum3.z; 02067 o21_im = b*o21_im + accum3.w; 02068 o22_re = b*o22_re + accum4.x; 02069 o22_im = b*o22_im + accum4.y; 02070 o30_re = b*o30_re + accum4.z; 02071 o30_im = b*o30_im + accum4.w; 02072 o31_re = b*o31_re + accum5.x; 02073 o31_im = b*o31_im + accum5.y; 02074 o32_re = b*o32_re + accum5.z; 02075 o32_im = b*o32_im + accum5.w; 02076 #endif // SPINOR_DOUBLE 02077 02078 #endif // DSLASH_XPAY 02079 } 02080 02081 // write spinor field back to device memory 02082 WRITE_SPINOR(sp_stride); 02083 02084 // undefine to prevent warning when precision is changed 02085 #undef spinorFloat 02086 #undef A_re 02087 #undef A_im 02088 02089 #undef g00_re 02090 #undef g00_im 02091 #undef g01_re 02092 #undef g01_im 02093 #undef g02_re 02094 #undef g02_im 02095 #undef g10_re 02096 #undef g10_im 02097 #undef g11_re 02098 #undef g11_im 02099 #undef g12_re 02100 #undef g12_im 02101 #undef g20_re 02102 #undef g20_im 02103 #undef g21_re 02104 #undef g21_im 02105 #undef g22_re 02106 #undef g22_im 02107 02108 #undef i00_re 02109 #undef i00_im 02110 #undef i01_re 02111 #undef i01_im 02112 #undef i02_re 02113 #undef i02_im 02114 #undef i10_re 02115 #undef i10_im 02116 #undef i11_re 02117 #undef i11_im 02118 #undef i12_re 02119 #undef i12_im 02120 #undef i20_re 02121 #undef i20_im 02122 #undef i21_re 02123 #undef i21_im 02124 #undef i22_re 02125 #undef i22_im 02126 #undef i30_re 02127 #undef i30_im 02128 #undef i31_re 02129 #undef i31_im 02130 #undef i32_re 02131 #undef i32_im 02132 02133 02134 02135 #undef VOLATILE