QUDA v0.4.0 — A library for QCD on GPUs
00001 // *** CUDA DSLASH DAGGER *** 00002 00003 #define DSLASH_SHARED_FLOATS_PER_THREAD 19 00004 00005 00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 00007 #define VOLATILE 00008 #else // Open64 compiler 00009 #define VOLATILE volatile 00010 #endif 00011 // input spinor 00012 #ifdef SPINOR_DOUBLE 00013 #define spinorFloat double 00014 #define i00_re I0.x 00015 #define i00_im I0.y 00016 #define i01_re I1.x 00017 #define i01_im I1.y 00018 #define i02_re I2.x 00019 #define i02_im I2.y 00020 #define i10_re I3.x 00021 #define i10_im I3.y 00022 #define i11_re I4.x 00023 #define i11_im I4.y 00024 #define i12_re I5.x 00025 #define i12_im I5.y 00026 #define i20_re I6.x 00027 #define i20_im I6.y 00028 #define i21_re I7.x 00029 #define i21_im I7.y 00030 #define i22_re I8.x 00031 #define i22_im I8.y 00032 #define i30_re I9.x 00033 #define i30_im I9.y 00034 #define i31_re I10.x 00035 #define i31_im I10.y 00036 #define i32_re I11.x 00037 #define i32_im I11.y 00038 #else 00039 #define spinorFloat float 00040 #define i00_re I0.x 00041 #define i00_im I0.y 00042 #define i01_re I0.z 00043 #define i01_im I0.w 00044 #define i02_re I1.x 00045 #define i02_im I1.y 00046 #define i10_re I1.z 00047 #define i10_im I1.w 00048 #define i11_re I2.x 00049 #define i11_im I2.y 00050 #define i12_re I2.z 00051 #define i12_im I2.w 00052 #define i20_re I3.x 00053 #define i20_im I3.y 00054 #define i21_re I3.z 00055 #define i21_im I3.w 00056 #define i22_re I4.x 00057 #define i22_im I4.y 00058 #define i30_re I4.z 00059 #define i30_im I4.w 00060 #define i31_re I5.x 00061 #define i31_im I5.y 00062 #define i32_re I5.z 00063 #define i32_im I5.w 00064 #endif // SPINOR_DOUBLE 00065 00066 // gauge link 00067 #ifdef GAUGE_FLOAT2 00068 #define g00_re G0.x 00069 #define g00_im G0.y 00070 #define g01_re G1.x 00071 #define g01_im G1.y 00072 #define g02_re G2.x 00073 #define g02_im G2.y 00074 #define g10_re G3.x 00075 #define g10_im G3.y 00076 #define g11_re G4.x 00077 #define 
g11_im G4.y 00078 #define g12_re G5.x 00079 #define g12_im G5.y 00080 #define g20_re G6.x 00081 #define g20_im G6.y 00082 #define g21_re G7.x 00083 #define g21_im G7.y 00084 #define g22_re G8.x 00085 #define g22_im G8.y 00086 // temporaries 00087 #define A_re G9.x 00088 #define A_im G9.y 00089 00090 #else 00091 #define g00_re G0.x 00092 #define g00_im G0.y 00093 #define g01_re G0.z 00094 #define g01_im G0.w 00095 #define g02_re G1.x 00096 #define g02_im G1.y 00097 #define g10_re G1.z 00098 #define g10_im G1.w 00099 #define g11_re G2.x 00100 #define g11_im G2.y 00101 #define g12_re G2.z 00102 #define g12_im G2.w 00103 #define g20_re G3.x 00104 #define g20_im G3.y 00105 #define g21_re G3.z 00106 #define g21_im G3.w 00107 #define g22_re G4.x 00108 #define g22_im G4.y 00109 // temporaries 00110 #define A_re G4.z 00111 #define A_im G4.w 00112 00113 #endif // GAUGE_DOUBLE 00114 00115 // conjugated gauge link 00116 #define gT00_re (+g00_re) 00117 #define gT00_im (-g00_im) 00118 #define gT01_re (+g10_re) 00119 #define gT01_im (-g10_im) 00120 #define gT02_re (+g20_re) 00121 #define gT02_im (-g20_im) 00122 #define gT10_re (+g01_re) 00123 #define gT10_im (-g01_im) 00124 #define gT11_re (+g11_re) 00125 #define gT11_im (-g11_im) 00126 #define gT12_re (+g21_re) 00127 #define gT12_im (-g21_im) 00128 #define gT20_re (+g02_re) 00129 #define gT20_im (-g02_im) 00130 #define gT21_re (+g12_re) 00131 #define gT21_im (-g12_im) 00132 #define gT22_re (+g22_re) 00133 #define gT22_im (-g22_im) 00134 00135 // output spinor 00136 #define o00_re s[0*SHARED_STRIDE] 00137 #define o00_im s[1*SHARED_STRIDE] 00138 #define o01_re s[2*SHARED_STRIDE] 00139 #define o01_im s[3*SHARED_STRIDE] 00140 #define o02_re s[4*SHARED_STRIDE] 00141 #define o02_im s[5*SHARED_STRIDE] 00142 #define o10_re s[6*SHARED_STRIDE] 00143 #define o10_im s[7*SHARED_STRIDE] 00144 #define o11_re s[8*SHARED_STRIDE] 00145 #define o11_im s[9*SHARED_STRIDE] 00146 #define o12_re s[10*SHARED_STRIDE] 00147 #define o12_im 
s[11*SHARED_STRIDE] 00148 #define o20_re s[12*SHARED_STRIDE] 00149 #define o20_im s[13*SHARED_STRIDE] 00150 #define o21_re s[14*SHARED_STRIDE] 00151 #define o21_im s[15*SHARED_STRIDE] 00152 #define o22_re s[16*SHARED_STRIDE] 00153 #define o22_im s[17*SHARED_STRIDE] 00154 #define o30_re s[18*SHARED_STRIDE] 00155 VOLATILE spinorFloat o30_im; 00156 VOLATILE spinorFloat o31_re; 00157 VOLATILE spinorFloat o31_im; 00158 VOLATILE spinorFloat o32_re; 00159 VOLATILE spinorFloat o32_im; 00160 00161 #ifdef SPINOR_DOUBLE 00162 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 00163 #else 00164 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 00165 #endif 00166 00167 extern __shared__ char s_data[]; 00168 00169 VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 00170 + (threadIdx.x % SHARED_STRIDE); 00171 00172 #include "read_gauge.h" 00173 #include "read_clover.h" 00174 #include "io_spinor.h" 00175 00176 int x1, x2, x3, x4; 00177 int X; 00178 00179 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision 00180 int sp_norm_idx; 00181 #endif // MULTI_GPU half precision 00182 00183 int sid; 00184 00185 #ifdef MULTI_GPU 00186 int face_idx; 00187 if (kernel_type == INTERIOR_KERNEL) { 00188 #endif 00189 00190 sid = blockIdx.x*blockDim.x + threadIdx.x; 00191 if (sid >= param.threads) return; 00192 00193 // Inline by hand for the moment and assume even dimensions 00194 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity); 00195 00196 X = 2*sid; 00197 int aux1 = X / X1; 00198 x1 = X - aux1 * X1; 00199 int aux2 = aux1 / X2; 00200 x2 = aux1 - aux2 * X2; 00201 x4 = aux2 / X3; 00202 x3 = aux2 - x4 * X3; 00203 aux1 = (param.parity + x4 + x3 + x2) & 1; 00204 x1 += aux1; 00205 X += aux1; 00206 00207 o00_re = 0; o00_im = 0; 00208 o01_re = 0; o01_im = 0; 00209 o02_re = 0; o02_im = 0; 00210 o10_re = 0; o10_im = 0; 00211 o11_re = 0; o11_im = 0; 00212 o12_re = 0; o12_im = 0; 00213 
o20_re = 0; o20_im = 0; 00214 o21_re = 0; o21_im = 0; 00215 o22_re = 0; o22_im = 0; 00216 o30_re = 0; o30_im = 0; 00217 o31_re = 0; o31_im = 0; 00218 o32_re = 0; o32_im = 0; 00219 00220 #ifdef MULTI_GPU 00221 } else { // exterior kernel 00222 00223 sid = blockIdx.x*blockDim.x + threadIdx.x; 00224 if (sid >= param.threads) return; 00225 00226 const int dim = static_cast<int>(kernel_type); 00227 const int face_volume = (param.threads >> 1); // volume of one face 00228 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 00229 face_idx = sid - face_num*face_volume; // index into the respective face 00230 00231 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 00232 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 00233 //sp_idx = face_idx + param.ghostOffset[dim]; 00234 00235 #if (DD_PREC==2) // half precision 00236 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)]; 00237 #endif 00238 00239 coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity); 00240 00241 READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid); 00242 00243 o00_re = i00_re; o00_im = i00_im; 00244 o01_re = i01_re; o01_im = i01_im; 00245 o02_re = i02_re; o02_im = i02_im; 00246 o10_re = i10_re; o10_im = i10_im; 00247 o11_re = i11_re; o11_im = i11_im; 00248 o12_re = i12_re; o12_im = i12_im; 00249 o20_re = i20_re; o20_im = i20_im; 00250 o21_re = i21_re; o21_im = i21_im; 00251 o22_re = i22_re; o22_im = i22_im; 00252 o30_re = i30_re; o30_im = i30_im; 00253 o31_re = i31_re; o31_im = i31_im; 00254 o32_re = i32_re; o32_im = i32_im; 00255 } 00256 #endif // MULTI_GPU 00257 00258 00259 #ifdef MULTI_GPU 00260 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) || 00261 (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) ) 00262 #endif 00263 { 00264 // Projector P0+ 00265 // 1 0 0 i 00266 // 0 1 i 
0 00267 // 0 -i 1 0 00268 // -i 0 0 1 00269 00270 #ifdef MULTI_GPU 00271 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 : 00272 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00273 #else 00274 const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1; 00275 #endif 00276 00277 const int ga_idx = sid; 00278 00279 spinorFloat a0_re, a0_im; 00280 spinorFloat a1_re, a1_im; 00281 spinorFloat a2_re, a2_im; 00282 spinorFloat b0_re, b0_im; 00283 spinorFloat b1_re, b1_im; 00284 spinorFloat b2_re, b2_im; 00285 00286 #ifdef MULTI_GPU 00287 if (kernel_type == INTERIOR_KERNEL) { 00288 #endif 00289 00290 // read spinor from device memory 00291 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00292 00293 // project spinor into half spinors 00294 a0_re = +i00_re-i30_im; 00295 a0_im = +i00_im+i30_re; 00296 a1_re = +i01_re-i31_im; 00297 a1_im = +i01_im+i31_re; 00298 a2_re = +i02_re-i32_im; 00299 a2_im = +i02_im+i32_re; 00300 b0_re = +i10_re-i20_im; 00301 b0_im = +i10_im+i20_re; 00302 b1_re = +i11_re-i21_im; 00303 b1_im = +i11_im+i21_re; 00304 b2_re = +i12_re-i22_im; 00305 b2_im = +i12_im+i22_re; 00306 00307 #ifdef MULTI_GPU 00308 } else { 00309 00310 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00311 00312 // read half spinor from device memory 00313 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00314 00315 a0_re = i00_re; a0_im = i00_im; 00316 a1_re = i01_re; a1_im = i01_im; 00317 a2_re = i02_re; a2_im = i02_im; 00318 b0_re = i10_re; b0_im = i10_im; 00319 b1_re = i11_re; b1_im = i11_im; 00320 b2_re = i12_re; b2_im = i12_im; 00321 00322 } 00323 #endif // MULTI_GPU 00324 00325 // read gauge matrix from device memory 00326 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00327 00328 // reconstruct gauge matrix 00329 RECONSTRUCT_GAUGE_MATRIX(0); 00330 00331 // multiply row 0 00332 spinorFloat A0_re = 0; 00333 A0_re += g00_re * a0_re; 00334 A0_re -= g00_im * a0_im; 
00335 A0_re += g01_re * a1_re; 00336 A0_re -= g01_im * a1_im; 00337 A0_re += g02_re * a2_re; 00338 A0_re -= g02_im * a2_im; 00339 spinorFloat A0_im = 0; 00340 A0_im += g00_re * a0_im; 00341 A0_im += g00_im * a0_re; 00342 A0_im += g01_re * a1_im; 00343 A0_im += g01_im * a1_re; 00344 A0_im += g02_re * a2_im; 00345 A0_im += g02_im * a2_re; 00346 spinorFloat B0_re = 0; 00347 B0_re += g00_re * b0_re; 00348 B0_re -= g00_im * b0_im; 00349 B0_re += g01_re * b1_re; 00350 B0_re -= g01_im * b1_im; 00351 B0_re += g02_re * b2_re; 00352 B0_re -= g02_im * b2_im; 00353 spinorFloat B0_im = 0; 00354 B0_im += g00_re * b0_im; 00355 B0_im += g00_im * b0_re; 00356 B0_im += g01_re * b1_im; 00357 B0_im += g01_im * b1_re; 00358 B0_im += g02_re * b2_im; 00359 B0_im += g02_im * b2_re; 00360 00361 // multiply row 1 00362 spinorFloat A1_re = 0; 00363 A1_re += g10_re * a0_re; 00364 A1_re -= g10_im * a0_im; 00365 A1_re += g11_re * a1_re; 00366 A1_re -= g11_im * a1_im; 00367 A1_re += g12_re * a2_re; 00368 A1_re -= g12_im * a2_im; 00369 spinorFloat A1_im = 0; 00370 A1_im += g10_re * a0_im; 00371 A1_im += g10_im * a0_re; 00372 A1_im += g11_re * a1_im; 00373 A1_im += g11_im * a1_re; 00374 A1_im += g12_re * a2_im; 00375 A1_im += g12_im * a2_re; 00376 spinorFloat B1_re = 0; 00377 B1_re += g10_re * b0_re; 00378 B1_re -= g10_im * b0_im; 00379 B1_re += g11_re * b1_re; 00380 B1_re -= g11_im * b1_im; 00381 B1_re += g12_re * b2_re; 00382 B1_re -= g12_im * b2_im; 00383 spinorFloat B1_im = 0; 00384 B1_im += g10_re * b0_im; 00385 B1_im += g10_im * b0_re; 00386 B1_im += g11_re * b1_im; 00387 B1_im += g11_im * b1_re; 00388 B1_im += g12_re * b2_im; 00389 B1_im += g12_im * b2_re; 00390 00391 // multiply row 2 00392 spinorFloat A2_re = 0; 00393 A2_re += g20_re * a0_re; 00394 A2_re -= g20_im * a0_im; 00395 A2_re += g21_re * a1_re; 00396 A2_re -= g21_im * a1_im; 00397 A2_re += g22_re * a2_re; 00398 A2_re -= g22_im * a2_im; 00399 spinorFloat A2_im = 0; 00400 A2_im += g20_re * a0_im; 00401 A2_im += g20_im * a0_re; 
00402 A2_im += g21_re * a1_im; 00403 A2_im += g21_im * a1_re; 00404 A2_im += g22_re * a2_im; 00405 A2_im += g22_im * a2_re; 00406 spinorFloat B2_re = 0; 00407 B2_re += g20_re * b0_re; 00408 B2_re -= g20_im * b0_im; 00409 B2_re += g21_re * b1_re; 00410 B2_re -= g21_im * b1_im; 00411 B2_re += g22_re * b2_re; 00412 B2_re -= g22_im * b2_im; 00413 spinorFloat B2_im = 0; 00414 B2_im += g20_re * b0_im; 00415 B2_im += g20_im * b0_re; 00416 B2_im += g21_re * b1_im; 00417 B2_im += g21_im * b1_re; 00418 B2_im += g22_re * b2_im; 00419 B2_im += g22_im * b2_re; 00420 00421 o00_re += A0_re; 00422 o00_im += A0_im; 00423 o10_re += B0_re; 00424 o10_im += B0_im; 00425 o20_re += B0_im; 00426 o20_im -= B0_re; 00427 o30_re += A0_im; 00428 o30_im -= A0_re; 00429 00430 o01_re += A1_re; 00431 o01_im += A1_im; 00432 o11_re += B1_re; 00433 o11_im += B1_im; 00434 o21_re += B1_im; 00435 o21_im -= B1_re; 00436 o31_re += A1_im; 00437 o31_im -= A1_re; 00438 00439 o02_re += A2_re; 00440 o02_im += A2_im; 00441 o12_re += B2_re; 00442 o12_im += B2_im; 00443 o22_re += B2_im; 00444 o22_im -= B2_re; 00445 o32_re += A2_im; 00446 o32_im -= A2_re; 00447 00448 } 00449 00450 #ifdef MULTI_GPU 00451 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) || 00452 (kernel_type == EXTERIOR_KERNEL_X && x1==0) ) 00453 #endif 00454 { 00455 // Projector P0- 00456 // 1 0 0 -i 00457 // 0 1 -i 0 00458 // 0 i 1 0 00459 // i 0 0 1 00460 00461 #ifdef MULTI_GPU 00462 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 : 00463 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00464 #else 00465 const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1; 00466 #endif 00467 00468 #ifdef MULTI_GPU 00469 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 00470 #else 00471 const int ga_idx = sp_idx; 00472 #endif 00473 00474 spinorFloat a0_re, a0_im; 00475 spinorFloat a1_re, a1_im; 00476 spinorFloat a2_re, a2_im; 00477 spinorFloat b0_re, b0_im; 00478 spinorFloat b1_re, b1_im; 00479 spinorFloat b2_re, b2_im; 00480 00481 #ifdef MULTI_GPU 00482 if (kernel_type == INTERIOR_KERNEL) { 00483 #endif 00484 00485 // read spinor from device memory 00486 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00487 00488 // project spinor into half spinors 00489 a0_re = +i00_re+i30_im; 00490 a0_im = +i00_im-i30_re; 00491 a1_re = +i01_re+i31_im; 00492 a1_im = +i01_im-i31_re; 00493 a2_re = +i02_re+i32_im; 00494 a2_im = +i02_im-i32_re; 00495 b0_re = +i10_re+i20_im; 00496 b0_im = +i10_im-i20_re; 00497 b1_re = +i11_re+i21_im; 00498 b1_im = +i11_im-i21_re; 00499 b2_re = +i12_re+i22_im; 00500 b2_im = +i12_im-i22_re; 00501 00502 #ifdef MULTI_GPU 00503 } else { 00504 00505 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00506 00507 // read half spinor from device memory 00508 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00509 00510 a0_re = i00_re; a0_im = i00_im; 00511 a1_re = i01_re; a1_im = i01_im; 00512 a2_re = i02_re; a2_im = i02_im; 00513 b0_re = i10_re; b0_im = i10_im; 00514 b1_re = i11_re; b1_im = i11_im; 00515 b2_re = i12_re; b2_im = i12_im; 00516 00517 } 00518 #endif // MULTI_GPU 00519 00520 // read gauge matrix from device memory 00521 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00522 00523 // reconstruct gauge matrix 00524 RECONSTRUCT_GAUGE_MATRIX(1); 00525 00526 // multiply row 0 00527 spinorFloat A0_re = 0; 00528 A0_re += gT00_re * a0_re; 00529 A0_re -= gT00_im * a0_im; 00530 A0_re += gT01_re * a1_re; 00531 A0_re -= gT01_im * a1_im; 00532 A0_re += gT02_re * a2_re; 00533 A0_re -= gT02_im * a2_im; 00534 spinorFloat A0_im = 0; 00535 A0_im += gT00_re * a0_im; 00536 A0_im += gT00_im * a0_re; 00537 A0_im += gT01_re * a1_im; 00538 A0_im += gT01_im * a1_re; 00539 
A0_im += gT02_re * a2_im; 00540 A0_im += gT02_im * a2_re; 00541 spinorFloat B0_re = 0; 00542 B0_re += gT00_re * b0_re; 00543 B0_re -= gT00_im * b0_im; 00544 B0_re += gT01_re * b1_re; 00545 B0_re -= gT01_im * b1_im; 00546 B0_re += gT02_re * b2_re; 00547 B0_re -= gT02_im * b2_im; 00548 spinorFloat B0_im = 0; 00549 B0_im += gT00_re * b0_im; 00550 B0_im += gT00_im * b0_re; 00551 B0_im += gT01_re * b1_im; 00552 B0_im += gT01_im * b1_re; 00553 B0_im += gT02_re * b2_im; 00554 B0_im += gT02_im * b2_re; 00555 00556 // multiply row 1 00557 spinorFloat A1_re = 0; 00558 A1_re += gT10_re * a0_re; 00559 A1_re -= gT10_im * a0_im; 00560 A1_re += gT11_re * a1_re; 00561 A1_re -= gT11_im * a1_im; 00562 A1_re += gT12_re * a2_re; 00563 A1_re -= gT12_im * a2_im; 00564 spinorFloat A1_im = 0; 00565 A1_im += gT10_re * a0_im; 00566 A1_im += gT10_im * a0_re; 00567 A1_im += gT11_re * a1_im; 00568 A1_im += gT11_im * a1_re; 00569 A1_im += gT12_re * a2_im; 00570 A1_im += gT12_im * a2_re; 00571 spinorFloat B1_re = 0; 00572 B1_re += gT10_re * b0_re; 00573 B1_re -= gT10_im * b0_im; 00574 B1_re += gT11_re * b1_re; 00575 B1_re -= gT11_im * b1_im; 00576 B1_re += gT12_re * b2_re; 00577 B1_re -= gT12_im * b2_im; 00578 spinorFloat B1_im = 0; 00579 B1_im += gT10_re * b0_im; 00580 B1_im += gT10_im * b0_re; 00581 B1_im += gT11_re * b1_im; 00582 B1_im += gT11_im * b1_re; 00583 B1_im += gT12_re * b2_im; 00584 B1_im += gT12_im * b2_re; 00585 00586 // multiply row 2 00587 spinorFloat A2_re = 0; 00588 A2_re += gT20_re * a0_re; 00589 A2_re -= gT20_im * a0_im; 00590 A2_re += gT21_re * a1_re; 00591 A2_re -= gT21_im * a1_im; 00592 A2_re += gT22_re * a2_re; 00593 A2_re -= gT22_im * a2_im; 00594 spinorFloat A2_im = 0; 00595 A2_im += gT20_re * a0_im; 00596 A2_im += gT20_im * a0_re; 00597 A2_im += gT21_re * a1_im; 00598 A2_im += gT21_im * a1_re; 00599 A2_im += gT22_re * a2_im; 00600 A2_im += gT22_im * a2_re; 00601 spinorFloat B2_re = 0; 00602 B2_re += gT20_re * b0_re; 00603 B2_re -= gT20_im * b0_im; 00604 B2_re += 
gT21_re * b1_re; 00605 B2_re -= gT21_im * b1_im; 00606 B2_re += gT22_re * b2_re; 00607 B2_re -= gT22_im * b2_im; 00608 spinorFloat B2_im = 0; 00609 B2_im += gT20_re * b0_im; 00610 B2_im += gT20_im * b0_re; 00611 B2_im += gT21_re * b1_im; 00612 B2_im += gT21_im * b1_re; 00613 B2_im += gT22_re * b2_im; 00614 B2_im += gT22_im * b2_re; 00615 00616 o00_re += A0_re; 00617 o00_im += A0_im; 00618 o10_re += B0_re; 00619 o10_im += B0_im; 00620 o20_re -= B0_im; 00621 o20_im += B0_re; 00622 o30_re -= A0_im; 00623 o30_im += A0_re; 00624 00625 o01_re += A1_re; 00626 o01_im += A1_im; 00627 o11_re += B1_re; 00628 o11_im += B1_im; 00629 o21_re -= B1_im; 00630 o21_im += B1_re; 00631 o31_re -= A1_im; 00632 o31_im += A1_re; 00633 00634 o02_re += A2_re; 00635 o02_im += A2_im; 00636 o12_re += B2_re; 00637 o12_im += B2_im; 00638 o22_re -= B2_im; 00639 o22_im += B2_re; 00640 o32_re -= A2_im; 00641 o32_im += A2_re; 00642 00643 } 00644 00645 #ifdef MULTI_GPU 00646 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) || 00647 (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) ) 00648 #endif 00649 { 00650 // Projector P1+ 00651 // 1 0 0 1 00652 // 0 1 -1 0 00653 // 0 -1 1 0 00654 // 1 0 0 1 00655 00656 #ifdef MULTI_GPU 00657 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 : 00658 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00659 #else 00660 const int sp_idx = (x2==X2m1 ? 
X-X2X1mX1 : X+X1) >> 1; 00661 #endif 00662 00663 const int ga_idx = sid; 00664 00665 spinorFloat a0_re, a0_im; 00666 spinorFloat a1_re, a1_im; 00667 spinorFloat a2_re, a2_im; 00668 spinorFloat b0_re, b0_im; 00669 spinorFloat b1_re, b1_im; 00670 spinorFloat b2_re, b2_im; 00671 00672 #ifdef MULTI_GPU 00673 if (kernel_type == INTERIOR_KERNEL) { 00674 #endif 00675 00676 // read spinor from device memory 00677 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00678 00679 // project spinor into half spinors 00680 a0_re = +i00_re+i30_re; 00681 a0_im = +i00_im+i30_im; 00682 a1_re = +i01_re+i31_re; 00683 a1_im = +i01_im+i31_im; 00684 a2_re = +i02_re+i32_re; 00685 a2_im = +i02_im+i32_im; 00686 b0_re = +i10_re-i20_re; 00687 b0_im = +i10_im-i20_im; 00688 b1_re = +i11_re-i21_re; 00689 b1_im = +i11_im-i21_im; 00690 b2_re = +i12_re-i22_re; 00691 b2_im = +i12_im-i22_im; 00692 00693 #ifdef MULTI_GPU 00694 } else { 00695 00696 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00697 00698 // read half spinor from device memory 00699 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00700 00701 a0_re = i00_re; a0_im = i00_im; 00702 a1_re = i01_re; a1_im = i01_im; 00703 a2_re = i02_re; a2_im = i02_im; 00704 b0_re = i10_re; b0_im = i10_im; 00705 b1_re = i11_re; b1_im = i11_im; 00706 b2_re = i12_re; b2_im = i12_im; 00707 00708 } 00709 #endif // MULTI_GPU 00710 00711 // read gauge matrix from device memory 00712 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00713 00714 // reconstruct gauge matrix 00715 RECONSTRUCT_GAUGE_MATRIX(2); 00716 00717 // multiply row 0 00718 spinorFloat A0_re = 0; 00719 A0_re += g00_re * a0_re; 00720 A0_re -= g00_im * a0_im; 00721 A0_re += g01_re * a1_re; 00722 A0_re -= g01_im * a1_im; 00723 A0_re += g02_re * a2_re; 00724 A0_re -= g02_im * a2_im; 00725 spinorFloat A0_im = 0; 00726 A0_im += g00_re * a0_im; 00727 A0_im += g00_im * a0_re; 00728 A0_im += g01_re * a1_im; 00729 A0_im += g01_im * 
a1_re; 00730 A0_im += g02_re * a2_im; 00731 A0_im += g02_im * a2_re; 00732 spinorFloat B0_re = 0; 00733 B0_re += g00_re * b0_re; 00734 B0_re -= g00_im * b0_im; 00735 B0_re += g01_re * b1_re; 00736 B0_re -= g01_im * b1_im; 00737 B0_re += g02_re * b2_re; 00738 B0_re -= g02_im * b2_im; 00739 spinorFloat B0_im = 0; 00740 B0_im += g00_re * b0_im; 00741 B0_im += g00_im * b0_re; 00742 B0_im += g01_re * b1_im; 00743 B0_im += g01_im * b1_re; 00744 B0_im += g02_re * b2_im; 00745 B0_im += g02_im * b2_re; 00746 00747 // multiply row 1 00748 spinorFloat A1_re = 0; 00749 A1_re += g10_re * a0_re; 00750 A1_re -= g10_im * a0_im; 00751 A1_re += g11_re * a1_re; 00752 A1_re -= g11_im * a1_im; 00753 A1_re += g12_re * a2_re; 00754 A1_re -= g12_im * a2_im; 00755 spinorFloat A1_im = 0; 00756 A1_im += g10_re * a0_im; 00757 A1_im += g10_im * a0_re; 00758 A1_im += g11_re * a1_im; 00759 A1_im += g11_im * a1_re; 00760 A1_im += g12_re * a2_im; 00761 A1_im += g12_im * a2_re; 00762 spinorFloat B1_re = 0; 00763 B1_re += g10_re * b0_re; 00764 B1_re -= g10_im * b0_im; 00765 B1_re += g11_re * b1_re; 00766 B1_re -= g11_im * b1_im; 00767 B1_re += g12_re * b2_re; 00768 B1_re -= g12_im * b2_im; 00769 spinorFloat B1_im = 0; 00770 B1_im += g10_re * b0_im; 00771 B1_im += g10_im * b0_re; 00772 B1_im += g11_re * b1_im; 00773 B1_im += g11_im * b1_re; 00774 B1_im += g12_re * b2_im; 00775 B1_im += g12_im * b2_re; 00776 00777 // multiply row 2 00778 spinorFloat A2_re = 0; 00779 A2_re += g20_re * a0_re; 00780 A2_re -= g20_im * a0_im; 00781 A2_re += g21_re * a1_re; 00782 A2_re -= g21_im * a1_im; 00783 A2_re += g22_re * a2_re; 00784 A2_re -= g22_im * a2_im; 00785 spinorFloat A2_im = 0; 00786 A2_im += g20_re * a0_im; 00787 A2_im += g20_im * a0_re; 00788 A2_im += g21_re * a1_im; 00789 A2_im += g21_im * a1_re; 00790 A2_im += g22_re * a2_im; 00791 A2_im += g22_im * a2_re; 00792 spinorFloat B2_re = 0; 00793 B2_re += g20_re * b0_re; 00794 B2_re -= g20_im * b0_im; 00795 B2_re += g21_re * b1_re; 00796 B2_re -= g21_im * 
b1_im; 00797 B2_re += g22_re * b2_re; 00798 B2_re -= g22_im * b2_im; 00799 spinorFloat B2_im = 0; 00800 B2_im += g20_re * b0_im; 00801 B2_im += g20_im * b0_re; 00802 B2_im += g21_re * b1_im; 00803 B2_im += g21_im * b1_re; 00804 B2_im += g22_re * b2_im; 00805 B2_im += g22_im * b2_re; 00806 00807 o00_re += A0_re; 00808 o00_im += A0_im; 00809 o10_re += B0_re; 00810 o10_im += B0_im; 00811 o20_re -= B0_re; 00812 o20_im -= B0_im; 00813 o30_re += A0_re; 00814 o30_im += A0_im; 00815 00816 o01_re += A1_re; 00817 o01_im += A1_im; 00818 o11_re += B1_re; 00819 o11_im += B1_im; 00820 o21_re -= B1_re; 00821 o21_im -= B1_im; 00822 o31_re += A1_re; 00823 o31_im += A1_im; 00824 00825 o02_re += A2_re; 00826 o02_im += A2_im; 00827 o12_re += B2_re; 00828 o12_im += B2_im; 00829 o22_re -= B2_re; 00830 o22_im -= B2_im; 00831 o32_re += A2_re; 00832 o32_im += A2_im; 00833 00834 } 00835 00836 #ifdef MULTI_GPU 00837 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) || 00838 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) ) 00839 #endif 00840 { 00841 // Projector P1- 00842 // 1 0 0 -1 00843 // 0 1 1 0 00844 // 0 1 1 0 00845 // -1 0 0 1 00846 00847 #ifdef MULTI_GPU 00848 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 : 00849 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00850 #else 00851 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1; 00852 #endif 00853 00854 #ifdef MULTI_GPU 00855 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 00856 #else 00857 const int ga_idx = sp_idx; 00858 #endif 00859 00860 spinorFloat a0_re, a0_im; 00861 spinorFloat a1_re, a1_im; 00862 spinorFloat a2_re, a2_im; 00863 spinorFloat b0_re, b0_im; 00864 spinorFloat b1_re, b1_im; 00865 spinorFloat b2_re, b2_im; 00866 00867 #ifdef MULTI_GPU 00868 if (kernel_type == INTERIOR_KERNEL) { 00869 #endif 00870 00871 // read spinor from device memory 00872 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00873 00874 // project spinor into half spinors 00875 a0_re = +i00_re-i30_re; 00876 a0_im = +i00_im-i30_im; 00877 a1_re = +i01_re-i31_re; 00878 a1_im = +i01_im-i31_im; 00879 a2_re = +i02_re-i32_re; 00880 a2_im = +i02_im-i32_im; 00881 b0_re = +i10_re+i20_re; 00882 b0_im = +i10_im+i20_im; 00883 b1_re = +i11_re+i21_re; 00884 b1_im = +i11_im+i21_im; 00885 b2_re = +i12_re+i22_re; 00886 b2_im = +i12_im+i22_im; 00887 00888 #ifdef MULTI_GPU 00889 } else { 00890 00891 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00892 00893 // read half spinor from device memory 00894 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00895 00896 a0_re = i00_re; a0_im = i00_im; 00897 a1_re = i01_re; a1_im = i01_im; 00898 a2_re = i02_re; a2_im = i02_im; 00899 b0_re = i10_re; b0_im = i10_im; 00900 b1_re = i11_re; b1_im = i11_im; 00901 b2_re = i12_re; b2_im = i12_im; 00902 00903 } 00904 #endif // MULTI_GPU 00905 00906 // read gauge matrix from device memory 00907 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 00908 00909 // reconstruct gauge matrix 00910 RECONSTRUCT_GAUGE_MATRIX(3); 00911 00912 // multiply row 0 00913 spinorFloat A0_re = 0; 00914 A0_re += gT00_re * a0_re; 00915 A0_re -= gT00_im * a0_im; 00916 A0_re += gT01_re * a1_re; 00917 A0_re -= gT01_im * a1_im; 00918 A0_re += gT02_re * a2_re; 00919 A0_re -= gT02_im * a2_im; 00920 spinorFloat A0_im = 0; 00921 A0_im += gT00_re * a0_im; 00922 A0_im += gT00_im * a0_re; 00923 A0_im += gT01_re * a1_im; 00924 A0_im += gT01_im * a1_re; 00925 
A0_im += gT02_re * a2_im; 00926 A0_im += gT02_im * a2_re; 00927 spinorFloat B0_re = 0; 00928 B0_re += gT00_re * b0_re; 00929 B0_re -= gT00_im * b0_im; 00930 B0_re += gT01_re * b1_re; 00931 B0_re -= gT01_im * b1_im; 00932 B0_re += gT02_re * b2_re; 00933 B0_re -= gT02_im * b2_im; 00934 spinorFloat B0_im = 0; 00935 B0_im += gT00_re * b0_im; 00936 B0_im += gT00_im * b0_re; 00937 B0_im += gT01_re * b1_im; 00938 B0_im += gT01_im * b1_re; 00939 B0_im += gT02_re * b2_im; 00940 B0_im += gT02_im * b2_re; 00941 00942 // multiply row 1 00943 spinorFloat A1_re = 0; 00944 A1_re += gT10_re * a0_re; 00945 A1_re -= gT10_im * a0_im; 00946 A1_re += gT11_re * a1_re; 00947 A1_re -= gT11_im * a1_im; 00948 A1_re += gT12_re * a2_re; 00949 A1_re -= gT12_im * a2_im; 00950 spinorFloat A1_im = 0; 00951 A1_im += gT10_re * a0_im; 00952 A1_im += gT10_im * a0_re; 00953 A1_im += gT11_re * a1_im; 00954 A1_im += gT11_im * a1_re; 00955 A1_im += gT12_re * a2_im; 00956 A1_im += gT12_im * a2_re; 00957 spinorFloat B1_re = 0; 00958 B1_re += gT10_re * b0_re; 00959 B1_re -= gT10_im * b0_im; 00960 B1_re += gT11_re * b1_re; 00961 B1_re -= gT11_im * b1_im; 00962 B1_re += gT12_re * b2_re; 00963 B1_re -= gT12_im * b2_im; 00964 spinorFloat B1_im = 0; 00965 B1_im += gT10_re * b0_im; 00966 B1_im += gT10_im * b0_re; 00967 B1_im += gT11_re * b1_im; 00968 B1_im += gT11_im * b1_re; 00969 B1_im += gT12_re * b2_im; 00970 B1_im += gT12_im * b2_re; 00971 00972 // multiply row 2 00973 spinorFloat A2_re = 0; 00974 A2_re += gT20_re * a0_re; 00975 A2_re -= gT20_im * a0_im; 00976 A2_re += gT21_re * a1_re; 00977 A2_re -= gT21_im * a1_im; 00978 A2_re += gT22_re * a2_re; 00979 A2_re -= gT22_im * a2_im; 00980 spinorFloat A2_im = 0; 00981 A2_im += gT20_re * a0_im; 00982 A2_im += gT20_im * a0_re; 00983 A2_im += gT21_re * a1_im; 00984 A2_im += gT21_im * a1_re; 00985 A2_im += gT22_re * a2_im; 00986 A2_im += gT22_im * a2_re; 00987 spinorFloat B2_re = 0; 00988 B2_re += gT20_re * b0_re; 00989 B2_re -= gT20_im * b0_im; 00990 B2_re += 
gT21_re * b1_re; 00991 B2_re -= gT21_im * b1_im; 00992 B2_re += gT22_re * b2_re; 00993 B2_re -= gT22_im * b2_im; 00994 spinorFloat B2_im = 0; 00995 B2_im += gT20_re * b0_im; 00996 B2_im += gT20_im * b0_re; 00997 B2_im += gT21_re * b1_im; 00998 B2_im += gT21_im * b1_re; 00999 B2_im += gT22_re * b2_im; 01000 B2_im += gT22_im * b2_re; 01001 01002 o00_re += A0_re; 01003 o00_im += A0_im; 01004 o10_re += B0_re; 01005 o10_im += B0_im; 01006 o20_re += B0_re; 01007 o20_im += B0_im; 01008 o30_re -= A0_re; 01009 o30_im -= A0_im; 01010 01011 o01_re += A1_re; 01012 o01_im += A1_im; 01013 o11_re += B1_re; 01014 o11_im += B1_im; 01015 o21_re += B1_re; 01016 o21_im += B1_im; 01017 o31_re -= A1_re; 01018 o31_im -= A1_im; 01019 01020 o02_re += A2_re; 01021 o02_im += A2_im; 01022 o12_re += B2_re; 01023 o12_im += B2_im; 01024 o22_re += B2_re; 01025 o22_im += B2_im; 01026 o32_re -= A2_re; 01027 o32_im -= A2_im; 01028 01029 } 01030 01031 #ifdef MULTI_GPU 01032 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) || 01033 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) ) 01034 #endif 01035 { 01036 // Projector P2+ 01037 // 1 0 i 0 01038 // 0 1 0 -i 01039 // -i 0 1 0 01040 // 0 i 0 1 01041 01042 #ifdef MULTI_GPU 01043 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 : 01044 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01045 #else 01046 const int sp_idx = (x3==X3m1 ? 
X-X3X2X1mX2X1 : X+X2X1) >> 1; 01047 #endif 01048 01049 const int ga_idx = sid; 01050 01051 spinorFloat a0_re, a0_im; 01052 spinorFloat a1_re, a1_im; 01053 spinorFloat a2_re, a2_im; 01054 spinorFloat b0_re, b0_im; 01055 spinorFloat b1_re, b1_im; 01056 spinorFloat b2_re, b2_im; 01057 01058 #ifdef MULTI_GPU 01059 if (kernel_type == INTERIOR_KERNEL) { 01060 #endif 01061 01062 // read spinor from device memory 01063 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01064 01065 // project spinor into half spinors 01066 a0_re = +i00_re-i20_im; 01067 a0_im = +i00_im+i20_re; 01068 a1_re = +i01_re-i21_im; 01069 a1_im = +i01_im+i21_re; 01070 a2_re = +i02_re-i22_im; 01071 a2_im = +i02_im+i22_re; 01072 b0_re = +i10_re+i30_im; 01073 b0_im = +i10_im-i30_re; 01074 b1_re = +i11_re+i31_im; 01075 b1_im = +i11_im-i31_re; 01076 b2_re = +i12_re+i32_im; 01077 b2_im = +i12_im-i32_re; 01078 01079 #ifdef MULTI_GPU 01080 } else { 01081 01082 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01083 01084 // read half spinor from device memory 01085 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01086 01087 a0_re = i00_re; a0_im = i00_im; 01088 a1_re = i01_re; a1_im = i01_im; 01089 a2_re = i02_re; a2_im = i02_im; 01090 b0_re = i10_re; b0_im = i10_im; 01091 b1_re = i11_re; b1_im = i11_im; 01092 b2_re = i12_re; b2_im = i12_im; 01093 01094 } 01095 #endif // MULTI_GPU 01096 01097 // read gauge matrix from device memory 01098 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 01099 01100 // reconstruct gauge matrix 01101 RECONSTRUCT_GAUGE_MATRIX(4); 01102 01103 // multiply row 0 01104 spinorFloat A0_re = 0; 01105 A0_re += g00_re * a0_re; 01106 A0_re -= g00_im * a0_im; 01107 A0_re += g01_re * a1_re; 01108 A0_re -= g01_im * a1_im; 01109 A0_re += g02_re * a2_re; 01110 A0_re -= g02_im * a2_im; 01111 spinorFloat A0_im = 0; 01112 A0_im += g00_re * a0_im; 01113 A0_im += g00_im * a0_re; 01114 A0_im += g01_re * a1_im; 01115 A0_im += 
g01_im * a1_re; 01116 A0_im += g02_re * a2_im; 01117 A0_im += g02_im * a2_re; 01118 spinorFloat B0_re = 0; 01119 B0_re += g00_re * b0_re; 01120 B0_re -= g00_im * b0_im; 01121 B0_re += g01_re * b1_re; 01122 B0_re -= g01_im * b1_im; 01123 B0_re += g02_re * b2_re; 01124 B0_re -= g02_im * b2_im; 01125 spinorFloat B0_im = 0; 01126 B0_im += g00_re * b0_im; 01127 B0_im += g00_im * b0_re; 01128 B0_im += g01_re * b1_im; 01129 B0_im += g01_im * b1_re; 01130 B0_im += g02_re * b2_im; 01131 B0_im += g02_im * b2_re; 01132 01133 // multiply row 1 01134 spinorFloat A1_re = 0; 01135 A1_re += g10_re * a0_re; 01136 A1_re -= g10_im * a0_im; 01137 A1_re += g11_re * a1_re; 01138 A1_re -= g11_im * a1_im; 01139 A1_re += g12_re * a2_re; 01140 A1_re -= g12_im * a2_im; 01141 spinorFloat A1_im = 0; 01142 A1_im += g10_re * a0_im; 01143 A1_im += g10_im * a0_re; 01144 A1_im += g11_re * a1_im; 01145 A1_im += g11_im * a1_re; 01146 A1_im += g12_re * a2_im; 01147 A1_im += g12_im * a2_re; 01148 spinorFloat B1_re = 0; 01149 B1_re += g10_re * b0_re; 01150 B1_re -= g10_im * b0_im; 01151 B1_re += g11_re * b1_re; 01152 B1_re -= g11_im * b1_im; 01153 B1_re += g12_re * b2_re; 01154 B1_re -= g12_im * b2_im; 01155 spinorFloat B1_im = 0; 01156 B1_im += g10_re * b0_im; 01157 B1_im += g10_im * b0_re; 01158 B1_im += g11_re * b1_im; 01159 B1_im += g11_im * b1_re; 01160 B1_im += g12_re * b2_im; 01161 B1_im += g12_im * b2_re; 01162 01163 // multiply row 2 01164 spinorFloat A2_re = 0; 01165 A2_re += g20_re * a0_re; 01166 A2_re -= g20_im * a0_im; 01167 A2_re += g21_re * a1_re; 01168 A2_re -= g21_im * a1_im; 01169 A2_re += g22_re * a2_re; 01170 A2_re -= g22_im * a2_im; 01171 spinorFloat A2_im = 0; 01172 A2_im += g20_re * a0_im; 01173 A2_im += g20_im * a0_re; 01174 A2_im += g21_re * a1_im; 01175 A2_im += g21_im * a1_re; 01176 A2_im += g22_re * a2_im; 01177 A2_im += g22_im * a2_re; 01178 spinorFloat B2_re = 0; 01179 B2_re += g20_re * b0_re; 01180 B2_re -= g20_im * b0_im; 01181 B2_re += g21_re * b1_re; 01182 B2_re -= 
g21_im * b1_im; 01183 B2_re += g22_re * b2_re; 01184 B2_re -= g22_im * b2_im; 01185 spinorFloat B2_im = 0; 01186 B2_im += g20_re * b0_im; 01187 B2_im += g20_im * b0_re; 01188 B2_im += g21_re * b1_im; 01189 B2_im += g21_im * b1_re; 01190 B2_im += g22_re * b2_im; 01191 B2_im += g22_im * b2_re; 01192 01193 o00_re += A0_re; 01194 o00_im += A0_im; 01195 o10_re += B0_re; 01196 o10_im += B0_im; 01197 o20_re += A0_im; 01198 o20_im -= A0_re; 01199 o30_re -= B0_im; 01200 o30_im += B0_re; 01201 01202 o01_re += A1_re; 01203 o01_im += A1_im; 01204 o11_re += B1_re; 01205 o11_im += B1_im; 01206 o21_re += A1_im; 01207 o21_im -= A1_re; 01208 o31_re -= B1_im; 01209 o31_im += B1_re; 01210 01211 o02_re += A2_re; 01212 o02_im += A2_im; 01213 o12_re += B2_re; 01214 o12_im += B2_im; 01215 o22_re += A2_im; 01216 o22_im -= A2_re; 01217 o32_re -= B2_im; 01218 o32_im += B2_re; 01219 01220 } 01221 01222 #ifdef MULTI_GPU 01223 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01224 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01225 #endif 01226 { 01227 // Projector P2- 01228 // 1 0 -i 0 01229 // 0 1 0 i 01230 // i 0 1 0 01231 // 0 -i 0 1 01232 01233 #ifdef MULTI_GPU 01234 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01235 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01236 #else 01237 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01238 #endif 01239 01240 #ifdef MULTI_GPU 01241 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01242 #else 01243 const int ga_idx = sp_idx; 01244 #endif 01245 01246 spinorFloat a0_re, a0_im; 01247 spinorFloat a1_re, a1_im; 01248 spinorFloat a2_re, a2_im; 01249 spinorFloat b0_re, b0_im; 01250 spinorFloat b1_re, b1_im; 01251 spinorFloat b2_re, b2_im; 01252 01253 #ifdef MULTI_GPU 01254 if (kernel_type == INTERIOR_KERNEL) { 01255 #endif 01256 01257 // read spinor from device memory 01258 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01259 01260 // project spinor into half spinors 01261 a0_re = +i00_re+i20_im; 01262 a0_im = +i00_im-i20_re; 01263 a1_re = +i01_re+i21_im; 01264 a1_im = +i01_im-i21_re; 01265 a2_re = +i02_re+i22_im; 01266 a2_im = +i02_im-i22_re; 01267 b0_re = +i10_re-i30_im; 01268 b0_im = +i10_im+i30_re; 01269 b1_re = +i11_re-i31_im; 01270 b1_im = +i11_im+i31_re; 01271 b2_re = +i12_re-i32_im; 01272 b2_im = +i12_im+i32_re; 01273 01274 #ifdef MULTI_GPU 01275 } else { 01276 01277 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01278 01279 // read half spinor from device memory 01280 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01281 01282 a0_re = i00_re; a0_im = i00_im; 01283 a1_re = i01_re; a1_im = i01_im; 01284 a2_re = i02_re; a2_im = i02_im; 01285 b0_re = i10_re; b0_im = i10_im; 01286 b1_re = i11_re; b1_im = i11_im; 01287 b2_re = i12_re; b2_im = i12_im; 01288 01289 } 01290 #endif // MULTI_GPU 01291 01292 // read gauge matrix from device memory 01293 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01294 01295 // reconstruct gauge matrix 01296 RECONSTRUCT_GAUGE_MATRIX(5); 01297 01298 // multiply row 0 01299 spinorFloat A0_re = 0; 01300 A0_re += gT00_re * a0_re; 01301 A0_re -= gT00_im * a0_im; 01302 A0_re += gT01_re * a1_re; 01303 A0_re -= gT01_im * a1_im; 01304 A0_re += gT02_re * a2_re; 01305 A0_re -= gT02_im * a2_im; 01306 spinorFloat A0_im = 0; 01307 A0_im += gT00_re * a0_im; 01308 A0_im += gT00_im * a0_re; 01309 A0_im += gT01_re * a1_im; 01310 A0_im += gT01_im * a1_re; 01311 
A0_im += gT02_re * a2_im; 01312 A0_im += gT02_im * a2_re; 01313 spinorFloat B0_re = 0; 01314 B0_re += gT00_re * b0_re; 01315 B0_re -= gT00_im * b0_im; 01316 B0_re += gT01_re * b1_re; 01317 B0_re -= gT01_im * b1_im; 01318 B0_re += gT02_re * b2_re; 01319 B0_re -= gT02_im * b2_im; 01320 spinorFloat B0_im = 0; 01321 B0_im += gT00_re * b0_im; 01322 B0_im += gT00_im * b0_re; 01323 B0_im += gT01_re * b1_im; 01324 B0_im += gT01_im * b1_re; 01325 B0_im += gT02_re * b2_im; 01326 B0_im += gT02_im * b2_re; 01327 01328 // multiply row 1 01329 spinorFloat A1_re = 0; 01330 A1_re += gT10_re * a0_re; 01331 A1_re -= gT10_im * a0_im; 01332 A1_re += gT11_re * a1_re; 01333 A1_re -= gT11_im * a1_im; 01334 A1_re += gT12_re * a2_re; 01335 A1_re -= gT12_im * a2_im; 01336 spinorFloat A1_im = 0; 01337 A1_im += gT10_re * a0_im; 01338 A1_im += gT10_im * a0_re; 01339 A1_im += gT11_re * a1_im; 01340 A1_im += gT11_im * a1_re; 01341 A1_im += gT12_re * a2_im; 01342 A1_im += gT12_im * a2_re; 01343 spinorFloat B1_re = 0; 01344 B1_re += gT10_re * b0_re; 01345 B1_re -= gT10_im * b0_im; 01346 B1_re += gT11_re * b1_re; 01347 B1_re -= gT11_im * b1_im; 01348 B1_re += gT12_re * b2_re; 01349 B1_re -= gT12_im * b2_im; 01350 spinorFloat B1_im = 0; 01351 B1_im += gT10_re * b0_im; 01352 B1_im += gT10_im * b0_re; 01353 B1_im += gT11_re * b1_im; 01354 B1_im += gT11_im * b1_re; 01355 B1_im += gT12_re * b2_im; 01356 B1_im += gT12_im * b2_re; 01357 01358 // multiply row 2 01359 spinorFloat A2_re = 0; 01360 A2_re += gT20_re * a0_re; 01361 A2_re -= gT20_im * a0_im; 01362 A2_re += gT21_re * a1_re; 01363 A2_re -= gT21_im * a1_im; 01364 A2_re += gT22_re * a2_re; 01365 A2_re -= gT22_im * a2_im; 01366 spinorFloat A2_im = 0; 01367 A2_im += gT20_re * a0_im; 01368 A2_im += gT20_im * a0_re; 01369 A2_im += gT21_re * a1_im; 01370 A2_im += gT21_im * a1_re; 01371 A2_im += gT22_re * a2_im; 01372 A2_im += gT22_im * a2_re; 01373 spinorFloat B2_re = 0; 01374 B2_re += gT20_re * b0_re; 01375 B2_re -= gT20_im * b0_im; 01376 B2_re += 
gT21_re * b1_re; 01377 B2_re -= gT21_im * b1_im; 01378 B2_re += gT22_re * b2_re; 01379 B2_re -= gT22_im * b2_im; 01380 spinorFloat B2_im = 0; 01381 B2_im += gT20_re * b0_im; 01382 B2_im += gT20_im * b0_re; 01383 B2_im += gT21_re * b1_im; 01384 B2_im += gT21_im * b1_re; 01385 B2_im += gT22_re * b2_im; 01386 B2_im += gT22_im * b2_re; 01387 01388 o00_re += A0_re; 01389 o00_im += A0_im; 01390 o10_re += B0_re; 01391 o10_im += B0_im; 01392 o20_re -= A0_im; 01393 o20_im += A0_re; 01394 o30_re += B0_im; 01395 o30_im -= B0_re; 01396 01397 o01_re += A1_re; 01398 o01_im += A1_im; 01399 o11_re += B1_re; 01400 o11_im += B1_im; 01401 o21_re -= A1_im; 01402 o21_im += A1_re; 01403 o31_re += B1_im; 01404 o31_im -= B1_re; 01405 01406 o02_re += A2_re; 01407 o02_im += A2_im; 01408 o12_re += B2_re; 01409 o12_im += B2_im; 01410 o22_re -= A2_im; 01411 o22_im += A2_re; 01412 o32_re += B2_im; 01413 o32_im -= B2_re; 01414 01415 } 01416 01417 #ifdef MULTI_GPU 01418 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01419 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01420 #endif 01421 { 01422 // Projector P3+ 01423 // 2 0 0 0 01424 // 0 2 0 0 01425 // 0 0 0 0 01426 // 0 0 0 0 01427 01428 #ifdef MULTI_GPU 01429 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01430 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01431 #else 01432 const int sp_idx = (x4==X4m1 ? 
X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01433 #endif 01434 01435 const int ga_idx = sid; 01436 01437 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01438 { 01439 spinorFloat a0_re, a0_im; 01440 spinorFloat a1_re, a1_im; 01441 spinorFloat a2_re, a2_im; 01442 spinorFloat b0_re, b0_im; 01443 spinorFloat b1_re, b1_im; 01444 spinorFloat b2_re, b2_im; 01445 01446 #ifdef MULTI_GPU 01447 if (kernel_type == INTERIOR_KERNEL) { 01448 #endif 01449 01450 // read spinor from device memory 01451 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01452 01453 // project spinor into half spinors 01454 a0_re = +2*i00_re; 01455 a0_im = +2*i00_im; 01456 a1_re = +2*i01_re; 01457 a1_im = +2*i01_im; 01458 a2_re = +2*i02_re; 01459 a2_im = +2*i02_im; 01460 b0_re = +2*i10_re; 01461 b0_im = +2*i10_im; 01462 b1_re = +2*i11_re; 01463 b1_im = +2*i11_im; 01464 b2_re = +2*i12_re; 01465 b2_im = +2*i12_im; 01466 01467 #ifdef MULTI_GPU 01468 } else { 01469 01470 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01471 const int t_proj_scale = TPROJSCALE; 01472 01473 // read half spinor from device memory 01474 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01475 01476 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01477 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01478 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01479 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01480 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01481 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01482 01483 } 01484 #endif // MULTI_GPU 01485 01486 // identity gauge matrix 01487 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01488 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01489 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01490 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01491 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01492 spinorFloat B2_re = 
b2_re; spinorFloat B2_im = b2_im; 01493 01494 o00_re += A0_re; 01495 o00_im += A0_im; 01496 o10_re += B0_re; 01497 o10_im += B0_im; 01498 01499 o01_re += A1_re; 01500 o01_im += A1_im; 01501 o11_re += B1_re; 01502 o11_im += B1_im; 01503 01504 o02_re += A2_re; 01505 o02_im += A2_im; 01506 o12_re += B2_re; 01507 o12_im += B2_im; 01508 01509 } else { 01510 spinorFloat a0_re, a0_im; 01511 spinorFloat a1_re, a1_im; 01512 spinorFloat a2_re, a2_im; 01513 spinorFloat b0_re, b0_im; 01514 spinorFloat b1_re, b1_im; 01515 spinorFloat b2_re, b2_im; 01516 01517 #ifdef MULTI_GPU 01518 if (kernel_type == INTERIOR_KERNEL) { 01519 #endif 01520 01521 // read spinor from device memory 01522 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01523 01524 // project spinor into half spinors 01525 a0_re = +2*i00_re; 01526 a0_im = +2*i00_im; 01527 a1_re = +2*i01_re; 01528 a1_im = +2*i01_im; 01529 a2_re = +2*i02_re; 01530 a2_im = +2*i02_im; 01531 b0_re = +2*i10_re; 01532 b0_im = +2*i10_im; 01533 b1_re = +2*i11_re; 01534 b1_im = +2*i11_im; 01535 b2_re = +2*i12_re; 01536 b2_im = +2*i12_im; 01537 01538 #ifdef MULTI_GPU 01539 } else { 01540 01541 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01542 const int t_proj_scale = TPROJSCALE; 01543 01544 // read half spinor from device memory 01545 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01546 01547 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01548 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01549 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01550 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01551 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01552 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01553 01554 } 01555 #endif // MULTI_GPU 01556 01557 // read gauge matrix from device memory 01558 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01559 01560 // reconstruct gauge matrix 01561 
RECONSTRUCT_GAUGE_MATRIX(6); 01562 01563 // multiply row 0 01564 spinorFloat A0_re = 0; 01565 A0_re += g00_re * a0_re; 01566 A0_re -= g00_im * a0_im; 01567 A0_re += g01_re * a1_re; 01568 A0_re -= g01_im * a1_im; 01569 A0_re += g02_re * a2_re; 01570 A0_re -= g02_im * a2_im; 01571 spinorFloat A0_im = 0; 01572 A0_im += g00_re * a0_im; 01573 A0_im += g00_im * a0_re; 01574 A0_im += g01_re * a1_im; 01575 A0_im += g01_im * a1_re; 01576 A0_im += g02_re * a2_im; 01577 A0_im += g02_im * a2_re; 01578 spinorFloat B0_re = 0; 01579 B0_re += g00_re * b0_re; 01580 B0_re -= g00_im * b0_im; 01581 B0_re += g01_re * b1_re; 01582 B0_re -= g01_im * b1_im; 01583 B0_re += g02_re * b2_re; 01584 B0_re -= g02_im * b2_im; 01585 spinorFloat B0_im = 0; 01586 B0_im += g00_re * b0_im; 01587 B0_im += g00_im * b0_re; 01588 B0_im += g01_re * b1_im; 01589 B0_im += g01_im * b1_re; 01590 B0_im += g02_re * b2_im; 01591 B0_im += g02_im * b2_re; 01592 01593 // multiply row 1 01594 spinorFloat A1_re = 0; 01595 A1_re += g10_re * a0_re; 01596 A1_re -= g10_im * a0_im; 01597 A1_re += g11_re * a1_re; 01598 A1_re -= g11_im * a1_im; 01599 A1_re += g12_re * a2_re; 01600 A1_re -= g12_im * a2_im; 01601 spinorFloat A1_im = 0; 01602 A1_im += g10_re * a0_im; 01603 A1_im += g10_im * a0_re; 01604 A1_im += g11_re * a1_im; 01605 A1_im += g11_im * a1_re; 01606 A1_im += g12_re * a2_im; 01607 A1_im += g12_im * a2_re; 01608 spinorFloat B1_re = 0; 01609 B1_re += g10_re * b0_re; 01610 B1_re -= g10_im * b0_im; 01611 B1_re += g11_re * b1_re; 01612 B1_re -= g11_im * b1_im; 01613 B1_re += g12_re * b2_re; 01614 B1_re -= g12_im * b2_im; 01615 spinorFloat B1_im = 0; 01616 B1_im += g10_re * b0_im; 01617 B1_im += g10_im * b0_re; 01618 B1_im += g11_re * b1_im; 01619 B1_im += g11_im * b1_re; 01620 B1_im += g12_re * b2_im; 01621 B1_im += g12_im * b2_re; 01622 01623 // multiply row 2 01624 spinorFloat A2_re = 0; 01625 A2_re += g20_re * a0_re; 01626 A2_re -= g20_im * a0_im; 01627 A2_re += g21_re * a1_re; 01628 A2_re -= g21_im * a1_im; 01629 
A2_re += g22_re * a2_re; 01630 A2_re -= g22_im * a2_im; 01631 spinorFloat A2_im = 0; 01632 A2_im += g20_re * a0_im; 01633 A2_im += g20_im * a0_re; 01634 A2_im += g21_re * a1_im; 01635 A2_im += g21_im * a1_re; 01636 A2_im += g22_re * a2_im; 01637 A2_im += g22_im * a2_re; 01638 spinorFloat B2_re = 0; 01639 B2_re += g20_re * b0_re; 01640 B2_re -= g20_im * b0_im; 01641 B2_re += g21_re * b1_re; 01642 B2_re -= g21_im * b1_im; 01643 B2_re += g22_re * b2_re; 01644 B2_re -= g22_im * b2_im; 01645 spinorFloat B2_im = 0; 01646 B2_im += g20_re * b0_im; 01647 B2_im += g20_im * b0_re; 01648 B2_im += g21_re * b1_im; 01649 B2_im += g21_im * b1_re; 01650 B2_im += g22_re * b2_im; 01651 B2_im += g22_im * b2_re; 01652 01653 o00_re += A0_re; 01654 o00_im += A0_im; 01655 o10_re += B0_re; 01656 o10_im += B0_im; 01657 01658 o01_re += A1_re; 01659 o01_im += A1_im; 01660 o11_re += B1_re; 01661 o11_im += B1_im; 01662 01663 o02_re += A2_re; 01664 o02_im += A2_im; 01665 o12_re += B2_re; 01666 o12_im += B2_im; 01667 01668 } 01669 } 01670 01671 #ifdef MULTI_GPU 01672 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) || 01673 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01674 #endif 01675 { 01676 // Projector P3- 01677 // 0 0 0 0 01678 // 0 0 0 0 01679 // 0 0 2 0 01680 // 0 0 0 2 01681 01682 #ifdef MULTI_GPU 01683 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01684 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01685 #else 01686 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01687 #endif 01688 01689 #ifdef MULTI_GPU 01690 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01691 #else 01692 const int ga_idx = sp_idx; 01693 #endif 01694 01695 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01696 { 01697 spinorFloat a0_re, a0_im; 01698 spinorFloat a1_re, a1_im; 01699 spinorFloat a2_re, a2_im; 01700 spinorFloat b0_re, b0_im; 01701 spinorFloat b1_re, b1_im; 01702 spinorFloat b2_re, b2_im; 01703 01704 #ifdef MULTI_GPU 01705 if (kernel_type == INTERIOR_KERNEL) { 01706 #endif 01707 01708 // read spinor from device memory 01709 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01710 01711 // project spinor into half spinors 01712 a0_re = +2*i20_re; 01713 a0_im = +2*i20_im; 01714 a1_re = +2*i21_re; 01715 a1_im = +2*i21_im; 01716 a2_re = +2*i22_re; 01717 a2_im = +2*i22_im; 01718 b0_re = +2*i30_re; 01719 b0_im = +2*i30_im; 01720 b1_re = +2*i31_re; 01721 b1_im = +2*i31_im; 01722 b2_re = +2*i32_re; 01723 b2_im = +2*i32_im; 01724 01725 #ifdef MULTI_GPU 01726 } else { 01727 01728 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01729 const int t_proj_scale = TPROJSCALE; 01730 01731 // read half spinor from device memory 01732 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01733 01734 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01735 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01736 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01737 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01738 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01739 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01740 01741 } 01742 #endif // MULTI_GPU 01743 01744 // identity gauge matrix 01745 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01746 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01747 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01748 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01749 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01750 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 
01751 01752 o20_re += A0_re; 01753 o20_im += A0_im; 01754 o30_re += B0_re; 01755 o30_im += B0_im; 01756 01757 o21_re += A1_re; 01758 o21_im += A1_im; 01759 o31_re += B1_re; 01760 o31_im += B1_im; 01761 01762 o22_re += A2_re; 01763 o22_im += A2_im; 01764 o32_re += B2_re; 01765 o32_im += B2_im; 01766 01767 } else { 01768 spinorFloat a0_re, a0_im; 01769 spinorFloat a1_re, a1_im; 01770 spinorFloat a2_re, a2_im; 01771 spinorFloat b0_re, b0_im; 01772 spinorFloat b1_re, b1_im; 01773 spinorFloat b2_re, b2_im; 01774 01775 #ifdef MULTI_GPU 01776 if (kernel_type == INTERIOR_KERNEL) { 01777 #endif 01778 01779 // read spinor from device memory 01780 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01781 01782 // project spinor into half spinors 01783 a0_re = +2*i20_re; 01784 a0_im = +2*i20_im; 01785 a1_re = +2*i21_re; 01786 a1_im = +2*i21_im; 01787 a2_re = +2*i22_re; 01788 a2_im = +2*i22_im; 01789 b0_re = +2*i30_re; 01790 b0_im = +2*i30_im; 01791 b1_re = +2*i31_re; 01792 b1_im = +2*i31_im; 01793 b2_re = +2*i32_re; 01794 b2_im = +2*i32_im; 01795 01796 #ifdef MULTI_GPU 01797 } else { 01798 01799 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01800 const int t_proj_scale = TPROJSCALE; 01801 01802 // read half spinor from device memory 01803 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01804 01805 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01806 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01807 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01808 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01809 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01810 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01811 01812 } 01813 #endif // MULTI_GPU 01814 01815 // read gauge matrix from device memory 01816 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01817 01818 // reconstruct gauge matrix 01819 RECONSTRUCT_GAUGE_MATRIX(7); 01820 01821 // multiply row 0 01822 
spinorFloat A0_re = 0; 01823 A0_re += gT00_re * a0_re; 01824 A0_re -= gT00_im * a0_im; 01825 A0_re += gT01_re * a1_re; 01826 A0_re -= gT01_im * a1_im; 01827 A0_re += gT02_re * a2_re; 01828 A0_re -= gT02_im * a2_im; 01829 spinorFloat A0_im = 0; 01830 A0_im += gT00_re * a0_im; 01831 A0_im += gT00_im * a0_re; 01832 A0_im += gT01_re * a1_im; 01833 A0_im += gT01_im * a1_re; 01834 A0_im += gT02_re * a2_im; 01835 A0_im += gT02_im * a2_re; 01836 spinorFloat B0_re = 0; 01837 B0_re += gT00_re * b0_re; 01838 B0_re -= gT00_im * b0_im; 01839 B0_re += gT01_re * b1_re; 01840 B0_re -= gT01_im * b1_im; 01841 B0_re += gT02_re * b2_re; 01842 B0_re -= gT02_im * b2_im; 01843 spinorFloat B0_im = 0; 01844 B0_im += gT00_re * b0_im; 01845 B0_im += gT00_im * b0_re; 01846 B0_im += gT01_re * b1_im; 01847 B0_im += gT01_im * b1_re; 01848 B0_im += gT02_re * b2_im; 01849 B0_im += gT02_im * b2_re; 01850 01851 // multiply row 1 01852 spinorFloat A1_re = 0; 01853 A1_re += gT10_re * a0_re; 01854 A1_re -= gT10_im * a0_im; 01855 A1_re += gT11_re * a1_re; 01856 A1_re -= gT11_im * a1_im; 01857 A1_re += gT12_re * a2_re; 01858 A1_re -= gT12_im * a2_im; 01859 spinorFloat A1_im = 0; 01860 A1_im += gT10_re * a0_im; 01861 A1_im += gT10_im * a0_re; 01862 A1_im += gT11_re * a1_im; 01863 A1_im += gT11_im * a1_re; 01864 A1_im += gT12_re * a2_im; 01865 A1_im += gT12_im * a2_re; 01866 spinorFloat B1_re = 0; 01867 B1_re += gT10_re * b0_re; 01868 B1_re -= gT10_im * b0_im; 01869 B1_re += gT11_re * b1_re; 01870 B1_re -= gT11_im * b1_im; 01871 B1_re += gT12_re * b2_re; 01872 B1_re -= gT12_im * b2_im; 01873 spinorFloat B1_im = 0; 01874 B1_im += gT10_re * b0_im; 01875 B1_im += gT10_im * b0_re; 01876 B1_im += gT11_re * b1_im; 01877 B1_im += gT11_im * b1_re; 01878 B1_im += gT12_re * b2_im; 01879 B1_im += gT12_im * b2_re; 01880 01881 // multiply row 2 01882 spinorFloat A2_re = 0; 01883 A2_re += gT20_re * a0_re; 01884 A2_re -= gT20_im * a0_im; 01885 A2_re += gT21_re * a1_re; 01886 A2_re -= gT21_im * a1_im; 01887 A2_re += 
gT22_re * a2_re; 01888 A2_re -= gT22_im * a2_im; 01889 spinorFloat A2_im = 0; 01890 A2_im += gT20_re * a0_im; 01891 A2_im += gT20_im * a0_re; 01892 A2_im += gT21_re * a1_im; 01893 A2_im += gT21_im * a1_re; 01894 A2_im += gT22_re * a2_im; 01895 A2_im += gT22_im * a2_re; 01896 spinorFloat B2_re = 0; 01897 B2_re += gT20_re * b0_re; 01898 B2_re -= gT20_im * b0_im; 01899 B2_re += gT21_re * b1_re; 01900 B2_re -= gT21_im * b1_im; 01901 B2_re += gT22_re * b2_re; 01902 B2_re -= gT22_im * b2_im; 01903 spinorFloat B2_im = 0; 01904 B2_im += gT20_re * b0_im; 01905 B2_im += gT20_im * b0_re; 01906 B2_im += gT21_re * b1_im; 01907 B2_im += gT21_im * b1_re; 01908 B2_im += gT22_re * b2_im; 01909 B2_im += gT22_im * b2_re; 01910 01911 o20_re += A0_re; 01912 o20_im += A0_im; 01913 o30_re += B0_re; 01914 o30_im += B0_im; 01915 01916 o21_re += A1_re; 01917 o21_im += A1_im; 01918 o31_re += B1_re; 01919 o31_im += B1_im; 01920 01921 o22_re += A2_re; 01922 o22_im += A2_im; 01923 o32_re += B2_re; 01924 o32_im += B2_im; 01925 01926 } 01927 } 01928 01929 #ifdef MULTI_GPU 01930 01931 int incomplete = 0; // Have all 8 contributions been computed for this site? 
01932 01933 switch(kernel_type) { // intentional fall-through 01934 case INTERIOR_KERNEL: 01935 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1)); 01936 case EXTERIOR_KERNEL_T: 01937 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1)); 01938 case EXTERIOR_KERNEL_Z: 01939 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1)); 01940 case EXTERIOR_KERNEL_Y: 01941 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1)); 01942 } 01943 01944 if (!incomplete) 01945 #endif // MULTI_GPU 01946 { 01947 01948 { 01949 // apply twisted mass rotation 01950 VOLATILE spinorFloat tmp00_re = +o00_re-o20_im*a; 01951 VOLATILE spinorFloat tmp00_im = +o00_im+o20_re*a; 01952 VOLATILE spinorFloat tmp01_re = +o01_re-o21_im*a; 01953 VOLATILE spinorFloat tmp01_im = +o01_im+o21_re*a; 01954 VOLATILE spinorFloat tmp02_re = +o02_re-o22_im*a; 01955 VOLATILE spinorFloat tmp02_im = +o02_im+o22_re*a; 01956 01957 VOLATILE spinorFloat tmp10_re = +o10_re-o30_im*a; 01958 VOLATILE spinorFloat tmp10_im = +o10_im+o30_re*a; 01959 VOLATILE spinorFloat tmp11_re = +o11_re-o31_im*a; 01960 VOLATILE spinorFloat tmp11_im = +o11_im+o31_re*a; 01961 VOLATILE spinorFloat tmp12_re = +o12_re-o32_im*a; 01962 VOLATILE spinorFloat tmp12_im = +o12_im+o32_re*a; 01963 01964 VOLATILE spinorFloat tmp20_re = -o00_im*a+o20_re; 01965 VOLATILE spinorFloat tmp20_im = +o00_re*a+o20_im; 01966 VOLATILE spinorFloat tmp21_re = -o01_im*a+o21_re; 01967 VOLATILE spinorFloat tmp21_im = +o01_re*a+o21_im; 01968 VOLATILE spinorFloat tmp22_re = -o02_im*a+o22_re; 01969 VOLATILE spinorFloat tmp22_im = +o02_re*a+o22_im; 01970 01971 VOLATILE spinorFloat tmp30_re = -o10_im*a+o30_re; 01972 VOLATILE spinorFloat tmp30_im = +o10_re*a+o30_im; 01973 VOLATILE spinorFloat tmp31_re = -o11_im*a+o31_re; 01974 VOLATILE spinorFloat tmp31_im = +o11_re*a+o31_im; 01975 VOLATILE spinorFloat tmp32_re = -o12_im*a+o32_re; 01976 VOLATILE spinorFloat tmp32_im = +o12_re*a+o32_im; 01977 01978 01979 #ifndef 
DSLASH_XPAY 01980 //scale by b = 1/(1 + a*a) 01981 o00_re = b*tmp00_re; 01982 o00_im = b*tmp00_im; 01983 o01_re = b*tmp01_re; 01984 o01_im = b*tmp01_im; 01985 o02_re = b*tmp02_re; 01986 o02_im = b*tmp02_im; 01987 o10_re = b*tmp10_re; 01988 o10_im = b*tmp10_im; 01989 o11_re = b*tmp11_re; 01990 o11_im = b*tmp11_im; 01991 o12_re = b*tmp12_re; 01992 o12_im = b*tmp12_im; 01993 o20_re = b*tmp20_re; 01994 o20_im = b*tmp20_im; 01995 o21_re = b*tmp21_re; 01996 o21_im = b*tmp21_im; 01997 o22_re = b*tmp22_re; 01998 o22_im = b*tmp22_im; 01999 o30_re = b*tmp30_re; 02000 o30_im = b*tmp30_im; 02001 o31_re = b*tmp31_re; 02002 o31_im = b*tmp31_im; 02003 o32_re = b*tmp32_re; 02004 o32_im = b*tmp32_im; 02005 #else 02006 o00_re = tmp00_re; 02007 o00_im = tmp00_im; 02008 o01_re = tmp01_re; 02009 o01_im = tmp01_im; 02010 o02_re = tmp02_re; 02011 o02_im = tmp02_im; 02012 o10_re = tmp10_re; 02013 o10_im = tmp10_im; 02014 o11_re = tmp11_re; 02015 o11_im = tmp11_im; 02016 o12_re = tmp12_re; 02017 o12_im = tmp12_im; 02018 o20_re = tmp20_re; 02019 o20_im = tmp20_im; 02020 o21_re = tmp21_re; 02021 o21_im = tmp21_im; 02022 o22_re = tmp22_re; 02023 o22_im = tmp22_im; 02024 o30_re = tmp30_re; 02025 o30_im = tmp30_im; 02026 o31_re = tmp31_re; 02027 o31_im = tmp31_im; 02028 o32_re = tmp32_re; 02029 o32_im = tmp32_im; 02030 #endif // DSLASH_XPAY 02031 02032 } 02033 #ifdef DSLASH_XPAY 02034 02035 READ_ACCUM(ACCUMTEX, sp_stride) 02036 02037 #ifdef SPINOR_DOUBLE 02038 o00_re = b*o00_re + accum0.x; 02039 o00_im = b*o00_im + accum0.y; 02040 o01_re = b*o01_re + accum1.x; 02041 o01_im = b*o01_im + accum1.y; 02042 o02_re = b*o02_re + accum2.x; 02043 o02_im = b*o02_im + accum2.y; 02044 o10_re = b*o10_re + accum3.x; 02045 o10_im = b*o10_im + accum3.y; 02046 o11_re = b*o11_re + accum4.x; 02047 o11_im = b*o11_im + accum4.y; 02048 o12_re = b*o12_re + accum5.x; 02049 o12_im = b*o12_im + accum5.y; 02050 o20_re = b*o20_re + accum6.x; 02051 o20_im = b*o20_im + accum6.y; 02052 o21_re = b*o21_re + accum7.x; 02053 
o21_im = b*o21_im + accum7.y; 02054 o22_re = b*o22_re + accum8.x; 02055 o22_im = b*o22_im + accum8.y; 02056 o30_re = b*o30_re + accum9.x; 02057 o30_im = b*o30_im + accum9.y; 02058 o31_re = b*o31_re + accum10.x; 02059 o31_im = b*o31_im + accum10.y; 02060 o32_re = b*o32_re + accum11.x; 02061 o32_im = b*o32_im + accum11.y; 02062 #else 02063 o00_re = b*o00_re + accum0.x; 02064 o00_im = b*o00_im + accum0.y; 02065 o01_re = b*o01_re + accum0.z; 02066 o01_im = b*o01_im + accum0.w; 02067 o02_re = b*o02_re + accum1.x; 02068 o02_im = b*o02_im + accum1.y; 02069 o10_re = b*o10_re + accum1.z; 02070 o10_im = b*o10_im + accum1.w; 02071 o11_re = b*o11_re + accum2.x; 02072 o11_im = b*o11_im + accum2.y; 02073 o12_re = b*o12_re + accum2.z; 02074 o12_im = b*o12_im + accum2.w; 02075 o20_re = b*o20_re + accum3.x; 02076 o20_im = b*o20_im + accum3.y; 02077 o21_re = b*o21_re + accum3.z; 02078 o21_im = b*o21_im + accum3.w; 02079 o22_re = b*o22_re + accum4.x; 02080 o22_im = b*o22_im + accum4.y; 02081 o30_re = b*o30_re + accum4.z; 02082 o30_im = b*o30_im + accum4.w; 02083 o31_re = b*o31_re + accum5.x; 02084 o31_im = b*o31_im + accum5.y; 02085 o32_re = b*o32_re + accum5.z; 02086 o32_im = b*o32_im + accum5.w; 02087 #endif // SPINOR_DOUBLE 02088 02089 #endif // DSLASH_XPAY 02090 } 02091 02092 // write spinor field back to device memory 02093 WRITE_SPINOR(sp_stride); 02094 02095 // undefine to prevent warning when precision is changed 02096 #undef spinorFloat 02097 #undef SHARED_STRIDE 02098 02099 #undef A_re 02100 #undef A_im 02101 02102 #undef g00_re 02103 #undef g00_im 02104 #undef g01_re 02105 #undef g01_im 02106 #undef g02_re 02107 #undef g02_im 02108 #undef g10_re 02109 #undef g10_im 02110 #undef g11_re 02111 #undef g11_im 02112 #undef g12_re 02113 #undef g12_im 02114 #undef g20_re 02115 #undef g20_im 02116 #undef g21_re 02117 #undef g21_im 02118 #undef g22_re 02119 #undef g22_im 02120 02121 #undef i00_re 02122 #undef i00_im 02123 #undef i01_re 02124 #undef i01_im 02125 #undef i02_re 02126 
#undef i02_im 02127 #undef i10_re 02128 #undef i10_im 02129 #undef i11_re 02130 #undef i11_im 02131 #undef i12_re 02132 #undef i12_im 02133 #undef i20_re 02134 #undef i20_im 02135 #undef i21_re 02136 #undef i21_im 02137 #undef i22_re 02138 #undef i22_im 02139 #undef i30_re 02140 #undef i30_im 02141 #undef i31_re 02142 #undef i31_im 02143 #undef i32_re 02144 #undef i32_im 02145 02146 02147 #undef o00_re 02148 #undef o00_im 02149 #undef o01_re 02150 #undef o01_im 02151 #undef o02_re 02152 #undef o02_im 02153 #undef o10_re 02154 #undef o10_im 02155 #undef o11_re 02156 #undef o11_im 02157 #undef o12_re 02158 #undef o12_im 02159 #undef o20_re 02160 #undef o20_im 02161 #undef o21_re 02162 #undef o21_im 02163 #undef o22_re 02164 #undef o22_im 02165 #undef o30_re 02166 02167 #undef VOLATILE