QUDA v0.4.0
A library for QCD on GPUs
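The listing below is the generated body of a QUDA Wilson-clover dslash CUDA kernel: for each lattice site it gathers the eight nearest-neighbour spinors, spin-projects them to half spinors, multiplies by the corresponding gauge links, and accumulates the result (the clover macros map the inverted clover term). As orientation only (editor's notation, not taken from the listing; the exact gamma basis is fixed by the projector matrices printed in the comments, e.g. "Projector P0-"), the hopping term being accumulated has the standard form

\[
  D\,\psi(x) \;=\; \sum_{\mu=1}^{4}\Big[\,U_\mu(x)\,P_\mu^{-}\,\psi(x+\hat\mu)
    \;+\;U_\mu^{\dagger}(x-\hat\mu)\,P_\mu^{+}\,\psi(x-\hat\mu)\Big],
  \qquad P_\mu^{\pm} \;=\; \mathbb{1} \pm \gamma_\mu .
\]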
// *** CUDA DSLASH ***

#define DSLASH_SHARED_FLOATS_PER_THREAD 19


#if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
#define VOLATILE
#else // Open64 compiler
#define VOLATILE volatile
#endif
// input spinor
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I1.x
#define i01_im I1.y
#define i02_re I2.x
#define i02_im I2.y
#define i10_re I3.x
#define i10_im I3.y
#define i11_re I4.x
#define i11_im I4.y
#define i12_re I5.x
#define i12_im I5.y
#define i20_re I6.x
#define i20_im I6.y
#define i21_re I7.x
#define i21_im I7.y
#define i22_re I8.x
#define i22_im I8.y
#define i30_re I9.x
#define i30_im I9.y
#define i31_re I10.x
#define i31_im I10.y
#define i32_re I11.x
#define i32_im I11.y
#else
#define spinorFloat float
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I0.z
#define i01_im I0.w
#define i02_re I1.x
#define i02_im I1.y
#define i10_re I1.z
#define i10_im I1.w
#define i11_re I2.x
#define i11_im I2.y
#define i12_re I2.z
#define i12_im I2.w
#define i20_re I3.x
#define i20_im I3.y
#define i21_re I3.z
#define i21_im I3.w
#define i22_re I4.x
#define i22_im I4.y
#define i30_re I4.z
#define i30_im I4.w
#define i31_re I5.x
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif // SPINOR_DOUBLE

// gauge link
#ifdef GAUGE_FLOAT2
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
#define g01_im G1.y
#define g02_re G2.x
#define g02_im G2.y
#define g10_re G3.x
#define g10_im G3.y
#define g11_re G4.x
#define g11_im G4.y
#define g12_re G5.x
#define g12_im G5.y
#define g20_re G6.x
#define g20_im G6.y
#define g21_re G7.x
#define g21_im G7.y
#define g22_re G8.x
#define g22_im G8.y
// temporaries
#define A_re G9.x
#define A_im G9.y

#else
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G0.z
#define g01_im G0.w
#define g02_re G1.x
#define g02_im G1.y
#define g10_re G1.z
#define g10_im G1.w
#define g11_re G2.x
#define g11_im G2.y
#define g12_re G2.z
#define g12_im G2.w
#define g20_re G3.x
#define g20_im G3.y
#define g21_re G3.z
#define g21_im G3.w
#define g22_re G4.x
#define g22_im G4.y
// temporaries
#define A_re G4.z
#define A_im G4.w

#endif // GAUGE_FLOAT2

// conjugated gauge link
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)

// first chiral block of inverted clover term
#ifdef CLOVER_DOUBLE
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C1.x
#define c10_10_re C1.y
#define c11_11_re C2.x
#define c12_12_re C2.y
#define c01_00_re C3.x
#define c01_00_im C3.y
#define c02_00_re C4.x
#define c02_00_im C4.y
#define c10_00_re C5.x
#define c10_00_im C5.y
#define c11_00_re C6.x
#define c11_00_im C6.y
#define c12_00_re C7.x
#define c12_00_im C7.y
#define c02_01_re C8.x
#define c02_01_im C8.y
#define c10_01_re C9.x
#define c10_01_im C9.y
#define c11_01_re C10.x
#define c11_01_im C10.y
#define c12_01_re C11.x
#define c12_01_im C11.y
#define c10_02_re C12.x
#define c10_02_im C12.y
#define c11_02_re C13.x
#define c11_02_im C13.y
#define c12_02_re C14.x
#define c12_02_im C14.y
#define c11_10_re C15.x
#define c11_10_im C15.y
#define c12_10_re C16.x
#define c12_10_im C16.y
#define c12_11_re C17.x
#define c12_11_im C17.y
#else
#define c00_00_re C0.x
#define c01_01_re C0.y
#define c02_02_re C0.z
#define c10_10_re C0.w
#define c11_11_re C1.x
#define c12_12_re C1.y
#define c01_00_re C1.z
#define c01_00_im C1.w
#define c02_00_re C2.x
#define c02_00_im C2.y
#define c10_00_re C2.z
#define c10_00_im C2.w
#define c11_00_re C3.x
#define c11_00_im C3.y
#define c12_00_re C3.z
#define c12_00_im C3.w
#define c02_01_re C4.x
#define c02_01_im C4.y
#define c10_01_re C4.z
#define c10_01_im C4.w
#define c11_01_re C5.x
#define c11_01_im C5.y
#define c12_01_re C5.z
#define c12_01_im C5.w
#define c10_02_re C6.x
#define c10_02_im C6.y
#define c11_02_re C6.z
#define c11_02_im C6.w
#define c12_02_re C7.x
#define c12_02_im C7.y
#define c11_10_re C7.z
#define c11_10_im C7.w
#define c12_10_re C8.x
#define c12_10_im C8.y
#define c12_11_re C8.z
#define c12_11_im C8.w
#endif // CLOVER_DOUBLE

#define c00_01_re (+c01_00_re)
#define c00_01_im (-c01_00_im)
#define c00_02_re (+c02_00_re)
#define c00_02_im (-c02_00_im)
#define c01_02_re (+c02_01_re)
#define c01_02_im (-c02_01_im)
#define c00_10_re (+c10_00_re)
#define c00_10_im (-c10_00_im)
#define c01_10_re (+c10_01_re)
#define c01_10_im (-c10_01_im)
#define c02_10_re (+c10_02_re)
#define c02_10_im (-c10_02_im)
#define c00_11_re (+c11_00_re)
#define c00_11_im (-c11_00_im)
#define c01_11_re (+c11_01_re)
#define c01_11_im (-c11_01_im)
#define c02_11_re (+c11_02_re)
#define c02_11_im (-c11_02_im)
#define c10_11_re (+c11_10_re)
#define c10_11_im (-c11_10_im)
#define c00_12_re (+c12_00_re)
#define c00_12_im (-c12_00_im)
#define c01_12_re (+c12_01_re)
#define c01_12_im (-c12_01_im)
#define c02_12_re (+c12_02_re)
#define c02_12_im (-c12_02_im)
#define c10_12_re (+c12_10_re)
#define c10_12_im (-c12_10_im)
#define c11_12_re (+c12_11_re)
#define c11_12_im (-c12_11_im)

// second chiral block of inverted clover term (reuses C0,...,C9)
#define c20_20_re c00_00_re
#define c21_20_re c01_00_re
#define c21_20_im c01_00_im
#define c22_20_re c02_00_re
#define c22_20_im c02_00_im
#define c30_20_re c10_00_re
#define c30_20_im c10_00_im
#define c31_20_re c11_00_re
#define c31_20_im c11_00_im
#define c32_20_re c12_00_re
#define c32_20_im c12_00_im
#define c20_21_re c00_01_re
#define c20_21_im c00_01_im
#define c21_21_re c01_01_re
#define c22_21_re c02_01_re
#define c22_21_im c02_01_im
#define c30_21_re c10_01_re
#define c30_21_im c10_01_im
#define c31_21_re c11_01_re
#define c31_21_im c11_01_im
#define c32_21_re c12_01_re
#define c32_21_im c12_01_im
#define c20_22_re c00_02_re
#define c20_22_im c00_02_im
#define c21_22_re c01_02_re
#define c21_22_im c01_02_im
#define c22_22_re c02_02_re
#define c30_22_re c10_02_re
#define c30_22_im c10_02_im
#define c31_22_re c11_02_re
#define c31_22_im c11_02_im
#define c32_22_re c12_02_re
#define c32_22_im c12_02_im
#define c20_30_re c00_10_re
#define c20_30_im c00_10_im
#define c21_30_re c01_10_re
#define c21_30_im c01_10_im
#define c22_30_re c02_10_re
#define c22_30_im c02_10_im
#define c30_30_re c10_10_re
#define c31_30_re c11_10_re
#define c31_30_im c11_10_im
#define c32_30_re c12_10_re
#define c32_30_im c12_10_im
#define c20_31_re c00_11_re
#define c20_31_im c00_11_im
#define c21_31_re c01_11_re
#define c21_31_im c01_11_im
#define c22_31_re c02_11_re
#define c22_31_im c02_11_im
#define c30_31_re c10_11_re
#define c30_31_im c10_11_im
#define c31_31_re c11_11_re
#define c32_31_re c12_11_re
#define c32_31_im c12_11_im
#define c20_32_re c00_12_re
#define c20_32_im c00_12_im
#define c21_32_re c01_12_re
#define c21_32_im c01_12_im
#define c22_32_re c02_12_re
#define c22_32_im c02_12_im
#define c30_32_re c10_12_re
#define c30_32_im c10_12_im
#define c31_32_re c11_12_re
#define c31_32_im c11_12_im
#define c32_32_re c12_12_re

// output spinor
#define o00_re s[0*SHARED_STRIDE]
#define o00_im s[1*SHARED_STRIDE]
#define o01_re s[2*SHARED_STRIDE]
#define o01_im s[3*SHARED_STRIDE]
#define o02_re s[4*SHARED_STRIDE]
#define o02_im s[5*SHARED_STRIDE]
#define o10_re s[6*SHARED_STRIDE]
#define o10_im s[7*SHARED_STRIDE]
#define o11_re s[8*SHARED_STRIDE]
#define o11_im s[9*SHARED_STRIDE]
#define o12_re s[10*SHARED_STRIDE]
#define o12_im s[11*SHARED_STRIDE]
#define o20_re s[12*SHARED_STRIDE]
#define o20_im s[13*SHARED_STRIDE]
#define o21_re s[14*SHARED_STRIDE]
#define o21_im s[15*SHARED_STRIDE]
#define o22_re s[16*SHARED_STRIDE]
#define o22_im s[17*SHARED_STRIDE]
#define o30_re s[18*SHARED_STRIDE]
VOLATILE spinorFloat o30_im;
VOLATILE spinorFloat o31_re;
VOLATILE spinorFloat o31_im;
VOLATILE spinorFloat o32_re;
VOLATILE spinorFloat o32_im;

#ifdef SPINOR_DOUBLE
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif

extern __shared__ char s_data[];

VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
                          + (threadIdx.x % SHARED_STRIDE);

#include "read_gauge.h"
#include "read_clover.h"
#include "io_spinor.h"
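// NOTE: illustrative sketch only, not part of the generated kernel.  The block
// that follows reconstructs the full lattice index X and the coordinates
// (x1,x2,x3,x4) from the checkerboarded thread index sid.  The same mapping in
// plain C, assuming even lattice extents X1..X4 and parity in {0,1}:
//
//   int X  = 2*sid;                          // tentative full index on this parity
//   int x1 = X % X1,  a = X / X1;
//   int x2 = a % X2;  a /= X2;
//   int x3 = a % X3;
//   int x4 = a / X3;
//   int off = (parity + x4 + x3 + x2) & 1;   // restore the dropped parity bit
//   x1 += off;  X += off;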
00352 int x1, x2, x3, x4; 00353 int X; 00354 00355 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision 00356 int sp_norm_idx; 00357 #endif // MULTI_GPU half precision 00358 00359 int sid; 00360 00361 #ifdef MULTI_GPU 00362 int face_idx; 00363 if (kernel_type == INTERIOR_KERNEL) { 00364 #endif 00365 00366 sid = blockIdx.x*blockDim.x + threadIdx.x; 00367 if (sid >= param.threads) return; 00368 00369 // Inline by hand for the moment and assume even dimensions 00370 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity); 00371 00372 X = 2*sid; 00373 int aux1 = X / X1; 00374 x1 = X - aux1 * X1; 00375 int aux2 = aux1 / X2; 00376 x2 = aux1 - aux2 * X2; 00377 x4 = aux2 / X3; 00378 x3 = aux2 - x4 * X3; 00379 aux1 = (param.parity + x4 + x3 + x2) & 1; 00380 x1 += aux1; 00381 X += aux1; 00382 00383 o00_re = 0; o00_im = 0; 00384 o01_re = 0; o01_im = 0; 00385 o02_re = 0; o02_im = 0; 00386 o10_re = 0; o10_im = 0; 00387 o11_re = 0; o11_im = 0; 00388 o12_re = 0; o12_im = 0; 00389 o20_re = 0; o20_im = 0; 00390 o21_re = 0; o21_im = 0; 00391 o22_re = 0; o22_im = 0; 00392 o30_re = 0; o30_im = 0; 00393 o31_re = 0; o31_im = 0; 00394 o32_re = 0; o32_im = 0; 00395 00396 #ifdef MULTI_GPU 00397 } else { // exterior kernel 00398 00399 sid = blockIdx.x*blockDim.x + threadIdx.x; 00400 if (sid >= param.threads) return; 00401 00402 const int dim = static_cast<int>(kernel_type); 00403 const int face_volume = (param.threads >> 1); // volume of one face 00404 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 00405 face_idx = sid - face_num*face_volume; // index into the respective face 00406 00407 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 00408 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 00409 //sp_idx = face_idx + param.ghostOffset[dim]; 00410 00411 #if (DD_PREC==2) // half precision 00412 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)]; 00413 #endif 00414 00415 coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity); 00416 00417 READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid); 00418 00419 o00_re = i00_re; o00_im = i00_im; 00420 o01_re = i01_re; o01_im = i01_im; 00421 o02_re = i02_re; o02_im = i02_im; 00422 o10_re = i10_re; o10_im = i10_im; 00423 o11_re = i11_re; o11_im = i11_im; 00424 o12_re = i12_re; o12_im = i12_im; 00425 o20_re = i20_re; o20_im = i20_im; 00426 o21_re = i21_re; o21_im = i21_im; 00427 o22_re = i22_re; o22_im = i22_im; 00428 o30_re = i30_re; o30_im = i30_im; 00429 o31_re = i31_re; o31_im = i31_im; 00430 o32_re = i32_re; o32_im = i32_im; 00431 } 00432 #endif // MULTI_GPU 00433 00434 00435 #ifdef MULTI_GPU 00436 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) || 00437 (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) ) 00438 #endif 00439 { 00440 // Projector P0- 00441 // 1 0 0 -i 00442 // 0 1 -i 0 00443 // 0 i 1 0 00444 // i 0 0 1 00445 00446 #ifdef MULTI_GPU 00447 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 : 00448 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00449 #else 00450 const int sp_idx = (x1==X1m1 ? 
X-X1m1 : X+1) >> 1; 00451 #endif 00452 00453 const int ga_idx = sid; 00454 00455 spinorFloat a0_re, a0_im; 00456 spinorFloat a1_re, a1_im; 00457 spinorFloat a2_re, a2_im; 00458 spinorFloat b0_re, b0_im; 00459 spinorFloat b1_re, b1_im; 00460 spinorFloat b2_re, b2_im; 00461 00462 #ifdef MULTI_GPU 00463 if (kernel_type == INTERIOR_KERNEL) { 00464 #endif 00465 00466 // read spinor from device memory 00467 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00468 00469 // project spinor into half spinors 00470 a0_re = +i00_re+i30_im; 00471 a0_im = +i00_im-i30_re; 00472 a1_re = +i01_re+i31_im; 00473 a1_im = +i01_im-i31_re; 00474 a2_re = +i02_re+i32_im; 00475 a2_im = +i02_im-i32_re; 00476 b0_re = +i10_re+i20_im; 00477 b0_im = +i10_im-i20_re; 00478 b1_re = +i11_re+i21_im; 00479 b1_im = +i11_im-i21_re; 00480 b2_re = +i12_re+i22_im; 00481 b2_im = +i12_im-i22_re; 00482 00483 #ifdef MULTI_GPU 00484 } else { 00485 00486 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00487 00488 // read half spinor from device memory 00489 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00490 00491 a0_re = i00_re; a0_im = i00_im; 00492 a1_re = i01_re; a1_im = i01_im; 00493 a2_re = i02_re; a2_im = i02_im; 00494 b0_re = i10_re; b0_im = i10_im; 00495 b1_re = i11_re; b1_im = i11_im; 00496 b2_re = i12_re; b2_im = i12_im; 00497 00498 } 00499 #endif // MULTI_GPU 00500 00501 // read gauge matrix from device memory 00502 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00503 00504 // reconstruct gauge matrix 00505 RECONSTRUCT_GAUGE_MATRIX(0); 00506 00507 // multiply row 0 00508 spinorFloat A0_re = 0; 00509 A0_re += g00_re * a0_re; 00510 A0_re -= g00_im * a0_im; 00511 A0_re += g01_re * a1_re; 00512 A0_re -= g01_im * a1_im; 00513 A0_re += g02_re * a2_re; 00514 A0_re -= g02_im * a2_im; 00515 spinorFloat A0_im = 0; 00516 A0_im += g00_re * a0_im; 00517 A0_im += g00_im * a0_re; 00518 A0_im += g01_re * a1_im; 00519 A0_im += g01_im * a1_re; 00520 A0_im += g02_re * a2_im; 00521 A0_im += g02_im * a2_re; 00522 spinorFloat B0_re = 0; 00523 B0_re += g00_re * b0_re; 00524 B0_re -= g00_im * b0_im; 00525 B0_re += g01_re * b1_re; 00526 B0_re -= g01_im * b1_im; 00527 B0_re += g02_re * b2_re; 00528 B0_re -= g02_im * b2_im; 00529 spinorFloat B0_im = 0; 00530 B0_im += g00_re * b0_im; 00531 B0_im += g00_im * b0_re; 00532 B0_im += g01_re * b1_im; 00533 B0_im += g01_im * b1_re; 00534 B0_im += g02_re * b2_im; 00535 B0_im += g02_im * b2_re; 00536 00537 // multiply row 1 00538 spinorFloat A1_re = 0; 00539 A1_re += g10_re * a0_re; 00540 A1_re -= g10_im * a0_im; 00541 A1_re += g11_re * a1_re; 00542 A1_re -= g11_im * a1_im; 00543 A1_re += g12_re * a2_re; 00544 A1_re -= g12_im * a2_im; 00545 spinorFloat A1_im = 0; 00546 A1_im += g10_re * a0_im; 00547 A1_im += g10_im * a0_re; 00548 A1_im += g11_re * a1_im; 00549 A1_im += g11_im * a1_re; 00550 A1_im += g12_re * a2_im; 00551 A1_im += g12_im * a2_re; 00552 spinorFloat B1_re = 0; 00553 B1_re += g10_re * b0_re; 00554 B1_re -= g10_im * b0_im; 00555 B1_re += g11_re * b1_re; 00556 B1_re -= g11_im * b1_im; 00557 B1_re += g12_re * b2_re; 00558 B1_re -= g12_im * b2_im; 00559 spinorFloat B1_im = 0; 00560 B1_im += g10_re * b0_im; 00561 B1_im += g10_im * b0_re; 00562 B1_im += g11_re * b1_im; 00563 B1_im += g11_im * b1_re; 00564 B1_im += g12_re * b2_im; 00565 B1_im += g12_im * b2_re; 00566 00567 // multiply row 2 00568 spinorFloat A2_re = 0; 00569 A2_re += g20_re * a0_re; 00570 A2_re -= g20_im * a0_im; 00571 A2_re += g21_re * a1_re; 00572 A2_re -= 
g21_im * a1_im; 00573 A2_re += g22_re * a2_re; 00574 A2_re -= g22_im * a2_im; 00575 spinorFloat A2_im = 0; 00576 A2_im += g20_re * a0_im; 00577 A2_im += g20_im * a0_re; 00578 A2_im += g21_re * a1_im; 00579 A2_im += g21_im * a1_re; 00580 A2_im += g22_re * a2_im; 00581 A2_im += g22_im * a2_re; 00582 spinorFloat B2_re = 0; 00583 B2_re += g20_re * b0_re; 00584 B2_re -= g20_im * b0_im; 00585 B2_re += g21_re * b1_re; 00586 B2_re -= g21_im * b1_im; 00587 B2_re += g22_re * b2_re; 00588 B2_re -= g22_im * b2_im; 00589 spinorFloat B2_im = 0; 00590 B2_im += g20_re * b0_im; 00591 B2_im += g20_im * b0_re; 00592 B2_im += g21_re * b1_im; 00593 B2_im += g21_im * b1_re; 00594 B2_im += g22_re * b2_im; 00595 B2_im += g22_im * b2_re; 00596 00597 o00_re += A0_re; 00598 o00_im += A0_im; 00599 o10_re += B0_re; 00600 o10_im += B0_im; 00601 o20_re -= B0_im; 00602 o20_im += B0_re; 00603 o30_re -= A0_im; 00604 o30_im += A0_re; 00605 00606 o01_re += A1_re; 00607 o01_im += A1_im; 00608 o11_re += B1_re; 00609 o11_im += B1_im; 00610 o21_re -= B1_im; 00611 o21_im += B1_re; 00612 o31_re -= A1_im; 00613 o31_im += A1_re; 00614 00615 o02_re += A2_re; 00616 o02_im += A2_im; 00617 o12_re += B2_re; 00618 o12_im += B2_im; 00619 o22_re -= B2_im; 00620 o22_im += B2_re; 00621 o32_re -= A2_im; 00622 o32_im += A2_re; 00623 00624 } 00625 00626 #ifdef MULTI_GPU 00627 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) || 00628 (kernel_type == EXTERIOR_KERNEL_X && x1==0) ) 00629 #endif 00630 { 00631 // Projector P0+ 00632 // 1 0 0 i 00633 // 0 1 i 0 00634 // 0 -i 1 0 00635 // -i 0 0 1 00636 00637 #ifdef MULTI_GPU 00638 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 : 00639 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00640 #else 00641 const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1; 00642 #endif 00643 00644 #ifdef MULTI_GPU 00645 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 00646 #else 00647 const int ga_idx = sp_idx; 00648 #endif 00649 00650 spinorFloat a0_re, a0_im; 00651 spinorFloat a1_re, a1_im; 00652 spinorFloat a2_re, a2_im; 00653 spinorFloat b0_re, b0_im; 00654 spinorFloat b1_re, b1_im; 00655 spinorFloat b2_re, b2_im; 00656 00657 #ifdef MULTI_GPU 00658 if (kernel_type == INTERIOR_KERNEL) { 00659 #endif 00660 00661 // read spinor from device memory 00662 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00663 00664 // project spinor into half spinors 00665 a0_re = +i00_re-i30_im; 00666 a0_im = +i00_im+i30_re; 00667 a1_re = +i01_re-i31_im; 00668 a1_im = +i01_im+i31_re; 00669 a2_re = +i02_re-i32_im; 00670 a2_im = +i02_im+i32_re; 00671 b0_re = +i10_re-i20_im; 00672 b0_im = +i10_im+i20_re; 00673 b1_re = +i11_re-i21_im; 00674 b1_im = +i11_im+i21_re; 00675 b2_re = +i12_re-i22_im; 00676 b2_im = +i12_im+i22_re; 00677 00678 #ifdef MULTI_GPU 00679 } else { 00680 00681 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00682 00683 // read half spinor from device memory 00684 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00685 00686 a0_re = i00_re; a0_im = i00_im; 00687 a1_re = i01_re; a1_im = i01_im; 00688 a2_re = i02_re; a2_im = i02_im; 00689 b0_re = i10_re; b0_im = i10_im; 00690 b1_re = i11_re; b1_im = i11_im; 00691 b2_re = i12_re; b2_im = i12_im; 00692 00693 } 00694 #endif // MULTI_GPU 00695 00696 // read gauge matrix from device memory 00697 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00698 00699 // reconstruct gauge matrix 00700 RECONSTRUCT_GAUGE_MATRIX(1); 00701 00702 // multiply row 0 00703 spinorFloat A0_re = 0; 00704 A0_re += gT00_re * a0_re; 00705 A0_re -= gT00_im * a0_im; 00706 A0_re += gT01_re * a1_re; 00707 A0_re -= gT01_im * a1_im; 00708 A0_re += gT02_re * a2_re; 00709 A0_re -= gT02_im * a2_im; 00710 spinorFloat A0_im = 0; 00711 A0_im += gT00_re * a0_im; 00712 A0_im += gT00_im * a0_re; 00713 A0_im += gT01_re * a1_im; 00714 A0_im += gT01_im * a1_re; 00715 A0_im += gT02_re * a2_im; 00716 A0_im += gT02_im * a2_re; 00717 spinorFloat B0_re = 0; 00718 B0_re += gT00_re * b0_re; 00719 B0_re -= gT00_im * b0_im; 00720 B0_re += gT01_re * b1_re; 00721 B0_re -= gT01_im * b1_im; 00722 B0_re += gT02_re * b2_re; 00723 B0_re -= gT02_im * b2_im; 00724 spinorFloat B0_im = 0; 00725 B0_im += gT00_re * b0_im; 00726 B0_im += gT00_im * b0_re; 00727 B0_im += gT01_re * b1_im; 00728 B0_im += gT01_im * b1_re; 00729 B0_im += gT02_re * b2_im; 00730 B0_im += gT02_im * b2_re; 00731 00732 // multiply row 1 00733 spinorFloat A1_re = 0; 00734 A1_re += gT10_re * a0_re; 00735 A1_re -= gT10_im * a0_im; 00736 A1_re += gT11_re * a1_re; 00737 A1_re -= gT11_im * a1_im; 00738 A1_re += gT12_re * a2_re; 00739 A1_re -= gT12_im * a2_im; 00740 spinorFloat A1_im = 0; 00741 A1_im += gT10_re * a0_im; 00742 A1_im += gT10_im * a0_re; 00743 A1_im += gT11_re * a1_im; 00744 A1_im += gT11_im * a1_re; 00745 A1_im += gT12_re * a2_im; 00746 A1_im += gT12_im * a2_re; 00747 spinorFloat B1_re = 0; 00748 B1_re += gT10_re * b0_re; 00749 B1_re -= gT10_im * b0_im; 00750 B1_re += gT11_re * b1_re; 00751 B1_re -= gT11_im * b1_im; 00752 B1_re += gT12_re * b2_re; 00753 B1_re -= gT12_im * b2_im; 00754 spinorFloat B1_im = 0; 00755 B1_im += gT10_re * b0_im; 00756 B1_im += gT10_im * b0_re; 00757 B1_im += gT11_re * b1_im; 00758 B1_im += gT11_im * b1_re; 00759 B1_im += gT12_re * b2_im; 00760 B1_im += gT12_im * b2_re; 00761 00762 // multiply row 2 00763 spinorFloat A2_re = 0; 00764 A2_re += gT20_re * a0_re; 00765 A2_re -= gT20_im * a0_im; 00766 A2_re += 
gT21_re * a1_re; 00767 A2_re -= gT21_im * a1_im; 00768 A2_re += gT22_re * a2_re; 00769 A2_re -= gT22_im * a2_im; 00770 spinorFloat A2_im = 0; 00771 A2_im += gT20_re * a0_im; 00772 A2_im += gT20_im * a0_re; 00773 A2_im += gT21_re * a1_im; 00774 A2_im += gT21_im * a1_re; 00775 A2_im += gT22_re * a2_im; 00776 A2_im += gT22_im * a2_re; 00777 spinorFloat B2_re = 0; 00778 B2_re += gT20_re * b0_re; 00779 B2_re -= gT20_im * b0_im; 00780 B2_re += gT21_re * b1_re; 00781 B2_re -= gT21_im * b1_im; 00782 B2_re += gT22_re * b2_re; 00783 B2_re -= gT22_im * b2_im; 00784 spinorFloat B2_im = 0; 00785 B2_im += gT20_re * b0_im; 00786 B2_im += gT20_im * b0_re; 00787 B2_im += gT21_re * b1_im; 00788 B2_im += gT21_im * b1_re; 00789 B2_im += gT22_re * b2_im; 00790 B2_im += gT22_im * b2_re; 00791 00792 o00_re += A0_re; 00793 o00_im += A0_im; 00794 o10_re += B0_re; 00795 o10_im += B0_im; 00796 o20_re += B0_im; 00797 o20_im -= B0_re; 00798 o30_re += A0_im; 00799 o30_im -= A0_re; 00800 00801 o01_re += A1_re; 00802 o01_im += A1_im; 00803 o11_re += B1_re; 00804 o11_im += B1_im; 00805 o21_re += B1_im; 00806 o21_im -= B1_re; 00807 o31_re += A1_im; 00808 o31_im -= A1_re; 00809 00810 o02_re += A2_re; 00811 o02_im += A2_im; 00812 o12_re += B2_re; 00813 o12_im += B2_im; 00814 o22_re += B2_im; 00815 o22_im -= B2_re; 00816 o32_re += A2_im; 00817 o32_im -= A2_re; 00818 00819 } 00820 00821 #ifdef MULTI_GPU 00822 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) || 00823 (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) ) 00824 #endif 00825 { 00826 // Projector P1- 00827 // 1 0 0 -1 00828 // 0 1 1 0 00829 // 0 1 1 0 00830 // -1 0 0 1 00831 00832 #ifdef MULTI_GPU 00833 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 : 00834 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00835 #else 00836 const int sp_idx = (x2==X2m1 ? 
X-X2X1mX1 : X+X1) >> 1; 00837 #endif 00838 00839 const int ga_idx = sid; 00840 00841 spinorFloat a0_re, a0_im; 00842 spinorFloat a1_re, a1_im; 00843 spinorFloat a2_re, a2_im; 00844 spinorFloat b0_re, b0_im; 00845 spinorFloat b1_re, b1_im; 00846 spinorFloat b2_re, b2_im; 00847 00848 #ifdef MULTI_GPU 00849 if (kernel_type == INTERIOR_KERNEL) { 00850 #endif 00851 00852 // read spinor from device memory 00853 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00854 00855 // project spinor into half spinors 00856 a0_re = +i00_re-i30_re; 00857 a0_im = +i00_im-i30_im; 00858 a1_re = +i01_re-i31_re; 00859 a1_im = +i01_im-i31_im; 00860 a2_re = +i02_re-i32_re; 00861 a2_im = +i02_im-i32_im; 00862 b0_re = +i10_re+i20_re; 00863 b0_im = +i10_im+i20_im; 00864 b1_re = +i11_re+i21_re; 00865 b1_im = +i11_im+i21_im; 00866 b2_re = +i12_re+i22_re; 00867 b2_im = +i12_im+i22_im; 00868 00869 #ifdef MULTI_GPU 00870 } else { 00871 00872 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00873 00874 // read half spinor from device memory 00875 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00876 00877 a0_re = i00_re; a0_im = i00_im; 00878 a1_re = i01_re; a1_im = i01_im; 00879 a2_re = i02_re; a2_im = i02_im; 00880 b0_re = i10_re; b0_im = i10_im; 00881 b1_re = i11_re; b1_im = i11_im; 00882 b2_re = i12_re; b2_im = i12_im; 00883 00884 } 00885 #endif // MULTI_GPU 00886 00887 // read gauge matrix from device memory 00888 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00889 00890 // reconstruct gauge matrix 00891 RECONSTRUCT_GAUGE_MATRIX(2); 00892 00893 // multiply row 0 00894 spinorFloat A0_re = 0; 00895 A0_re += g00_re * a0_re; 00896 A0_re -= g00_im * a0_im; 00897 A0_re += g01_re * a1_re; 00898 A0_re -= g01_im * a1_im; 00899 A0_re += g02_re * a2_re; 00900 A0_re -= g02_im * a2_im; 00901 spinorFloat A0_im = 0; 00902 A0_im += g00_re * a0_im; 00903 A0_im += g00_im * a0_re; 00904 A0_im += g01_re * a1_im; 00905 A0_im += g01_im * a1_re; 00906 A0_im += g02_re * a2_im; 00907 A0_im += g02_im * a2_re; 00908 spinorFloat B0_re = 0; 00909 B0_re += g00_re * b0_re; 00910 B0_re -= g00_im * b0_im; 00911 B0_re += g01_re * b1_re; 00912 B0_re -= g01_im * b1_im; 00913 B0_re += g02_re * b2_re; 00914 B0_re -= g02_im * b2_im; 00915 spinorFloat B0_im = 0; 00916 B0_im += g00_re * b0_im; 00917 B0_im += g00_im * b0_re; 00918 B0_im += g01_re * b1_im; 00919 B0_im += g01_im * b1_re; 00920 B0_im += g02_re * b2_im; 00921 B0_im += g02_im * b2_re; 00922 00923 // multiply row 1 00924 spinorFloat A1_re = 0; 00925 A1_re += g10_re * a0_re; 00926 A1_re -= g10_im * a0_im; 00927 A1_re += g11_re * a1_re; 00928 A1_re -= g11_im * a1_im; 00929 A1_re += g12_re * a2_re; 00930 A1_re -= g12_im * a2_im; 00931 spinorFloat A1_im = 0; 00932 A1_im += g10_re * a0_im; 00933 A1_im += g10_im * a0_re; 00934 A1_im += g11_re * a1_im; 00935 A1_im += g11_im * a1_re; 00936 A1_im += g12_re * a2_im; 00937 A1_im += g12_im * a2_re; 00938 spinorFloat B1_re = 0; 00939 B1_re += g10_re * b0_re; 00940 B1_re -= g10_im * b0_im; 00941 B1_re += g11_re * b1_re; 00942 B1_re -= g11_im * b1_im; 00943 B1_re += g12_re * b2_re; 00944 B1_re -= g12_im * b2_im; 00945 spinorFloat B1_im = 0; 00946 B1_im += g10_re * b0_im; 00947 B1_im += g10_im * b0_re; 00948 B1_im += g11_re * b1_im; 00949 B1_im += g11_im * b1_re; 00950 B1_im += g12_re * b2_im; 00951 B1_im += g12_im * b2_re; 00952 00953 // multiply row 2 00954 spinorFloat A2_re = 0; 00955 A2_re += g20_re * a0_re; 00956 A2_re -= g20_im * a0_im; 00957 A2_re += g21_re * a1_re; 00958 A2_re -= 
g21_im * a1_im; 00959 A2_re += g22_re * a2_re; 00960 A2_re -= g22_im * a2_im; 00961 spinorFloat A2_im = 0; 00962 A2_im += g20_re * a0_im; 00963 A2_im += g20_im * a0_re; 00964 A2_im += g21_re * a1_im; 00965 A2_im += g21_im * a1_re; 00966 A2_im += g22_re * a2_im; 00967 A2_im += g22_im * a2_re; 00968 spinorFloat B2_re = 0; 00969 B2_re += g20_re * b0_re; 00970 B2_re -= g20_im * b0_im; 00971 B2_re += g21_re * b1_re; 00972 B2_re -= g21_im * b1_im; 00973 B2_re += g22_re * b2_re; 00974 B2_re -= g22_im * b2_im; 00975 spinorFloat B2_im = 0; 00976 B2_im += g20_re * b0_im; 00977 B2_im += g20_im * b0_re; 00978 B2_im += g21_re * b1_im; 00979 B2_im += g21_im * b1_re; 00980 B2_im += g22_re * b2_im; 00981 B2_im += g22_im * b2_re; 00982 00983 o00_re += A0_re; 00984 o00_im += A0_im; 00985 o10_re += B0_re; 00986 o10_im += B0_im; 00987 o20_re += B0_re; 00988 o20_im += B0_im; 00989 o30_re -= A0_re; 00990 o30_im -= A0_im; 00991 00992 o01_re += A1_re; 00993 o01_im += A1_im; 00994 o11_re += B1_re; 00995 o11_im += B1_im; 00996 o21_re += B1_re; 00997 o21_im += B1_im; 00998 o31_re -= A1_re; 00999 o31_im -= A1_im; 01000 01001 o02_re += A2_re; 01002 o02_im += A2_im; 01003 o12_re += B2_re; 01004 o12_im += B2_im; 01005 o22_re += B2_re; 01006 o22_im += B2_im; 01007 o32_re -= A2_re; 01008 o32_im -= A2_im; 01009 01010 } 01011 01012 #ifdef MULTI_GPU 01013 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) || 01014 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) ) 01015 #endif 01016 { 01017 // Projector P1+ 01018 // 1 0 0 1 01019 // 0 1 -1 0 01020 // 0 -1 1 0 01021 // 1 0 0 1 01022 01023 #ifdef MULTI_GPU 01024 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 : 01025 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01026 #else 01027 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1; 01028 #endif 01029 01030 #ifdef MULTI_GPU 01031 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01032 #else 01033 const int ga_idx = sp_idx; 01034 #endif 01035 01036 spinorFloat a0_re, a0_im; 01037 spinorFloat a1_re, a1_im; 01038 spinorFloat a2_re, a2_im; 01039 spinorFloat b0_re, b0_im; 01040 spinorFloat b1_re, b1_im; 01041 spinorFloat b2_re, b2_im; 01042 01043 #ifdef MULTI_GPU 01044 if (kernel_type == INTERIOR_KERNEL) { 01045 #endif 01046 01047 // read spinor from device memory 01048 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01049 01050 // project spinor into half spinors 01051 a0_re = +i00_re+i30_re; 01052 a0_im = +i00_im+i30_im; 01053 a1_re = +i01_re+i31_re; 01054 a1_im = +i01_im+i31_im; 01055 a2_re = +i02_re+i32_re; 01056 a2_im = +i02_im+i32_im; 01057 b0_re = +i10_re-i20_re; 01058 b0_im = +i10_im-i20_im; 01059 b1_re = +i11_re-i21_re; 01060 b1_im = +i11_im-i21_im; 01061 b2_re = +i12_re-i22_re; 01062 b2_im = +i12_im-i22_im; 01063 01064 #ifdef MULTI_GPU 01065 } else { 01066 01067 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01068 01069 // read half spinor from device memory 01070 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01071 01072 a0_re = i00_re; a0_im = i00_im; 01073 a1_re = i01_re; a1_im = i01_im; 01074 a2_re = i02_re; a2_im = i02_im; 01075 b0_re = i10_re; b0_im = i10_im; 01076 b1_re = i11_re; b1_im = i11_im; 01077 b2_re = i12_re; b2_im = i12_im; 01078 01079 } 01080 #endif // MULTI_GPU 01081 01082 // read gauge matrix from device memory 01083 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 01084 01085 // reconstruct gauge matrix 01086 RECONSTRUCT_GAUGE_MATRIX(3); 01087 01088 // multiply row 0 01089 spinorFloat A0_re = 0; 01090 A0_re += gT00_re * a0_re; 01091 A0_re -= gT00_im * a0_im; 01092 A0_re += gT01_re * a1_re; 01093 A0_re -= gT01_im * a1_im; 01094 A0_re += gT02_re * a2_re; 01095 A0_re -= gT02_im * a2_im; 01096 spinorFloat A0_im = 0; 01097 A0_im += gT00_re * a0_im; 01098 A0_im += gT00_im * a0_re; 01099 A0_im += gT01_re * a1_im; 01100 A0_im += gT01_im * a1_re; 01101 A0_im += gT02_re * a2_im; 01102 A0_im += gT02_im * a2_re; 01103 spinorFloat B0_re = 0; 01104 B0_re += gT00_re * b0_re; 01105 B0_re -= gT00_im * b0_im; 01106 B0_re += gT01_re * b1_re; 01107 B0_re -= gT01_im * b1_im; 01108 B0_re += gT02_re * b2_re; 01109 B0_re -= gT02_im * b2_im; 01110 spinorFloat B0_im = 0; 01111 B0_im += gT00_re * b0_im; 01112 B0_im += gT00_im * b0_re; 01113 B0_im += gT01_re * b1_im; 01114 B0_im += gT01_im * b1_re; 01115 B0_im += gT02_re * b2_im; 01116 B0_im += gT02_im * b2_re; 01117 01118 // multiply row 1 01119 spinorFloat A1_re = 0; 01120 A1_re += gT10_re * a0_re; 01121 A1_re -= gT10_im * a0_im; 01122 A1_re += gT11_re * a1_re; 01123 A1_re -= gT11_im * a1_im; 01124 A1_re += gT12_re * a2_re; 01125 A1_re -= gT12_im * a2_im; 01126 spinorFloat A1_im = 0; 01127 A1_im += gT10_re * a0_im; 01128 A1_im += gT10_im * a0_re; 01129 A1_im += gT11_re * a1_im; 01130 A1_im += gT11_im * a1_re; 01131 A1_im += gT12_re * a2_im; 01132 A1_im += gT12_im * a2_re; 01133 spinorFloat B1_re = 0; 01134 B1_re += gT10_re * b0_re; 01135 B1_re -= gT10_im * b0_im; 01136 B1_re += gT11_re * b1_re; 01137 B1_re -= gT11_im * b1_im; 01138 B1_re += gT12_re * b2_re; 01139 B1_re -= gT12_im * b2_im; 01140 spinorFloat B1_im = 0; 01141 B1_im += gT10_re * b0_im; 01142 B1_im += gT10_im * b0_re; 01143 B1_im += gT11_re * b1_im; 01144 B1_im += gT11_im * b1_re; 01145 B1_im += gT12_re * b2_im; 01146 B1_im += gT12_im * b2_re; 01147 01148 // multiply row 2 01149 spinorFloat A2_re = 0; 01150 A2_re += gT20_re * a0_re; 01151 A2_re -= gT20_im * a0_im; 01152 A2_re += 
gT21_re * a1_re; 01153 A2_re -= gT21_im * a1_im; 01154 A2_re += gT22_re * a2_re; 01155 A2_re -= gT22_im * a2_im; 01156 spinorFloat A2_im = 0; 01157 A2_im += gT20_re * a0_im; 01158 A2_im += gT20_im * a0_re; 01159 A2_im += gT21_re * a1_im; 01160 A2_im += gT21_im * a1_re; 01161 A2_im += gT22_re * a2_im; 01162 A2_im += gT22_im * a2_re; 01163 spinorFloat B2_re = 0; 01164 B2_re += gT20_re * b0_re; 01165 B2_re -= gT20_im * b0_im; 01166 B2_re += gT21_re * b1_re; 01167 B2_re -= gT21_im * b1_im; 01168 B2_re += gT22_re * b2_re; 01169 B2_re -= gT22_im * b2_im; 01170 spinorFloat B2_im = 0; 01171 B2_im += gT20_re * b0_im; 01172 B2_im += gT20_im * b0_re; 01173 B2_im += gT21_re * b1_im; 01174 B2_im += gT21_im * b1_re; 01175 B2_im += gT22_re * b2_im; 01176 B2_im += gT22_im * b2_re; 01177 01178 o00_re += A0_re; 01179 o00_im += A0_im; 01180 o10_re += B0_re; 01181 o10_im += B0_im; 01182 o20_re -= B0_re; 01183 o20_im -= B0_im; 01184 o30_re += A0_re; 01185 o30_im += A0_im; 01186 01187 o01_re += A1_re; 01188 o01_im += A1_im; 01189 o11_re += B1_re; 01190 o11_im += B1_im; 01191 o21_re -= B1_re; 01192 o21_im -= B1_im; 01193 o31_re += A1_re; 01194 o31_im += A1_im; 01195 01196 o02_re += A2_re; 01197 o02_im += A2_im; 01198 o12_re += B2_re; 01199 o12_im += B2_im; 01200 o22_re -= B2_re; 01201 o22_im -= B2_im; 01202 o32_re += A2_re; 01203 o32_im += A2_im; 01204 01205 } 01206 01207 #ifdef MULTI_GPU 01208 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) || 01209 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) ) 01210 #endif 01211 { 01212 // Projector P2- 01213 // 1 0 -i 0 01214 // 0 1 0 i 01215 // i 0 1 0 01216 // 0 -i 0 1 01217 01218 #ifdef MULTI_GPU 01219 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 : 01220 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01221 #else 01222 const int sp_idx = (x3==X3m1 ? 
X-X3X2X1mX2X1 : X+X2X1) >> 1; 01223 #endif 01224 01225 const int ga_idx = sid; 01226 01227 spinorFloat a0_re, a0_im; 01228 spinorFloat a1_re, a1_im; 01229 spinorFloat a2_re, a2_im; 01230 spinorFloat b0_re, b0_im; 01231 spinorFloat b1_re, b1_im; 01232 spinorFloat b2_re, b2_im; 01233 01234 #ifdef MULTI_GPU 01235 if (kernel_type == INTERIOR_KERNEL) { 01236 #endif 01237 01238 // read spinor from device memory 01239 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01240 01241 // project spinor into half spinors 01242 a0_re = +i00_re+i20_im; 01243 a0_im = +i00_im-i20_re; 01244 a1_re = +i01_re+i21_im; 01245 a1_im = +i01_im-i21_re; 01246 a2_re = +i02_re+i22_im; 01247 a2_im = +i02_im-i22_re; 01248 b0_re = +i10_re-i30_im; 01249 b0_im = +i10_im+i30_re; 01250 b1_re = +i11_re-i31_im; 01251 b1_im = +i11_im+i31_re; 01252 b2_re = +i12_re-i32_im; 01253 b2_im = +i12_im+i32_re; 01254 01255 #ifdef MULTI_GPU 01256 } else { 01257 01258 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01259 01260 // read half spinor from device memory 01261 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01262 01263 a0_re = i00_re; a0_im = i00_im; 01264 a1_re = i01_re; a1_im = i01_im; 01265 a2_re = i02_re; a2_im = i02_im; 01266 b0_re = i10_re; b0_im = i10_im; 01267 b1_re = i11_re; b1_im = i11_im; 01268 b2_re = i12_re; b2_im = i12_im; 01269 01270 } 01271 #endif // MULTI_GPU 01272 01273 // read gauge matrix from device memory 01274 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 01275 01276 // reconstruct gauge matrix 01277 RECONSTRUCT_GAUGE_MATRIX(4); 01278 01279 // multiply row 0 01280 spinorFloat A0_re = 0; 01281 A0_re += g00_re * a0_re; 01282 A0_re -= g00_im * a0_im; 01283 A0_re += g01_re * a1_re; 01284 A0_re -= g01_im * a1_im; 01285 A0_re += g02_re * a2_re; 01286 A0_re -= g02_im * a2_im; 01287 spinorFloat A0_im = 0; 01288 A0_im += g00_re * a0_im; 01289 A0_im += g00_im * a0_re; 01290 A0_im += g01_re * a1_im; 01291 A0_im += g01_im * a1_re; 01292 A0_im += g02_re * a2_im; 01293 A0_im += g02_im * a2_re; 01294 spinorFloat B0_re = 0; 01295 B0_re += g00_re * b0_re; 01296 B0_re -= g00_im * b0_im; 01297 B0_re += g01_re * b1_re; 01298 B0_re -= g01_im * b1_im; 01299 B0_re += g02_re * b2_re; 01300 B0_re -= g02_im * b2_im; 01301 spinorFloat B0_im = 0; 01302 B0_im += g00_re * b0_im; 01303 B0_im += g00_im * b0_re; 01304 B0_im += g01_re * b1_im; 01305 B0_im += g01_im * b1_re; 01306 B0_im += g02_re * b2_im; 01307 B0_im += g02_im * b2_re; 01308 01309 // multiply row 1 01310 spinorFloat A1_re = 0; 01311 A1_re += g10_re * a0_re; 01312 A1_re -= g10_im * a0_im; 01313 A1_re += g11_re * a1_re; 01314 A1_re -= g11_im * a1_im; 01315 A1_re += g12_re * a2_re; 01316 A1_re -= g12_im * a2_im; 01317 spinorFloat A1_im = 0; 01318 A1_im += g10_re * a0_im; 01319 A1_im += g10_im * a0_re; 01320 A1_im += g11_re * a1_im; 01321 A1_im += g11_im * a1_re; 01322 A1_im += g12_re * a2_im; 01323 A1_im += g12_im * a2_re; 01324 spinorFloat B1_re = 0; 01325 B1_re += g10_re * b0_re; 01326 B1_re -= g10_im * b0_im; 01327 B1_re += g11_re * b1_re; 01328 B1_re -= g11_im * b1_im; 01329 B1_re += g12_re * b2_re; 01330 B1_re -= g12_im * b2_im; 01331 spinorFloat B1_im = 0; 01332 B1_im += g10_re * b0_im; 01333 B1_im += g10_im * b0_re; 01334 B1_im += g11_re * b1_im; 01335 B1_im += g11_im * b1_re; 01336 B1_im += g12_re * b2_im; 01337 B1_im += g12_im * b2_re; 01338 01339 // multiply row 2 01340 spinorFloat A2_re = 0; 01341 A2_re += g20_re * a0_re; 01342 A2_re -= g20_im * a0_im; 01343 A2_re += g21_re * a1_re; 01344 
A2_re -= g21_im * a1_im; 01345 A2_re += g22_re * a2_re; 01346 A2_re -= g22_im * a2_im; 01347 spinorFloat A2_im = 0; 01348 A2_im += g20_re * a0_im; 01349 A2_im += g20_im * a0_re; 01350 A2_im += g21_re * a1_im; 01351 A2_im += g21_im * a1_re; 01352 A2_im += g22_re * a2_im; 01353 A2_im += g22_im * a2_re; 01354 spinorFloat B2_re = 0; 01355 B2_re += g20_re * b0_re; 01356 B2_re -= g20_im * b0_im; 01357 B2_re += g21_re * b1_re; 01358 B2_re -= g21_im * b1_im; 01359 B2_re += g22_re * b2_re; 01360 B2_re -= g22_im * b2_im; 01361 spinorFloat B2_im = 0; 01362 B2_im += g20_re * b0_im; 01363 B2_im += g20_im * b0_re; 01364 B2_im += g21_re * b1_im; 01365 B2_im += g21_im * b1_re; 01366 B2_im += g22_re * b2_im; 01367 B2_im += g22_im * b2_re; 01368 01369 o00_re += A0_re; 01370 o00_im += A0_im; 01371 o10_re += B0_re; 01372 o10_im += B0_im; 01373 o20_re -= A0_im; 01374 o20_im += A0_re; 01375 o30_re += B0_im; 01376 o30_im -= B0_re; 01377 01378 o01_re += A1_re; 01379 o01_im += A1_im; 01380 o11_re += B1_re; 01381 o11_im += B1_im; 01382 o21_re -= A1_im; 01383 o21_im += A1_re; 01384 o31_re += B1_im; 01385 o31_im -= B1_re; 01386 01387 o02_re += A2_re; 01388 o02_im += A2_im; 01389 o12_re += B2_re; 01390 o12_im += B2_im; 01391 o22_re -= A2_im; 01392 o22_im += A2_re; 01393 o32_re += B2_im; 01394 o32_im -= B2_re; 01395 01396 } 01397 01398 #ifdef MULTI_GPU 01399 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01400 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01401 #endif 01402 { 01403 // Projector P2+ 01404 // 1 0 i 0 01405 // 0 1 0 -i 01406 // -i 0 1 0 01407 // 0 i 0 1 01408 01409 #ifdef MULTI_GPU 01410 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01411 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01412 #else 01413 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01414 #endif 01415 01416 #ifdef MULTI_GPU 01417 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01418 #else 01419 const int ga_idx = sp_idx; 01420 #endif 01421 01422 spinorFloat a0_re, a0_im; 01423 spinorFloat a1_re, a1_im; 01424 spinorFloat a2_re, a2_im; 01425 spinorFloat b0_re, b0_im; 01426 spinorFloat b1_re, b1_im; 01427 spinorFloat b2_re, b2_im; 01428 01429 #ifdef MULTI_GPU 01430 if (kernel_type == INTERIOR_KERNEL) { 01431 #endif 01432 01433 // read spinor from device memory 01434 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01435 01436 // project spinor into half spinors 01437 a0_re = +i00_re-i20_im; 01438 a0_im = +i00_im+i20_re; 01439 a1_re = +i01_re-i21_im; 01440 a1_im = +i01_im+i21_re; 01441 a2_re = +i02_re-i22_im; 01442 a2_im = +i02_im+i22_re; 01443 b0_re = +i10_re+i30_im; 01444 b0_im = +i10_im-i30_re; 01445 b1_re = +i11_re+i31_im; 01446 b1_im = +i11_im-i31_re; 01447 b2_re = +i12_re+i32_im; 01448 b2_im = +i12_im-i32_re; 01449 01450 #ifdef MULTI_GPU 01451 } else { 01452 01453 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01454 01455 // read half spinor from device memory 01456 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01457 01458 a0_re = i00_re; a0_im = i00_im; 01459 a1_re = i01_re; a1_im = i01_im; 01460 a2_re = i02_re; a2_im = i02_im; 01461 b0_re = i10_re; b0_im = i10_im; 01462 b1_re = i11_re; b1_im = i11_im; 01463 b2_re = i12_re; b2_im = i12_im; 01464 01465 } 01466 #endif // MULTI_GPU 01467 01468 // read gauge matrix from device memory 01469 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01470 01471 // reconstruct gauge matrix 01472 RECONSTRUCT_GAUGE_MATRIX(5); 01473 01474 // multiply row 0 01475 spinorFloat A0_re = 0; 01476 A0_re += gT00_re * a0_re; 01477 A0_re -= gT00_im * a0_im; 01478 A0_re += gT01_re * a1_re; 01479 A0_re -= gT01_im * a1_im; 01480 A0_re += gT02_re * a2_re; 01481 A0_re -= gT02_im * a2_im; 01482 spinorFloat A0_im = 0; 01483 A0_im += gT00_re * a0_im; 01484 A0_im += gT00_im * a0_re; 01485 A0_im += gT01_re * a1_im; 01486 A0_im += gT01_im * a1_re; 01487 A0_im += gT02_re * a2_im; 01488 A0_im += gT02_im * a2_re; 01489 spinorFloat B0_re = 0; 01490 B0_re += gT00_re * b0_re; 01491 B0_re -= gT00_im * b0_im; 01492 B0_re += gT01_re * b1_re; 01493 B0_re -= gT01_im * b1_im; 01494 B0_re += gT02_re * b2_re; 01495 B0_re -= gT02_im * b2_im; 01496 spinorFloat B0_im = 0; 01497 B0_im += gT00_re * b0_im; 01498 B0_im += gT00_im * b0_re; 01499 B0_im += gT01_re * b1_im; 01500 B0_im += gT01_im * b1_re; 01501 B0_im += gT02_re * b2_im; 01502 B0_im += gT02_im * b2_re; 01503 01504 // multiply row 1 01505 spinorFloat A1_re = 0; 01506 A1_re += gT10_re * a0_re; 01507 A1_re -= gT10_im * a0_im; 01508 A1_re += gT11_re * a1_re; 01509 A1_re -= gT11_im * a1_im; 01510 A1_re += gT12_re * a2_re; 01511 A1_re -= gT12_im * a2_im; 01512 spinorFloat A1_im = 0; 01513 A1_im += gT10_re * a0_im; 01514 A1_im += gT10_im * a0_re; 01515 A1_im += gT11_re * a1_im; 01516 A1_im += gT11_im * a1_re; 01517 A1_im += gT12_re * a2_im; 01518 A1_im += gT12_im * a2_re; 01519 spinorFloat B1_re = 0; 01520 B1_re += gT10_re * b0_re; 01521 B1_re -= gT10_im * b0_im; 01522 B1_re += gT11_re * b1_re; 01523 B1_re -= gT11_im * b1_im; 01524 B1_re += gT12_re * b2_re; 01525 B1_re -= gT12_im * b2_im; 01526 spinorFloat B1_im = 0; 01527 B1_im += gT10_re * b0_im; 01528 B1_im += gT10_im * b0_re; 01529 B1_im += gT11_re * b1_im; 01530 B1_im += gT11_im * b1_re; 01531 B1_im += gT12_re * b2_im; 01532 B1_im += gT12_im * b2_re; 01533 01534 // multiply row 2 01535 spinorFloat A2_re = 0; 01536 A2_re += gT20_re * a0_re; 01537 A2_re -= gT20_im * a0_im; 01538 A2_re += 
gT21_re * a1_re; 01539 A2_re -= gT21_im * a1_im; 01540 A2_re += gT22_re * a2_re; 01541 A2_re -= gT22_im * a2_im; 01542 spinorFloat A2_im = 0; 01543 A2_im += gT20_re * a0_im; 01544 A2_im += gT20_im * a0_re; 01545 A2_im += gT21_re * a1_im; 01546 A2_im += gT21_im * a1_re; 01547 A2_im += gT22_re * a2_im; 01548 A2_im += gT22_im * a2_re; 01549 spinorFloat B2_re = 0; 01550 B2_re += gT20_re * b0_re; 01551 B2_re -= gT20_im * b0_im; 01552 B2_re += gT21_re * b1_re; 01553 B2_re -= gT21_im * b1_im; 01554 B2_re += gT22_re * b2_re; 01555 B2_re -= gT22_im * b2_im; 01556 spinorFloat B2_im = 0; 01557 B2_im += gT20_re * b0_im; 01558 B2_im += gT20_im * b0_re; 01559 B2_im += gT21_re * b1_im; 01560 B2_im += gT21_im * b1_re; 01561 B2_im += gT22_re * b2_im; 01562 B2_im += gT22_im * b2_re; 01563 01564 o00_re += A0_re; 01565 o00_im += A0_im; 01566 o10_re += B0_re; 01567 o10_im += B0_im; 01568 o20_re += A0_im; 01569 o20_im -= A0_re; 01570 o30_re -= B0_im; 01571 o30_im += B0_re; 01572 01573 o01_re += A1_re; 01574 o01_im += A1_im; 01575 o11_re += B1_re; 01576 o11_im += B1_im; 01577 o21_re += A1_im; 01578 o21_im -= A1_re; 01579 o31_re -= B1_im; 01580 o31_im += B1_re; 01581 01582 o02_re += A2_re; 01583 o02_im += A2_im; 01584 o12_re += B2_re; 01585 o12_im += B2_im; 01586 o22_re += A2_im; 01587 o22_im -= A2_re; 01588 o32_re -= B2_im; 01589 o32_im += B2_re; 01590 01591 } 01592 01593 #ifdef MULTI_GPU 01594 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01595 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01596 #endif 01597 { 01598 // Projector P3- 01599 // 0 0 0 0 01600 // 0 0 0 0 01601 // 0 0 2 0 01602 // 0 0 0 2 01603 01604 #ifdef MULTI_GPU 01605 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01606 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01607 #else 01608 const int sp_idx = (x4==X4m1 ? 
X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01609 #endif 01610 01611 const int ga_idx = sid; 01612 01613 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01614 { 01615 spinorFloat a0_re, a0_im; 01616 spinorFloat a1_re, a1_im; 01617 spinorFloat a2_re, a2_im; 01618 spinorFloat b0_re, b0_im; 01619 spinorFloat b1_re, b1_im; 01620 spinorFloat b2_re, b2_im; 01621 01622 #ifdef MULTI_GPU 01623 if (kernel_type == INTERIOR_KERNEL) { 01624 #endif 01625 01626 // read spinor from device memory 01627 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01628 01629 // project spinor into half spinors 01630 a0_re = +2*i20_re; 01631 a0_im = +2*i20_im; 01632 a1_re = +2*i21_re; 01633 a1_im = +2*i21_im; 01634 a2_re = +2*i22_re; 01635 a2_im = +2*i22_im; 01636 b0_re = +2*i30_re; 01637 b0_im = +2*i30_im; 01638 b1_re = +2*i31_re; 01639 b1_im = +2*i31_im; 01640 b2_re = +2*i32_re; 01641 b2_im = +2*i32_im; 01642 01643 #ifdef MULTI_GPU 01644 } else { 01645 01646 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01647 const int t_proj_scale = TPROJSCALE; 01648 01649 // read half spinor from device memory 01650 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01651 01652 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01653 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01654 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01655 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01656 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01657 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01658 01659 } 01660 #endif // MULTI_GPU 01661 01662 // identity gauge matrix 01663 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01664 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01665 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01666 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01667 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01668 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01669 01670 o20_re += A0_re; 01671 o20_im += A0_im; 01672 o30_re += B0_re; 01673 o30_im += B0_im; 01674 01675 o21_re += A1_re; 01676 o21_im += A1_im; 01677 o31_re += B1_re; 01678 o31_im += B1_im; 01679 01680 o22_re += A2_re; 01681 o22_im += A2_im; 01682 o32_re += B2_re; 01683 o32_im += B2_im; 01684 01685 } else { 01686 spinorFloat a0_re, a0_im; 01687 spinorFloat a1_re, a1_im; 01688 spinorFloat a2_re, a2_im; 01689 spinorFloat b0_re, b0_im; 01690 spinorFloat b1_re, b1_im; 01691 spinorFloat b2_re, b2_im; 01692 01693 #ifdef MULTI_GPU 01694 if (kernel_type == INTERIOR_KERNEL) { 01695 #endif 01696 01697 // read spinor from device memory 01698 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01699 01700 // project spinor into half spinors 01701 a0_re = +2*i20_re; 01702 a0_im = +2*i20_im; 01703 a1_re = +2*i21_re; 01704 a1_im = +2*i21_im; 01705 a2_re = +2*i22_re; 01706 a2_im = +2*i22_im; 01707 b0_re = +2*i30_re; 01708 b0_im = +2*i30_im; 01709 b1_re = +2*i31_re; 01710 b1_im = +2*i31_im; 01711 b2_re = +2*i32_re; 01712 b2_im = +2*i32_im; 01713 01714 #ifdef MULTI_GPU 01715 } else { 01716 01717 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01718 const int t_proj_scale = TPROJSCALE; 01719 01720 // read half spinor from device memory 01721 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01722 01723 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01724 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01725 a2_re = 
t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01726 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01727 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01728 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01729 01730 } 01731 #endif // MULTI_GPU 01732 01733 // read gauge matrix from device memory 01734 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01735 01736 // reconstruct gauge matrix 01737 RECONSTRUCT_GAUGE_MATRIX(6); 01738 01739 // multiply row 0 01740 spinorFloat A0_re = 0; 01741 A0_re += g00_re * a0_re; 01742 A0_re -= g00_im * a0_im; 01743 A0_re += g01_re * a1_re; 01744 A0_re -= g01_im * a1_im; 01745 A0_re += g02_re * a2_re; 01746 A0_re -= g02_im * a2_im; 01747 spinorFloat A0_im = 0; 01748 A0_im += g00_re * a0_im; 01749 A0_im += g00_im * a0_re; 01750 A0_im += g01_re * a1_im; 01751 A0_im += g01_im * a1_re; 01752 A0_im += g02_re * a2_im; 01753 A0_im += g02_im * a2_re; 01754 spinorFloat B0_re = 0; 01755 B0_re += g00_re * b0_re; 01756 B0_re -= g00_im * b0_im; 01757 B0_re += g01_re * b1_re; 01758 B0_re -= g01_im * b1_im; 01759 B0_re += g02_re * b2_re; 01760 B0_re -= g02_im * b2_im; 01761 spinorFloat B0_im = 0; 01762 B0_im += g00_re * b0_im; 01763 B0_im += g00_im * b0_re; 01764 B0_im += g01_re * b1_im; 01765 B0_im += g01_im * b1_re; 01766 B0_im += g02_re * b2_im; 01767 B0_im += g02_im * b2_re; 01768 01769 // multiply row 1 01770 spinorFloat A1_re = 0; 01771 A1_re += g10_re * a0_re; 01772 A1_re -= g10_im * a0_im; 01773 A1_re += g11_re * a1_re; 01774 A1_re -= g11_im * a1_im; 01775 A1_re += g12_re * a2_re; 01776 A1_re -= g12_im * a2_im; 01777 spinorFloat A1_im = 0; 01778 A1_im += g10_re * a0_im; 01779 A1_im += g10_im * a0_re; 01780 A1_im += g11_re * a1_im; 01781 A1_im += g11_im * a1_re; 01782 A1_im += g12_re * a2_im; 01783 A1_im += g12_im * a2_re; 01784 spinorFloat B1_re = 0; 01785 B1_re += g10_re * b0_re; 01786 B1_re -= g10_im * b0_im; 01787 B1_re += g11_re * b1_re; 01788 B1_re -= g11_im * b1_im; 01789 B1_re += g12_re * b2_re; 01790 B1_re -= g12_im * b2_im; 01791 spinorFloat B1_im = 0; 01792 B1_im += g10_re * b0_im; 01793 B1_im += g10_im * b0_re; 01794 B1_im += g11_re * b1_im; 01795 B1_im += g11_im * b1_re; 01796 B1_im += g12_re * b2_im; 01797 B1_im += g12_im * b2_re; 01798 01799 // multiply row 2 01800 spinorFloat A2_re = 0; 01801 A2_re += g20_re * a0_re; 01802 A2_re -= g20_im * a0_im; 01803 A2_re += g21_re * a1_re; 01804 A2_re -= g21_im * a1_im; 01805 A2_re += g22_re * a2_re; 01806 A2_re -= g22_im * a2_im; 01807 spinorFloat A2_im = 0; 01808 A2_im += g20_re * a0_im; 01809 A2_im += g20_im * a0_re; 01810 A2_im += g21_re * a1_im; 01811 A2_im += g21_im * a1_re; 01812 A2_im += g22_re * a2_im; 01813 A2_im += g22_im * a2_re; 01814 spinorFloat B2_re = 0; 01815 B2_re += g20_re * b0_re; 01816 B2_re -= g20_im * b0_im; 01817 B2_re += g21_re * b1_re; 01818 B2_re -= g21_im * b1_im; 01819 B2_re += g22_re * b2_re; 01820 B2_re -= g22_im * b2_im; 01821 spinorFloat B2_im = 0; 01822 B2_im += g20_re * b0_im; 01823 B2_im += g20_im * b0_re; 01824 B2_im += g21_re * b1_im; 01825 B2_im += g21_im * b1_re; 01826 B2_im += g22_re * b2_im; 01827 B2_im += g22_im * b2_re; 01828 01829 o20_re += A0_re; 01830 o20_im += A0_im; 01831 o30_re += B0_re; 01832 o30_im += B0_im; 01833 01834 o21_re += A1_re; 01835 o21_im += A1_im; 01836 o31_re += B1_re; 01837 o31_im += B1_im; 01838 01839 o22_re += A2_re; 01840 o22_im += A2_im; 01841 o32_re += B2_re; 01842 o32_im += B2_im; 01843 01844 } 01845 } 01846 01847 #ifdef MULTI_GPU 01848 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || 
x4>0)) || 01849 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01850 #endif 01851 { 01852 // Projector P3+ 01853 // 2 0 0 0 01854 // 0 2 0 0 01855 // 0 0 0 0 01856 // 0 0 0 0 01857 01858 #ifdef MULTI_GPU 01859 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01860 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01861 #else 01862 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01863 #endif 01864 01865 #ifdef MULTI_GPU 01866 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01867 #else 01868 const int ga_idx = sp_idx; 01869 #endif 01870 01871 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01872 { 01873 spinorFloat a0_re, a0_im; 01874 spinorFloat a1_re, a1_im; 01875 spinorFloat a2_re, a2_im; 01876 spinorFloat b0_re, b0_im; 01877 spinorFloat b1_re, b1_im; 01878 spinorFloat b2_re, b2_im; 01879 01880 #ifdef MULTI_GPU 01881 if (kernel_type == INTERIOR_KERNEL) { 01882 #endif 01883 01884 // read spinor from device memory 01885 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01886 01887 // project spinor into half spinors 01888 a0_re = +2*i00_re; 01889 a0_im = +2*i00_im; 01890 a1_re = +2*i01_re; 01891 a1_im = +2*i01_im; 01892 a2_re = +2*i02_re; 01893 a2_im = +2*i02_im; 01894 b0_re = +2*i10_re; 01895 b0_im = +2*i10_im; 01896 b1_re = +2*i11_re; 01897 b1_im = +2*i11_im; 01898 b2_re = +2*i12_re; 01899 b2_im = +2*i12_im; 01900 01901 #ifdef MULTI_GPU 01902 } else { 01903 01904 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01905 const int t_proj_scale = TPROJSCALE; 01906 01907 // read half spinor from device memory 01908 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01909 01910 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01911 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01912 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01913 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01914 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01915 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01916 01917 } 01918 #endif // MULTI_GPU 01919 01920 // identity gauge matrix 01921 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01922 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01923 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01924 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01925 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01926 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01927 01928 o00_re += A0_re; 01929 o00_im += A0_im; 01930 o10_re += B0_re; 01931 o10_im += B0_im; 01932 01933 o01_re += A1_re; 01934 o01_im += A1_im; 01935 o11_re += B1_re; 01936 o11_im += B1_im; 01937 01938 o02_re += A2_re; 01939 o02_im += A2_im; 01940 o12_re += B2_re; 01941 o12_im += B2_im; 01942 01943 } else { 01944 spinorFloat a0_re, a0_im; 01945 spinorFloat a1_re, a1_im; 01946 spinorFloat a2_re, a2_im; 01947 spinorFloat b0_re, b0_im; 01948 spinorFloat b1_re, b1_im; 01949 spinorFloat b2_re, b2_im; 01950 01951 #ifdef MULTI_GPU 01952 if (kernel_type == INTERIOR_KERNEL) { 01953 #endif 01954 01955 // read spinor from device memory 01956 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01957 01958 // project spinor into half spinors 01959 a0_re = +2*i00_re; 01960 a0_im = +2*i00_im; 01961 a1_re = +2*i01_re; 01962 a1_im = +2*i01_im; 01963 a2_re = +2*i02_re; 01964 a2_im = +2*i02_im; 01965 b0_re = +2*i10_re; 01966 b0_im = +2*i10_im; 01967 b1_re = +2*i11_re; 01968 b1_im = +2*i11_im; 
  } else {
    spinorFloat a0_re, a0_im;
    spinorFloat a1_re, a1_im;
    spinorFloat a2_re, a2_im;
    spinorFloat b0_re, b0_im;
    spinorFloat b1_re, b1_im;
    spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
    if (kernel_type == INTERIOR_KERNEL) {
#endif

      // read spinor from device memory
      READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);

      // project spinor into half spinors
      a0_re = +2*i00_re;
      a0_im = +2*i00_im;
      a1_re = +2*i01_re;
      a1_im = +2*i01_im;
      a2_re = +2*i02_re;
      a2_im = +2*i02_im;
      b0_re = +2*i10_re;
      b0_im = +2*i10_im;
      b1_re = +2*i11_re;
      b1_im = +2*i11_im;
      b2_re = +2*i12_re;
      b2_im = +2*i12_im;

#ifdef MULTI_GPU
    } else {

      const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
      const int t_proj_scale = TPROJSCALE;

      // read half spinor from device memory
      READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

      a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
      a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
      a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
      b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
      b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
      b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;

    }
#endif // MULTI_GPU

    // read gauge matrix from device memory
    READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);

    // reconstruct gauge matrix
    RECONSTRUCT_GAUGE_MATRIX(7);
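    // Illustrative sketch (comment only): the three "multiply row" blocks
    // below compute A = U^dagger a and B = U^dagger b for the two color
    // triplets of the projected half spinor, i.e.
    //   A_i = sum_j conj(g_ji) * a_j,   B_i = sum_j conj(g_ji) * b_j,
    // using the gT## accessors for the conjugated link. A compact equivalent
    // (with a hypothetical complex type and cmul helper) would look roughly
    // like
    //   for (int i = 0; i < 3; i++)
    //     A[i] = cmul(conj(g[0][i]), a[0]) + cmul(conj(g[1][i]), a[1])
    //          + cmul(conj(g[2][i]), a[2]);
    // the generated code spells out the real and imaginary parts so that
    // everything stays in registers.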
    // multiply row 0
    spinorFloat A0_re = 0;
    A0_re += gT00_re * a0_re;
    A0_re -= gT00_im * a0_im;
    A0_re += gT01_re * a1_re;
    A0_re -= gT01_im * a1_im;
    A0_re += gT02_re * a2_re;
    A0_re -= gT02_im * a2_im;
    spinorFloat A0_im = 0;
    A0_im += gT00_re * a0_im;
    A0_im += gT00_im * a0_re;
    A0_im += gT01_re * a1_im;
    A0_im += gT01_im * a1_re;
    A0_im += gT02_re * a2_im;
    A0_im += gT02_im * a2_re;
    spinorFloat B0_re = 0;
    B0_re += gT00_re * b0_re;
    B0_re -= gT00_im * b0_im;
    B0_re += gT01_re * b1_re;
    B0_re -= gT01_im * b1_im;
    B0_re += gT02_re * b2_re;
    B0_re -= gT02_im * b2_im;
    spinorFloat B0_im = 0;
    B0_im += gT00_re * b0_im;
    B0_im += gT00_im * b0_re;
    B0_im += gT01_re * b1_im;
    B0_im += gT01_im * b1_re;
    B0_im += gT02_re * b2_im;
    B0_im += gT02_im * b2_re;

    // multiply row 1
    spinorFloat A1_re = 0;
    A1_re += gT10_re * a0_re;
    A1_re -= gT10_im * a0_im;
    A1_re += gT11_re * a1_re;
    A1_re -= gT11_im * a1_im;
    A1_re += gT12_re * a2_re;
    A1_re -= gT12_im * a2_im;
    spinorFloat A1_im = 0;
    A1_im += gT10_re * a0_im;
    A1_im += gT10_im * a0_re;
    A1_im += gT11_re * a1_im;
    A1_im += gT11_im * a1_re;
    A1_im += gT12_re * a2_im;
    A1_im += gT12_im * a2_re;
    spinorFloat B1_re = 0;
    B1_re += gT10_re * b0_re;
    B1_re -= gT10_im * b0_im;
    B1_re += gT11_re * b1_re;
    B1_re -= gT11_im * b1_im;
    B1_re += gT12_re * b2_re;
    B1_re -= gT12_im * b2_im;
    spinorFloat B1_im = 0;
    B1_im += gT10_re * b0_im;
    B1_im += gT10_im * b0_re;
    B1_im += gT11_re * b1_im;
    B1_im += gT11_im * b1_re;
    B1_im += gT12_re * b2_im;
    B1_im += gT12_im * b2_re;

    // multiply row 2
    spinorFloat A2_re = 0;
    A2_re += gT20_re * a0_re;
    A2_re -= gT20_im * a0_im;
    A2_re += gT21_re * a1_re;
    A2_re -= gT21_im * a1_im;
    A2_re += gT22_re * a2_re;
    A2_re -= gT22_im * a2_im;
    spinorFloat A2_im = 0;
    A2_im += gT20_re * a0_im;
    A2_im += gT20_im * a0_re;
    A2_im += gT21_re * a1_im;
    A2_im += gT21_im * a1_re;
    A2_im += gT22_re * a2_im;
    A2_im += gT22_im * a2_re;
    spinorFloat B2_re = 0;
    B2_re += gT20_re * b0_re;
    B2_re -= gT20_im * b0_im;
    B2_re += gT21_re * b1_re;
    B2_re -= gT21_im * b1_im;
    B2_re += gT22_re * b2_re;
    B2_re -= gT22_im * b2_im;
    spinorFloat B2_im = 0;
    B2_im += gT20_re * b0_im;
    B2_im += gT20_im * b0_re;
    B2_im += gT21_re * b1_im;
    B2_im += gT21_im * b1_re;
    B2_im += gT22_re * b2_im;
    B2_im += gT22_im * b2_re;

    o00_re += A0_re;
    o00_im += A0_im;
    o10_re += B0_re;
    o10_im += B0_im;

    o01_re += A1_re;
    o01_im += A1_im;
    o11_re += B1_re;
    o11_im += B1_im;

    o02_re += A2_re;
    o02_im += A2_im;
    o12_re += B2_re;
    o12_im += B2_im;

  }
}

#if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)

int incomplete = 0; // Have all 8 contributions been computed for this site?

switch(kernel_type) { // intentional fall-through
case INTERIOR_KERNEL:
  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
case EXTERIOR_KERNEL_T:
  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
case EXTERIOR_KERNEL_Z:
  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
case EXTERIOR_KERNEL_Y:
  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
}

if (!incomplete)
#endif // MULTI_GPU
{

#ifdef DSLASH_CLOVER

  // change to chiral basis
  {
    spinorFloat a00_re = -o10_re - o30_re;
    spinorFloat a00_im = -o10_im - o30_im;
    spinorFloat a10_re = o00_re + o20_re;
    spinorFloat a10_im = o00_im + o20_im;
    spinorFloat a20_re = -o10_re + o30_re;
    spinorFloat a20_im = -o10_im + o30_im;
    spinorFloat a30_re = o00_re - o20_re;
    spinorFloat a30_im = o00_im - o20_im;

    o00_re = a00_re; o00_im = a00_im;
    o10_re = a10_re; o10_im = a10_im;
    o20_re = a20_re; o20_im = a20_im;
    o30_re = a30_re; o30_im = a30_im;
  }

  {
    spinorFloat a01_re = -o11_re - o31_re;
    spinorFloat a01_im = -o11_im - o31_im;
    spinorFloat a11_re = o01_re + o21_re;
    spinorFloat a11_im = o01_im + o21_im;
    spinorFloat a21_re = -o11_re + o31_re;
    spinorFloat a21_im = -o11_im + o31_im;
    spinorFloat a31_re = o01_re - o21_re;
    spinorFloat a31_im = o01_im - o21_im;

    o01_re = a01_re; o01_im = a01_im;
    o11_re = a11_re; o11_im = a11_im;
    o21_re = a21_re; o21_im = a21_im;
    o31_re = a31_re; o31_im = a31_im;
  }

  {
    spinorFloat a02_re = -o12_re - o32_re;
    spinorFloat a02_im = -o12_im - o32_im;
    spinorFloat a12_re = o02_re + o22_re;
    spinorFloat a12_im = o02_im + o22_im;
    spinorFloat a22_re = -o12_re + o32_re;
    spinorFloat a22_im = -o12_im + o32_im;
    spinorFloat a32_re = o02_re - o22_re;
    spinorFloat a32_im = o02_im - o22_im;

    o02_re = a02_re; o02_im = a02_im;
    o12_re = a12_re; o12_im = a12_im;
    o22_re = a22_re; o22_im = a22_im;
    o32_re = a32_re; o32_im = a32_im;
  }
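  // Note: the three blocks above rotate the accumulated spinor into the
  // chiral basis, one color component at a time. In that basis the clover
  // term is block diagonal in spin, so it can be applied as two independent
  // 6x6 matrix-vector products over the (spin-pair, color) index, which is
  // what the "apply first/second chiral block" sections below do.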
  // apply first chiral block
  {
    READ_CLOVER(CLOVERTEX, 0)

    spinorFloat a00_re = 0; spinorFloat a00_im = 0;
    spinorFloat a01_re = 0; spinorFloat a01_im = 0;
    spinorFloat a02_re = 0; spinorFloat a02_im = 0;
    spinorFloat a10_re = 0; spinorFloat a10_im = 0;
    spinorFloat a11_re = 0; spinorFloat a11_im = 0;
    spinorFloat a12_re = 0; spinorFloat a12_im = 0;

    a00_re += c00_00_re * o00_re;
    a00_im += c00_00_re * o00_im;
    a00_re += c00_01_re * o01_re;
    a00_re -= c00_01_im * o01_im;
    a00_im += c00_01_re * o01_im;
    a00_im += c00_01_im * o01_re;
    a00_re += c00_02_re * o02_re;
    a00_re -= c00_02_im * o02_im;
    a00_im += c00_02_re * o02_im;
    a00_im += c00_02_im * o02_re;
    a00_re += c00_10_re * o10_re;
    a00_re -= c00_10_im * o10_im;
    a00_im += c00_10_re * o10_im;
    a00_im += c00_10_im * o10_re;
    a00_re += c00_11_re * o11_re;
    a00_re -= c00_11_im * o11_im;
    a00_im += c00_11_re * o11_im;
    a00_im += c00_11_im * o11_re;
    a00_re += c00_12_re * o12_re;
    a00_re -= c00_12_im * o12_im;
    a00_im += c00_12_re * o12_im;
    a00_im += c00_12_im * o12_re;

    a01_re += c01_00_re * o00_re;
    a01_re -= c01_00_im * o00_im;
    a01_im += c01_00_re * o00_im;
    a01_im += c01_00_im * o00_re;
    a01_re += c01_01_re * o01_re;
    a01_im += c01_01_re * o01_im;
    a01_re += c01_02_re * o02_re;
    a01_re -= c01_02_im * o02_im;
    a01_im += c01_02_re * o02_im;
    a01_im += c01_02_im * o02_re;
    a01_re += c01_10_re * o10_re;
    a01_re -= c01_10_im * o10_im;
    a01_im += c01_10_re * o10_im;
    a01_im += c01_10_im * o10_re;
    a01_re += c01_11_re * o11_re;
    a01_re -= c01_11_im * o11_im;
    a01_im += c01_11_re * o11_im;
    a01_im += c01_11_im * o11_re;
    a01_re += c01_12_re * o12_re;
    a01_re -= c01_12_im * o12_im;
    a01_im += c01_12_re * o12_im;
    a01_im += c01_12_im * o12_re;

    a02_re += c02_00_re * o00_re;
    a02_re -= c02_00_im * o00_im;
    a02_im += c02_00_re * o00_im;
    a02_im += c02_00_im * o00_re;
    a02_re += c02_01_re * o01_re;
    a02_re -= c02_01_im * o01_im;
    a02_im += c02_01_re * o01_im;
    a02_im += c02_01_im * o01_re;
    a02_re += c02_02_re * o02_re;
    a02_im += c02_02_re * o02_im;
    a02_re += c02_10_re * o10_re;
    a02_re -= c02_10_im * o10_im;
    a02_im += c02_10_re * o10_im;
    a02_im += c02_10_im * o10_re;
    a02_re += c02_11_re * o11_re;
    a02_re -= c02_11_im * o11_im;
    a02_im += c02_11_re * o11_im;
    a02_im += c02_11_im * o11_re;
    a02_re += c02_12_re * o12_re;
    a02_re -= c02_12_im * o12_im;
    a02_im += c02_12_re * o12_im;
    a02_im += c02_12_im * o12_re;

    a10_re += c10_00_re * o00_re;
    a10_re -= c10_00_im * o00_im;
    a10_im += c10_00_re * o00_im;
    a10_im += c10_00_im * o00_re;
    a10_re += c10_01_re * o01_re;
    a10_re -= c10_01_im * o01_im;
    a10_im += c10_01_re * o01_im;
    a10_im += c10_01_im * o01_re;
    a10_re += c10_02_re * o02_re;
    a10_re -= c10_02_im * o02_im;
    a10_im += c10_02_re * o02_im;
    a10_im += c10_02_im * o02_re;
    a10_re += c10_10_re * o10_re;
    a10_im += c10_10_re * o10_im;
    a10_re += c10_11_re * o11_re;
    a10_re -= c10_11_im * o11_im;
    a10_im += c10_11_re * o11_im;
    a10_im += c10_11_im * o11_re;
    a10_re += c10_12_re * o12_re;
    a10_re -= c10_12_im * o12_im;
    a10_im += c10_12_re * o12_im;
    a10_im += c10_12_im * o12_re;

    a11_re += c11_00_re * o00_re;
    a11_re -= c11_00_im * o00_im;
    a11_im += c11_00_re * o00_im;
    a11_im += c11_00_im * o00_re;
    a11_re += c11_01_re * o01_re;
    a11_re -= c11_01_im * o01_im;
    a11_im += c11_01_re * o01_im;
    a11_im += c11_01_im * o01_re;
    a11_re += c11_02_re * o02_re;
    a11_re -= c11_02_im * o02_im;
    a11_im += c11_02_re * o02_im;
    a11_im += c11_02_im * o02_re;
    a11_re += c11_10_re * o10_re;
    a11_re -= c11_10_im * o10_im;
    a11_im += c11_10_re * o10_im;
    a11_im += c11_10_im * o10_re;
    a11_re += c11_11_re * o11_re;
    a11_im += c11_11_re * o11_im;
    a11_re += c11_12_re * o12_re;
    a11_re -= c11_12_im * o12_im;
    a11_im += c11_12_re * o12_im;
    a11_im += c11_12_im * o12_re;

    a12_re += c12_00_re * o00_re;
    a12_re -= c12_00_im * o00_im;
    a12_im += c12_00_re * o00_im;
    a12_im += c12_00_im * o00_re;
    a12_re += c12_01_re * o01_re;
    a12_re -= c12_01_im * o01_im;
    a12_im += c12_01_re * o01_im;
    a12_im += c12_01_im * o01_re;
    a12_re += c12_02_re * o02_re;
    a12_re -= c12_02_im * o02_im;
    a12_im += c12_02_re * o02_im;
    a12_im += c12_02_im * o02_re;
    a12_re += c12_10_re * o10_re;
    a12_re -= c12_10_im * o10_im;
    a12_im += c12_10_re * o10_im;
    a12_im += c12_10_im * o10_re;
    a12_re += c12_11_re * o11_re;
    a12_re -= c12_11_im * o11_im;
    a12_im += c12_11_re * o11_im;
    a12_im += c12_11_im * o11_re;
    a12_re += c12_12_re * o12_re;
    a12_im += c12_12_re * o12_im;

    o00_re = a00_re; o00_im = a00_im;
    o01_re = a01_re; o01_im = a01_im;
    o02_re = a02_re; o02_im = a02_im;
    o10_re = a10_re; o10_im = a10_im;
    o11_re = a11_re; o11_im = a11_im;
    o12_re = a12_re; o12_im = a12_im;

  }
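  // Illustrative sketch (comment only): each chiral block is an unrolled
  // 6x6 matrix-vector product over the combined (spin-pair, color) index,
  //   a[i] = sum_j C[i][j] * o[j],
  // where the diagonal entries are purely real (only c00_00_re, c01_01_re,
  // ... are read), consistent with the clover block being stored as a packed
  // Hermitian matrix. The block below repeats the same pattern for the
  // second chirality (spin components 2 and 3).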
  // apply second chiral block
  {
    READ_CLOVER(CLOVERTEX, 1)

    spinorFloat a20_re = 0; spinorFloat a20_im = 0;
    spinorFloat a21_re = 0; spinorFloat a21_im = 0;
    spinorFloat a22_re = 0; spinorFloat a22_im = 0;
    spinorFloat a30_re = 0; spinorFloat a30_im = 0;
    spinorFloat a31_re = 0; spinorFloat a31_im = 0;
    spinorFloat a32_re = 0; spinorFloat a32_im = 0;

    a20_re += c20_20_re * o20_re;
    a20_im += c20_20_re * o20_im;
    a20_re += c20_21_re * o21_re;
    a20_re -= c20_21_im * o21_im;
    a20_im += c20_21_re * o21_im;
    a20_im += c20_21_im * o21_re;
    a20_re += c20_22_re * o22_re;
    a20_re -= c20_22_im * o22_im;
    a20_im += c20_22_re * o22_im;
    a20_im += c20_22_im * o22_re;
    a20_re += c20_30_re * o30_re;
    a20_re -= c20_30_im * o30_im;
    a20_im += c20_30_re * o30_im;
    a20_im += c20_30_im * o30_re;
    a20_re += c20_31_re * o31_re;
    a20_re -= c20_31_im * o31_im;
    a20_im += c20_31_re * o31_im;
    a20_im += c20_31_im * o31_re;
    a20_re += c20_32_re * o32_re;
    a20_re -= c20_32_im * o32_im;
    a20_im += c20_32_re * o32_im;
    a20_im += c20_32_im * o32_re;

    a21_re += c21_20_re * o20_re;
    a21_re -= c21_20_im * o20_im;
    a21_im += c21_20_re * o20_im;
    a21_im += c21_20_im * o20_re;
    a21_re += c21_21_re * o21_re;
    a21_im += c21_21_re * o21_im;
    a21_re += c21_22_re * o22_re;
    a21_re -= c21_22_im * o22_im;
    a21_im += c21_22_re * o22_im;
    a21_im += c21_22_im * o22_re;
    a21_re += c21_30_re * o30_re;
    a21_re -= c21_30_im * o30_im;
    a21_im += c21_30_re * o30_im;
    a21_im += c21_30_im * o30_re;
    a21_re += c21_31_re * o31_re;
    a21_re -= c21_31_im * o31_im;
    a21_im += c21_31_re * o31_im;
    a21_im += c21_31_im * o31_re;
    a21_re += c21_32_re * o32_re;
    a21_re -= c21_32_im * o32_im;
    a21_im += c21_32_re * o32_im;
    a21_im += c21_32_im * o32_re;

    a22_re += c22_20_re * o20_re;
    a22_re -= c22_20_im * o20_im;
    a22_im += c22_20_re * o20_im;
    a22_im += c22_20_im * o20_re;
    a22_re += c22_21_re * o21_re;
    a22_re -= c22_21_im * o21_im;
    a22_im += c22_21_re * o21_im;
    a22_im += c22_21_im * o21_re;
    a22_re += c22_22_re * o22_re;
    a22_im += c22_22_re * o22_im;
    a22_re += c22_30_re * o30_re;
    a22_re -= c22_30_im * o30_im;
    a22_im += c22_30_re * o30_im;
    a22_im += c22_30_im * o30_re;
    a22_re += c22_31_re * o31_re;
    a22_re -= c22_31_im * o31_im;
    a22_im += c22_31_re * o31_im;
    a22_im += c22_31_im * o31_re;
    a22_re += c22_32_re * o32_re;
    a22_re -= c22_32_im * o32_im;
    a22_im += c22_32_re * o32_im;
    a22_im += c22_32_im * o32_re;

    a30_re += c30_20_re * o20_re;
    a30_re -= c30_20_im * o20_im;
    a30_im += c30_20_re * o20_im;
    a30_im += c30_20_im * o20_re;
    a30_re += c30_21_re * o21_re;
    a30_re -= c30_21_im * o21_im;
    a30_im += c30_21_re * o21_im;
    a30_im += c30_21_im * o21_re;
    a30_re += c30_22_re * o22_re;
    a30_re -= c30_22_im * o22_im;
    a30_im += c30_22_re * o22_im;
    a30_im += c30_22_im * o22_re;
    a30_re += c30_30_re * o30_re;
    a30_im += c30_30_re * o30_im;
    a30_re += c30_31_re * o31_re;
    a30_re -= c30_31_im * o31_im;
    a30_im += c30_31_re * o31_im;
    a30_im += c30_31_im * o31_re;
    a30_re += c30_32_re * o32_re;
    a30_re -= c30_32_im * o32_im;
    a30_im += c30_32_re * o32_im;
    a30_im += c30_32_im * o32_re;

    a31_re += c31_20_re * o20_re;
    a31_re -= c31_20_im * o20_im;
    a31_im += c31_20_re * o20_im;
    a31_im += c31_20_im * o20_re;
    a31_re += c31_21_re * o21_re;
    a31_re -= c31_21_im * o21_im;
    a31_im += c31_21_re * o21_im;
    a31_im += c31_21_im * o21_re;
    a31_re += c31_22_re * o22_re;
    a31_re -= c31_22_im * o22_im;
    a31_im += c31_22_re * o22_im;
    a31_im += c31_22_im * o22_re;
    a31_re += c31_30_re * o30_re;
    a31_re -= c31_30_im * o30_im;
    a31_im += c31_30_re * o30_im;
    a31_im += c31_30_im * o30_re;
    a31_re += c31_31_re * o31_re;
    a31_im += c31_31_re * o31_im;
    a31_re += c31_32_re * o32_re;
    a31_re -= c31_32_im * o32_im;
    a31_im += c31_32_re * o32_im;
    a31_im += c31_32_im * o32_re;

    a32_re += c32_20_re * o20_re;
    a32_re -= c32_20_im * o20_im;
    a32_im += c32_20_re * o20_im;
    a32_im += c32_20_im * o20_re;
    a32_re += c32_21_re * o21_re;
    a32_re -= c32_21_im * o21_im;
    a32_im += c32_21_re * o21_im;
    a32_im += c32_21_im * o21_re;
    a32_re += c32_22_re * o22_re;
    a32_re -= c32_22_im * o22_im;
    a32_im += c32_22_re * o22_im;
    a32_im += c32_22_im * o22_re;
    a32_re += c32_30_re * o30_re;
    a32_re -= c32_30_im * o30_im;
    a32_im += c32_30_re * o30_im;
    a32_im += c32_30_im * o30_re;
    a32_re += c32_31_re * o31_re;
    a32_re -= c32_31_im * o31_im;
    a32_im += c32_31_re * o31_im;
    a32_im += c32_31_im * o31_re;
    a32_re += c32_32_re * o32_re;
    a32_im += c32_32_re * o32_im;

    o20_re = a20_re; o20_im = a20_im;
    o21_re = a21_re; o21_im = a21_im;
    o22_re = a22_re; o22_im = a22_im;
    o30_re = a30_re; o30_im = a30_im;
    o31_re = a31_re; o31_im = a31_im;
    o32_re = a32_re; o32_im = a32_im;

  }
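  // Note: the blocks below undo the change of basis. Composing the forward
  // and inverse rotations without a clover term in between would give twice
  // the identity (the forward step stores o10 = o00 + o20 and o30 = o00 - o20,
  // and the inverse step then forms o10 + o30 = 2*o00), which is why the
  // required factor of 1/2 is absorbed into the clover term normalization,
  // as the comment below states.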
  // change back from chiral basis
  // (note: required factor of 1/2 is included in clover term normalization)
  {
    spinorFloat a00_re = o10_re + o30_re;
    spinorFloat a00_im = o10_im + o30_im;
    spinorFloat a10_re = -o00_re - o20_re;
    spinorFloat a10_im = -o00_im - o20_im;
    spinorFloat a20_re = o10_re - o30_re;
    spinorFloat a20_im = o10_im - o30_im;
    spinorFloat a30_re = -o00_re + o20_re;
    spinorFloat a30_im = -o00_im + o20_im;

    o00_re = a00_re; o00_im = a00_im;
    o10_re = a10_re; o10_im = a10_im;
    o20_re = a20_re; o20_im = a20_im;
    o30_re = a30_re; o30_im = a30_im;
  }

  {
    spinorFloat a01_re = o11_re + o31_re;
    spinorFloat a01_im = o11_im + o31_im;
    spinorFloat a11_re = -o01_re - o21_re;
    spinorFloat a11_im = -o01_im - o21_im;
    spinorFloat a21_re = o11_re - o31_re;
    spinorFloat a21_im = o11_im - o31_im;
    spinorFloat a31_re = -o01_re + o21_re;
    spinorFloat a31_im = -o01_im + o21_im;

    o01_re = a01_re; o01_im = a01_im;
    o11_re = a11_re; o11_im = a11_im;
    o21_re = a21_re; o21_im = a21_im;
    o31_re = a31_re; o31_im = a31_im;
  }

  {
    spinorFloat a02_re = o12_re + o32_re;
    spinorFloat a02_im = o12_im + o32_im;
    spinorFloat a12_re = -o02_re - o22_re;
    spinorFloat a12_im = -o02_im - o22_im;
    spinorFloat a22_re = o12_re - o32_re;
    spinorFloat a22_im = o12_im - o32_im;
    spinorFloat a32_re = -o02_re + o22_re;
    spinorFloat a32_im = -o02_im + o22_im;

    o02_re = a02_re; o02_im = a02_im;
    o12_re = a12_re; o12_im = a12_im;
    o22_re = a22_re; o22_im = a22_im;
    o32_re = a32_re; o32_im = a32_im;
  }

#endif // DSLASH_CLOVER
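  // Note: when DSLASH_XPAY is defined the kernel fuses an axpy-style update
  // into the write-back: each output component becomes a*o + accum, where
  // accum is the spinor read by READ_ACCUM below. Fusing the scale-and-add
  // here avoids a separate pass over the output field.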
#ifdef DSLASH_XPAY

  READ_ACCUM(ACCUMTEX, sp_stride)

#ifdef SPINOR_DOUBLE
  o00_re = a*o00_re + accum0.x;
  o00_im = a*o00_im + accum0.y;
  o01_re = a*o01_re + accum1.x;
  o01_im = a*o01_im + accum1.y;
  o02_re = a*o02_re + accum2.x;
  o02_im = a*o02_im + accum2.y;
  o10_re = a*o10_re + accum3.x;
  o10_im = a*o10_im + accum3.y;
  o11_re = a*o11_re + accum4.x;
  o11_im = a*o11_im + accum4.y;
  o12_re = a*o12_re + accum5.x;
  o12_im = a*o12_im + accum5.y;
  o20_re = a*o20_re + accum6.x;
  o20_im = a*o20_im + accum6.y;
  o21_re = a*o21_re + accum7.x;
  o21_im = a*o21_im + accum7.y;
  o22_re = a*o22_re + accum8.x;
  o22_im = a*o22_im + accum8.y;
  o30_re = a*o30_re + accum9.x;
  o30_im = a*o30_im + accum9.y;
  o31_re = a*o31_re + accum10.x;
  o31_im = a*o31_im + accum10.y;
  o32_re = a*o32_re + accum11.x;
  o32_im = a*o32_im + accum11.y;
#else
  o00_re = a*o00_re + accum0.x;
  o00_im = a*o00_im + accum0.y;
  o01_re = a*o01_re + accum0.z;
  o01_im = a*o01_im + accum0.w;
  o02_re = a*o02_re + accum1.x;
  o02_im = a*o02_im + accum1.y;
  o10_re = a*o10_re + accum1.z;
  o10_im = a*o10_im + accum1.w;
  o11_re = a*o11_re + accum2.x;
  o11_im = a*o11_im + accum2.y;
  o12_re = a*o12_re + accum2.z;
  o12_im = a*o12_im + accum2.w;
  o20_re = a*o20_re + accum3.x;
  o20_im = a*o20_im + accum3.y;
  o21_re = a*o21_re + accum3.z;
  o21_im = a*o21_im + accum3.w;
  o22_re = a*o22_re + accum4.x;
  o22_im = a*o22_im + accum4.y;
  o30_re = a*o30_re + accum4.z;
  o30_im = a*o30_im + accum4.w;
  o31_re = a*o31_re + accum5.x;
  o31_im = a*o31_im + accum5.y;
  o32_re = a*o32_re + accum5.z;
  o32_im = a*o32_im + accum5.w;
#endif // SPINOR_DOUBLE

#endif // DSLASH_XPAY
}

// write spinor field back to device memory
WRITE_SPINOR(sp_stride);

// undefine to prevent warning when precision is changed
#undef spinorFloat
#undef SHARED_STRIDE

#undef A_re
#undef A_im

#undef g00_re
#undef g00_im
#undef g01_re
#undef g01_im
#undef g02_re
#undef g02_im
#undef g10_re
#undef g10_im
#undef g11_re
#undef g11_im
#undef g12_re
#undef g12_im
#undef g20_re
#undef g20_im
#undef g21_re
#undef g21_im
#undef g22_re
#undef g22_im

#undef i00_re
#undef i00_im
#undef i01_re
#undef i01_im
#undef i02_re
#undef i02_im
#undef i10_re
#undef i10_im
#undef i11_re
#undef i11_im
#undef i12_re
#undef i12_im
#undef i20_re
#undef i20_im
#undef i21_re
#undef i21_im
#undef i22_re
#undef i22_im
#undef i30_re
#undef i30_im
#undef i31_re
#undef i31_im
#undef i32_re
#undef i32_im

#undef c00_00_re
#undef c01_01_re
#undef c02_02_re
#undef c10_10_re
#undef c11_11_re
#undef c12_12_re
#undef c01_00_re
#undef c01_00_im
#undef c02_00_re
#undef c02_00_im
#undef c10_00_re
#undef c10_00_im
#undef c11_00_re
#undef c11_00_im
#undef c12_00_re
#undef c12_00_im
#undef c02_01_re
#undef c02_01_im
#undef c10_01_re
#undef c10_01_im
#undef c11_01_re
#undef c11_01_im
#undef c12_01_re
#undef c12_01_im
#undef c10_02_re
#undef c10_02_im
#undef c11_02_re
#undef c11_02_im
#undef c12_02_re
#undef c12_02_im
#undef c11_10_re
#undef c11_10_im
#undef c12_10_re
#undef c12_10_im
#undef c12_11_re
#undef c12_11_im

#undef o00_re
#undef o00_im
#undef o01_re
#undef o01_im
#undef o02_re
#undef o02_im
#undef o10_re
#undef o10_im
#undef o11_re
#undef o11_im
#undef o12_re
#undef o12_im
#undef o20_re
#undef o20_im
#undef o21_re
#undef o21_im
#undef o22_re
#undef o22_im
#undef o30_re

#undef VOLATILE
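// Illustrative usage sketch (comment only; the file name and surrounding
// macros are assumptions, not part of this file): a core file like this is
// meant to be textually #included into a kernel body once per precision or
// variant, e.g.
//
//   #define DSLASH_XPAY
//   #define SPINOR_DOUBLE
//   #include "clover_dslash_core.h"   // hypothetical include
//   #undef SPINOR_DOUBLE
//   #undef DSLASH_XPAY
//
// The #undef block above is what lets the next inclusion redefine
// spinorFloat and the i##/g##/c##/o## accessor macros without
// redefinition warnings.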