QUDA v0.4.0
A library for QCD on GPUs
|
00001 // *** CUDA DSLASH DAGGER *** 00002 00003 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 00004 00005 00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 00007 #define VOLATILE 00008 #else // Open64 compiler 00009 #define VOLATILE volatile 00010 #endif 00011 // input spinor 00012 #ifdef SPINOR_DOUBLE 00013 #define spinorFloat double 00014 #define i00_re I0.x 00015 #define i00_im I0.y 00016 #define i01_re I1.x 00017 #define i01_im I1.y 00018 #define i02_re I2.x 00019 #define i02_im I2.y 00020 #define i10_re I3.x 00021 #define i10_im I3.y 00022 #define i11_re I4.x 00023 #define i11_im I4.y 00024 #define i12_re I5.x 00025 #define i12_im I5.y 00026 #define i20_re I6.x 00027 #define i20_im I6.y 00028 #define i21_re I7.x 00029 #define i21_im I7.y 00030 #define i22_re I8.x 00031 #define i22_im I8.y 00032 #define i30_re I9.x 00033 #define i30_im I9.y 00034 #define i31_re I10.x 00035 #define i31_im I10.y 00036 #define i32_re I11.x 00037 #define i32_im I11.y 00038 #else 00039 #define spinorFloat float 00040 #define i00_re I0.x 00041 #define i00_im I0.y 00042 #define i01_re I0.z 00043 #define i01_im I0.w 00044 #define i02_re I1.x 00045 #define i02_im I1.y 00046 #define i10_re I1.z 00047 #define i10_im I1.w 00048 #define i11_re I2.x 00049 #define i11_im I2.y 00050 #define i12_re I2.z 00051 #define i12_im I2.w 00052 #define i20_re I3.x 00053 #define i20_im I3.y 00054 #define i21_re I3.z 00055 #define i21_im I3.w 00056 #define i22_re I4.x 00057 #define i22_im I4.y 00058 #define i30_re I4.z 00059 #define i30_im I4.w 00060 #define i31_re I5.x 00061 #define i31_im I5.y 00062 #define i32_re I5.z 00063 #define i32_im I5.w 00064 #endif // SPINOR_DOUBLE 00065 00066 // gauge link 00067 #ifdef GAUGE_FLOAT2 00068 #define g00_re G0.x 00069 #define g00_im G0.y 00070 #define g01_re G1.x 00071 #define g01_im G1.y 00072 #define g02_re G2.x 00073 #define g02_im G2.y 00074 #define g10_re G3.x 00075 #define g10_im G3.y 00076 #define g11_re G4.x 00077 #define g11_im G4.y 00078 #define g12_re G5.x 00079 #define g12_im G5.y 00080 #define g20_re G6.x 00081 #define g20_im G6.y 00082 #define g21_re G7.x 00083 #define g21_im G7.y 00084 #define g22_re G8.x 00085 #define g22_im G8.y 00086 // temporaries 00087 #define A_re G9.x 00088 #define A_im G9.y 00089 00090 #else 00091 #define g00_re G0.x 00092 #define g00_im G0.y 00093 #define g01_re G0.z 00094 #define g01_im G0.w 00095 #define g02_re G1.x 00096 #define g02_im G1.y 00097 #define g10_re G1.z 00098 #define g10_im G1.w 00099 #define g11_re G2.x 00100 #define g11_im G2.y 00101 #define g12_re G2.z 00102 #define g12_im G2.w 00103 #define g20_re G3.x 00104 #define g20_im G3.y 00105 #define g21_re G3.z 00106 #define g21_im G3.w 00107 #define g22_re G4.x 00108 #define g22_im G4.y 00109 // temporaries 00110 #define A_re G4.z 00111 #define A_im G4.w 00112 00113 #endif // GAUGE_DOUBLE 00114 00115 // conjugated gauge link 00116 #define gT00_re (+g00_re) 00117 #define gT00_im (-g00_im) 00118 #define gT01_re (+g10_re) 00119 #define gT01_im (-g10_im) 00120 #define gT02_re (+g20_re) 00121 #define gT02_im (-g20_im) 00122 #define gT10_re (+g01_re) 00123 #define gT10_im (-g01_im) 00124 #define gT11_re (+g11_re) 00125 #define gT11_im (-g11_im) 00126 #define gT12_re (+g21_re) 00127 #define gT12_im (-g21_im) 00128 #define gT20_re (+g02_re) 00129 #define gT20_im (-g02_im) 00130 #define gT21_re (+g12_re) 00131 #define gT21_im (-g12_im) 00132 #define gT22_re (+g22_re) 00133 #define gT22_im (-g22_im) 00134 00135 // first chiral block of inverted clover term 00136 #ifdef CLOVER_DOUBLE 00137 #define c00_00_re C0.x 00138 #define c01_01_re C0.y 00139 #define c02_02_re C1.x 00140 #define c10_10_re C1.y 00141 #define c11_11_re C2.x 00142 #define c12_12_re C2.y 00143 #define c01_00_re C3.x 00144 #define c01_00_im C3.y 00145 #define c02_00_re C4.x 00146 #define c02_00_im C4.y 00147 #define c10_00_re C5.x 00148 #define c10_00_im C5.y 00149 #define c11_00_re C6.x 00150 #define c11_00_im C6.y 00151 #define c12_00_re C7.x 00152 #define c12_00_im C7.y 00153 #define c02_01_re C8.x 00154 #define c02_01_im C8.y 00155 #define c10_01_re C9.x 00156 #define c10_01_im C9.y 00157 #define c11_01_re C10.x 00158 #define c11_01_im C10.y 00159 #define c12_01_re C11.x 00160 #define c12_01_im C11.y 00161 #define c10_02_re C12.x 00162 #define c10_02_im C12.y 00163 #define c11_02_re C13.x 00164 #define c11_02_im C13.y 00165 #define c12_02_re C14.x 00166 #define c12_02_im C14.y 00167 #define c11_10_re C15.x 00168 #define c11_10_im C15.y 00169 #define c12_10_re C16.x 00170 #define c12_10_im C16.y 00171 #define c12_11_re C17.x 00172 #define c12_11_im C17.y 00173 #else 00174 #define c00_00_re C0.x 00175 #define c01_01_re C0.y 00176 #define c02_02_re C0.z 00177 #define c10_10_re C0.w 00178 #define c11_11_re C1.x 00179 #define c12_12_re C1.y 00180 #define c01_00_re C1.z 00181 #define c01_00_im C1.w 00182 #define c02_00_re C2.x 00183 #define c02_00_im C2.y 00184 #define c10_00_re C2.z 00185 #define c10_00_im C2.w 00186 #define c11_00_re C3.x 00187 #define c11_00_im C3.y 00188 #define c12_00_re C3.z 00189 #define c12_00_im C3.w 00190 #define c02_01_re C4.x 00191 #define c02_01_im C4.y 00192 #define c10_01_re C4.z 00193 #define c10_01_im C4.w 00194 #define c11_01_re C5.x 00195 #define c11_01_im C5.y 00196 #define c12_01_re C5.z 00197 #define c12_01_im C5.w 00198 #define c10_02_re C6.x 00199 #define c10_02_im C6.y 00200 #define c11_02_re C6.z 00201 #define c11_02_im C6.w 00202 #define c12_02_re C7.x 00203 #define c12_02_im C7.y 00204 #define c11_10_re C7.z 00205 #define c11_10_im C7.w 00206 #define c12_10_re C8.x 00207 #define c12_10_im C8.y 00208 #define c12_11_re C8.z 00209 #define c12_11_im C8.w 00210 #endif // CLOVER_DOUBLE 00211 00212 #define c00_01_re (+c01_00_re) 00213 #define c00_01_im (-c01_00_im) 00214 #define c00_02_re (+c02_00_re) 00215 #define c00_02_im (-c02_00_im) 00216 #define c01_02_re (+c02_01_re) 00217 #define c01_02_im (-c02_01_im) 00218 #define c00_10_re (+c10_00_re) 00219 #define c00_10_im (-c10_00_im) 00220 #define c01_10_re (+c10_01_re) 00221 #define c01_10_im (-c10_01_im) 00222 #define c02_10_re (+c10_02_re) 00223 #define c02_10_im (-c10_02_im) 00224 #define c00_11_re (+c11_00_re) 00225 #define c00_11_im (-c11_00_im) 00226 #define c01_11_re (+c11_01_re) 00227 #define c01_11_im (-c11_01_im) 00228 #define c02_11_re (+c11_02_re) 00229 #define c02_11_im (-c11_02_im) 00230 #define c10_11_re (+c11_10_re) 00231 #define c10_11_im (-c11_10_im) 00232 #define c00_12_re (+c12_00_re) 00233 #define c00_12_im (-c12_00_im) 00234 #define c01_12_re (+c12_01_re) 00235 #define c01_12_im (-c12_01_im) 00236 #define c02_12_re (+c12_02_re) 00237 #define c02_12_im (-c12_02_im) 00238 #define c10_12_re (+c12_10_re) 00239 #define c10_12_im (-c12_10_im) 00240 #define c11_12_re (+c12_11_re) 00241 #define c11_12_im (-c12_11_im) 00242 00243 // second chiral block of inverted clover term (reuses C0,...,C9) 00244 #define c20_20_re c00_00_re 00245 #define c21_20_re c01_00_re 00246 #define c21_20_im c01_00_im 00247 #define c22_20_re c02_00_re 00248 #define c22_20_im c02_00_im 00249 #define c30_20_re c10_00_re 00250 #define c30_20_im c10_00_im 00251 #define c31_20_re c11_00_re 00252 #define c31_20_im c11_00_im 00253 #define c32_20_re c12_00_re 00254 #define c32_20_im c12_00_im 00255 #define c20_21_re c00_01_re 00256 #define c20_21_im c00_01_im 00257 #define c21_21_re c01_01_re 00258 #define c22_21_re c02_01_re 00259 #define c22_21_im c02_01_im 00260 #define c30_21_re c10_01_re 00261 #define c30_21_im c10_01_im 00262 #define c31_21_re c11_01_re 00263 #define c31_21_im c11_01_im 00264 #define c32_21_re c12_01_re 00265 #define c32_21_im c12_01_im 00266 #define c20_22_re c00_02_re 00267 #define c20_22_im c00_02_im 00268 #define c21_22_re c01_02_re 00269 #define c21_22_im c01_02_im 00270 #define c22_22_re c02_02_re 00271 #define c30_22_re c10_02_re 00272 #define c30_22_im c10_02_im 00273 #define c31_22_re c11_02_re 00274 #define c31_22_im c11_02_im 00275 #define c32_22_re c12_02_re 00276 #define c32_22_im c12_02_im 00277 #define c20_30_re c00_10_re 00278 #define c20_30_im c00_10_im 00279 #define c21_30_re c01_10_re 00280 #define c21_30_im c01_10_im 00281 #define c22_30_re c02_10_re 00282 #define c22_30_im c02_10_im 00283 #define c30_30_re c10_10_re 00284 #define c31_30_re c11_10_re 00285 #define c31_30_im c11_10_im 00286 #define c32_30_re c12_10_re 00287 #define c32_30_im c12_10_im 00288 #define c20_31_re c00_11_re 00289 #define c20_31_im c00_11_im 00290 #define c21_31_re c01_11_re 00291 #define c21_31_im c01_11_im 00292 #define c22_31_re c02_11_re 00293 #define c22_31_im c02_11_im 00294 #define c30_31_re c10_11_re 00295 #define c30_31_im c10_11_im 00296 #define c31_31_re c11_11_re 00297 #define c32_31_re c12_11_re 00298 #define c32_31_im c12_11_im 00299 #define c20_32_re c00_12_re 00300 #define c20_32_im c00_12_im 00301 #define c21_32_re c01_12_re 00302 #define c21_32_im c01_12_im 00303 #define c22_32_re c02_12_re 00304 #define c22_32_im c02_12_im 00305 #define c30_32_re c10_12_re 00306 #define c30_32_im c10_12_im 00307 #define c31_32_re c11_12_re 00308 #define c31_32_im c11_12_im 00309 #define c32_32_re c12_12_re 00310 00311 // output spinor 00312 VOLATILE spinorFloat o00_re; 00313 VOLATILE spinorFloat o00_im; 00314 VOLATILE spinorFloat o01_re; 00315 VOLATILE spinorFloat o01_im; 00316 VOLATILE spinorFloat o02_re; 00317 VOLATILE spinorFloat o02_im; 00318 VOLATILE spinorFloat o10_re; 00319 VOLATILE spinorFloat o10_im; 00320 VOLATILE spinorFloat o11_re; 00321 VOLATILE spinorFloat o11_im; 00322 VOLATILE spinorFloat o12_re; 00323 VOLATILE spinorFloat o12_im; 00324 VOLATILE spinorFloat o20_re; 00325 VOLATILE spinorFloat o20_im; 00326 VOLATILE spinorFloat o21_re; 00327 VOLATILE spinorFloat o21_im; 00328 VOLATILE spinorFloat o22_re; 00329 VOLATILE spinorFloat o22_im; 00330 VOLATILE spinorFloat o30_re; 00331 VOLATILE spinorFloat o30_im; 00332 VOLATILE spinorFloat o31_re; 00333 VOLATILE spinorFloat o31_im; 00334 VOLATILE spinorFloat o32_re; 00335 VOLATILE spinorFloat o32_im; 00336 00337 #include "read_gauge.h" 00338 #include "read_clover.h" 00339 #include "io_spinor.h" 00340 00341 int x1, x2, x3, x4; 00342 int X; 00343 00344 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision 00345 int sp_norm_idx; 00346 #endif // MULTI_GPU half precision 00347 00348 int sid; 00349 00350 #ifdef MULTI_GPU 00351 int face_idx; 00352 if (kernel_type == INTERIOR_KERNEL) { 00353 #endif 00354 00355 sid = blockIdx.x*blockDim.x + threadIdx.x; 00356 if (sid >= param.threads) return; 00357 00358 // Inline by hand for the moment and assume even dimensions 00359 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity); 00360 00361 X = 2*sid; 00362 int aux1 = X / X1; 00363 x1 = X - aux1 * X1; 00364 int aux2 = aux1 / X2; 00365 x2 = aux1 - aux2 * X2; 00366 x4 = aux2 / X3; 00367 x3 = aux2 - x4 * X3; 00368 aux1 = (param.parity + x4 + x3 + x2) & 1; 00369 x1 += aux1; 00370 X += aux1; 00371 00372 o00_re = 0; o00_im = 0; 00373 o01_re = 0; o01_im = 0; 00374 o02_re = 0; o02_im = 0; 00375 o10_re = 0; o10_im = 0; 00376 o11_re = 0; o11_im = 0; 00377 o12_re = 0; o12_im = 0; 00378 o20_re = 0; o20_im = 0; 00379 o21_re = 0; o21_im = 0; 00380 o22_re = 0; o22_im = 0; 00381 o30_re = 0; o30_im = 0; 00382 o31_re = 0; o31_im = 0; 00383 o32_re = 0; o32_im = 0; 00384 00385 #ifdef MULTI_GPU 00386 } else { // exterior kernel 00387 00388 sid = blockIdx.x*blockDim.x + threadIdx.x; 00389 if (sid >= param.threads) return; 00390 00391 const int dim = static_cast<int>(kernel_type); 00392 const int face_volume = (param.threads >> 1); // volume of one face 00393 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 00394 face_idx = sid - face_num*face_volume; // index into the respective face 00395 00396 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 00397 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 00398 //sp_idx = face_idx + param.ghostOffset[dim]; 00399 00400 #if (DD_PREC==2) // half precision 00401 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)]; 00402 #endif 00403 00404 coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity); 00405 00406 READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid); 00407 00408 o00_re = i00_re; o00_im = i00_im; 00409 o01_re = i01_re; o01_im = i01_im; 00410 o02_re = i02_re; o02_im = i02_im; 00411 o10_re = i10_re; o10_im = i10_im; 00412 o11_re = i11_re; o11_im = i11_im; 00413 o12_re = i12_re; o12_im = i12_im; 00414 o20_re = i20_re; o20_im = i20_im; 00415 o21_re = i21_re; o21_im = i21_im; 00416 o22_re = i22_re; o22_im = i22_im; 00417 o30_re = i30_re; o30_im = i30_im; 00418 o31_re = i31_re; o31_im = i31_im; 00419 o32_re = i32_re; o32_im = i32_im; 00420 } 00421 #endif // MULTI_GPU 00422 00423 00424 #ifdef MULTI_GPU 00425 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) || 00426 (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) ) 00427 #endif 00428 { 00429 // Projector P0+ 00430 // 1 0 0 i 00431 // 0 1 i 0 00432 // 0 -i 1 0 00433 // -i 0 0 1 00434 00435 #ifdef MULTI_GPU 00436 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 : 00437 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00438 #else 00439 const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1; 00440 #endif 00441 00442 const int ga_idx = sid; 00443 00444 spinorFloat a0_re, a0_im; 00445 spinorFloat a1_re, a1_im; 00446 spinorFloat a2_re, a2_im; 00447 spinorFloat b0_re, b0_im; 00448 spinorFloat b1_re, b1_im; 00449 spinorFloat b2_re, b2_im; 00450 00451 #ifdef MULTI_GPU 00452 if (kernel_type == INTERIOR_KERNEL) { 00453 #endif 00454 00455 // read spinor from device memory 00456 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00457 00458 // project spinor into half spinors 00459 a0_re = +i00_re-i30_im; 00460 a0_im = +i00_im+i30_re; 00461 a1_re = +i01_re-i31_im; 00462 a1_im = +i01_im+i31_re; 00463 a2_re = +i02_re-i32_im; 00464 a2_im = +i02_im+i32_re; 00465 b0_re = +i10_re-i20_im; 00466 b0_im = +i10_im+i20_re; 00467 b1_re = +i11_re-i21_im; 00468 b1_im = +i11_im+i21_re; 00469 b2_re = +i12_re-i22_im; 00470 b2_im = +i12_im+i22_re; 00471 00472 #ifdef MULTI_GPU 00473 } else { 00474 00475 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00476 00477 // read half spinor from device memory 00478 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00479 00480 a0_re = i00_re; a0_im = i00_im; 00481 a1_re = i01_re; a1_im = i01_im; 00482 a2_re = i02_re; a2_im = i02_im; 00483 b0_re = i10_re; b0_im = i10_im; 00484 b1_re = i11_re; b1_im = i11_im; 00485 b2_re = i12_re; b2_im = i12_im; 00486 00487 } 00488 #endif // MULTI_GPU 00489 00490 // read gauge matrix from device memory 00491 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00492 00493 // reconstruct gauge matrix 00494 RECONSTRUCT_GAUGE_MATRIX(0); 00495 00496 // multiply row 0 00497 spinorFloat A0_re = 0; 00498 A0_re += g00_re * a0_re; 00499 A0_re -= g00_im * a0_im; 00500 A0_re += g01_re * a1_re; 00501 A0_re -= g01_im * a1_im; 00502 A0_re += g02_re * a2_re; 00503 A0_re -= g02_im * a2_im; 00504 spinorFloat A0_im = 0; 00505 A0_im += g00_re * a0_im; 00506 A0_im += g00_im * a0_re; 00507 A0_im += g01_re * a1_im; 00508 A0_im += g01_im * a1_re; 00509 A0_im += g02_re * a2_im; 00510 A0_im += g02_im * a2_re; 00511 spinorFloat B0_re = 0; 00512 B0_re += g00_re * b0_re; 00513 B0_re -= g00_im * b0_im; 00514 B0_re += g01_re * b1_re; 00515 B0_re -= g01_im * b1_im; 00516 B0_re += g02_re * b2_re; 00517 B0_re -= g02_im * b2_im; 00518 spinorFloat B0_im = 0; 00519 B0_im += g00_re * b0_im; 00520 B0_im += g00_im * b0_re; 00521 B0_im += g01_re * b1_im; 00522 B0_im += g01_im * b1_re; 00523 B0_im += g02_re * b2_im; 00524 B0_im += g02_im * b2_re; 00525 00526 // multiply row 1 00527 spinorFloat A1_re = 0; 00528 A1_re += g10_re * a0_re; 00529 A1_re -= g10_im * a0_im; 00530 A1_re += g11_re * a1_re; 00531 A1_re -= g11_im * a1_im; 00532 A1_re += g12_re * a2_re; 00533 A1_re -= g12_im * a2_im; 00534 spinorFloat A1_im = 0; 00535 A1_im += g10_re * a0_im; 00536 A1_im += g10_im * a0_re; 00537 A1_im += g11_re * a1_im; 00538 A1_im += g11_im * a1_re; 00539 A1_im += g12_re * a2_im; 00540 A1_im += g12_im * a2_re; 00541 spinorFloat B1_re = 0; 00542 B1_re += g10_re * b0_re; 00543 B1_re -= g10_im * b0_im; 00544 B1_re += g11_re * b1_re; 00545 B1_re -= g11_im * b1_im; 00546 B1_re += g12_re * b2_re; 00547 B1_re -= g12_im * b2_im; 00548 spinorFloat B1_im = 0; 00549 B1_im += g10_re * b0_im; 00550 B1_im += g10_im * b0_re; 00551 B1_im += g11_re * b1_im; 00552 B1_im += g11_im * b1_re; 00553 B1_im += g12_re * b2_im; 00554 B1_im += g12_im * b2_re; 00555 00556 // multiply row 2 00557 spinorFloat A2_re = 0; 00558 A2_re += g20_re * a0_re; 00559 A2_re -= g20_im * a0_im; 00560 A2_re += g21_re * a1_re; 00561 A2_re -= g21_im * a1_im; 00562 A2_re += g22_re * a2_re; 00563 A2_re -= g22_im * a2_im; 00564 spinorFloat A2_im = 0; 00565 A2_im += g20_re * a0_im; 00566 A2_im += g20_im * a0_re; 00567 A2_im += g21_re * a1_im; 00568 A2_im += g21_im * a1_re; 00569 A2_im += g22_re * a2_im; 00570 A2_im += g22_im * a2_re; 00571 spinorFloat B2_re = 0; 00572 B2_re += g20_re * b0_re; 00573 B2_re -= g20_im * b0_im; 00574 B2_re += g21_re * b1_re; 00575 B2_re -= g21_im * b1_im; 00576 B2_re += g22_re * b2_re; 00577 B2_re -= g22_im * b2_im; 00578 spinorFloat B2_im = 0; 00579 B2_im += g20_re * b0_im; 00580 B2_im += g20_im * b0_re; 00581 B2_im += g21_re * b1_im; 00582 B2_im += g21_im * b1_re; 00583 B2_im += g22_re * b2_im; 00584 B2_im += g22_im * b2_re; 00585 00586 o00_re += A0_re; 00587 o00_im += A0_im; 00588 o10_re += B0_re; 00589 o10_im += B0_im; 00590 o20_re += B0_im; 00591 o20_im -= B0_re; 00592 o30_re += A0_im; 00593 o30_im -= A0_re; 00594 00595 o01_re += A1_re; 00596 o01_im += A1_im; 00597 o11_re += B1_re; 00598 o11_im += B1_im; 00599 o21_re += B1_im; 00600 o21_im -= B1_re; 00601 o31_re += A1_im; 00602 o31_im -= A1_re; 00603 00604 o02_re += A2_re; 00605 o02_im += A2_im; 00606 o12_re += B2_re; 00607 o12_im += B2_im; 00608 o22_re += B2_im; 00609 o22_im -= B2_re; 00610 o32_re += A2_im; 00611 o32_im -= A2_re; 00612 00613 } 00614 00615 #ifdef MULTI_GPU 00616 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) || 00617 (kernel_type == EXTERIOR_KERNEL_X && x1==0) ) 00618 #endif 00619 { 00620 // Projector P0- 00621 // 1 0 0 -i 00622 // 0 1 -i 0 00623 // 0 i 1 0 00624 // i 0 0 1 00625 00626 #ifdef MULTI_GPU 00627 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 : 00628 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00629 #else 00630 const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1; 00631 #endif 00632 00633 #ifdef MULTI_GPU 00634 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 00635 #else 00636 const int ga_idx = sp_idx; 00637 #endif 00638 00639 spinorFloat a0_re, a0_im; 00640 spinorFloat a1_re, a1_im; 00641 spinorFloat a2_re, a2_im; 00642 spinorFloat b0_re, b0_im; 00643 spinorFloat b1_re, b1_im; 00644 spinorFloat b2_re, b2_im; 00645 00646 #ifdef MULTI_GPU 00647 if (kernel_type == INTERIOR_KERNEL) { 00648 #endif 00649 00650 // read spinor from device memory 00651 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00652 00653 // project spinor into half spinors 00654 a0_re = +i00_re+i30_im; 00655 a0_im = +i00_im-i30_re; 00656 a1_re = +i01_re+i31_im; 00657 a1_im = +i01_im-i31_re; 00658 a2_re = +i02_re+i32_im; 00659 a2_im = +i02_im-i32_re; 00660 b0_re = +i10_re+i20_im; 00661 b0_im = +i10_im-i20_re; 00662 b1_re = +i11_re+i21_im; 00663 b1_im = +i11_im-i21_re; 00664 b2_re = +i12_re+i22_im; 00665 b2_im = +i12_im-i22_re; 00666 00667 #ifdef MULTI_GPU 00668 } else { 00669 00670 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00671 00672 // read half spinor from device memory 00673 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 00674 00675 a0_re = i00_re; a0_im = i00_im; 00676 a1_re = i01_re; a1_im = i01_im; 00677 a2_re = i02_re; a2_im = i02_im; 00678 b0_re = i10_re; b0_im = i10_im; 00679 b1_re = i11_re; b1_im = i11_im; 00680 b2_re = i12_re; b2_im = i12_im; 00681 00682 } 00683 #endif // MULTI_GPU 00684 00685 // read gauge matrix from device memory 00686 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00687 00688 // reconstruct gauge matrix 00689 RECONSTRUCT_GAUGE_MATRIX(1); 00690 00691 // multiply row 0 00692 spinorFloat A0_re = 0; 00693 A0_re += gT00_re * a0_re; 00694 A0_re -= gT00_im * a0_im; 00695 A0_re += gT01_re * a1_re; 00696 A0_re -= gT01_im * a1_im; 00697 A0_re += gT02_re * a2_re; 00698 A0_re -= gT02_im * a2_im; 00699 spinorFloat A0_im = 0; 00700 A0_im += gT00_re * a0_im; 00701 A0_im += gT00_im * a0_re; 00702 A0_im += gT01_re * a1_im; 00703 A0_im += gT01_im * a1_re; 00704 A0_im += gT02_re * a2_im; 00705 A0_im += gT02_im * a2_re; 00706 spinorFloat B0_re = 0; 00707 B0_re += gT00_re * b0_re; 00708 B0_re -= gT00_im * b0_im; 00709 B0_re += gT01_re * b1_re; 00710 B0_re -= gT01_im * b1_im; 00711 B0_re += gT02_re * b2_re; 00712 B0_re -= gT02_im * b2_im; 00713 spinorFloat B0_im = 0; 00714 B0_im += gT00_re * b0_im; 00715 B0_im += gT00_im * b0_re; 00716 B0_im += gT01_re * b1_im; 00717 B0_im += gT01_im * b1_re; 00718 B0_im += gT02_re * b2_im; 00719 B0_im += gT02_im * b2_re; 00720 00721 // multiply row 1 00722 spinorFloat A1_re = 0; 00723 A1_re += gT10_re * a0_re; 00724 A1_re -= gT10_im * a0_im; 00725 A1_re += gT11_re * a1_re; 00726 A1_re -= gT11_im * a1_im; 00727 A1_re += gT12_re * a2_re; 00728 A1_re -= gT12_im * a2_im; 00729 spinorFloat A1_im = 0; 00730 A1_im += gT10_re * a0_im; 00731 A1_im += gT10_im * a0_re; 00732 A1_im += gT11_re * a1_im; 00733 A1_im += gT11_im * a1_re; 00734 A1_im += gT12_re * a2_im; 00735 A1_im += gT12_im * a2_re; 00736 spinorFloat B1_re = 0; 00737 B1_re += gT10_re * b0_re; 00738 B1_re -= gT10_im * b0_im; 00739 B1_re += gT11_re * b1_re; 00740 B1_re -= gT11_im * b1_im; 00741 B1_re += gT12_re * b2_re; 00742 B1_re -= gT12_im * b2_im; 00743 spinorFloat B1_im = 0; 00744 B1_im += gT10_re * b0_im; 00745 B1_im += gT10_im * b0_re; 00746 B1_im += gT11_re * b1_im; 00747 B1_im += gT11_im * b1_re; 00748 B1_im += gT12_re * b2_im; 00749 B1_im += gT12_im * b2_re; 00750 00751 // multiply row 2 00752 spinorFloat A2_re = 0; 00753 A2_re += gT20_re * a0_re; 00754 A2_re -= gT20_im * a0_im; 00755 A2_re += gT21_re * a1_re; 00756 A2_re -= gT21_im * a1_im; 00757 A2_re += gT22_re * a2_re; 00758 A2_re -= gT22_im * a2_im; 00759 spinorFloat A2_im = 0; 00760 A2_im += gT20_re * a0_im; 00761 A2_im += gT20_im * a0_re; 00762 A2_im += gT21_re * a1_im; 00763 A2_im += gT21_im * a1_re; 00764 A2_im += gT22_re * a2_im; 00765 A2_im += gT22_im * a2_re; 00766 spinorFloat B2_re = 0; 00767 B2_re += gT20_re * b0_re; 00768 B2_re -= gT20_im * b0_im; 00769 B2_re += gT21_re * b1_re; 00770 B2_re -= gT21_im * b1_im; 00771 B2_re += gT22_re * b2_re; 00772 B2_re -= gT22_im * b2_im; 00773 spinorFloat B2_im = 0; 00774 B2_im += gT20_re * b0_im; 00775 B2_im += gT20_im * b0_re; 00776 B2_im += gT21_re * b1_im; 00777 B2_im += gT21_im * b1_re; 00778 B2_im += gT22_re * b2_im; 00779 B2_im += gT22_im * b2_re; 00780 00781 o00_re += A0_re; 00782 o00_im += A0_im; 00783 o10_re += B0_re; 00784 o10_im += B0_im; 00785 o20_re -= B0_im; 00786 o20_im += B0_re; 00787 o30_re -= A0_im; 00788 o30_im += A0_re; 00789 00790 o01_re += A1_re; 00791 o01_im += A1_im; 00792 o11_re += B1_re; 00793 o11_im += B1_im; 00794 o21_re -= B1_im; 00795 o21_im += B1_re; 00796 o31_re -= A1_im; 00797 o31_im += A1_re; 00798 00799 o02_re += A2_re; 00800 o02_im += A2_im; 00801 o12_re += B2_re; 00802 o12_im += B2_im; 00803 o22_re -= B2_im; 00804 o22_im += B2_re; 00805 o32_re -= A2_im; 00806 o32_im += A2_re; 00807 00808 } 00809 00810 #ifdef MULTI_GPU 00811 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) || 00812 (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) ) 00813 #endif 00814 { 00815 // Projector P1+ 00816 // 1 0 0 1 00817 // 0 1 -1 0 00818 // 0 -1 1 0 00819 // 1 0 0 1 00820 00821 #ifdef MULTI_GPU 00822 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 : 00823 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 00824 #else 00825 const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1; 00826 #endif 00827 00828 const int ga_idx = sid; 00829 00830 spinorFloat a0_re, a0_im; 00831 spinorFloat a1_re, a1_im; 00832 spinorFloat a2_re, a2_im; 00833 spinorFloat b0_re, b0_im; 00834 spinorFloat b1_re, b1_im; 00835 spinorFloat b2_re, b2_im; 00836 00837 #ifdef MULTI_GPU 00838 if (kernel_type == INTERIOR_KERNEL) { 00839 #endif 00840 00841 // read spinor from device memory 00842 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00843 00844 // project spinor into half spinors 00845 a0_re = +i00_re+i30_re; 00846 a0_im = +i00_im+i30_im; 00847 a1_re = +i01_re+i31_re; 00848 a1_im = +i01_im+i31_im; 00849 a2_re = +i02_re+i32_re; 00850 a2_im = +i02_im+i32_im; 00851 b0_re = +i10_re-i20_re; 00852 b0_im = +i10_im-i20_im; 00853 b1_re = +i11_re-i21_re; 00854 b1_im = +i11_im-i21_im; 00855 b2_re = +i12_re-i22_re; 00856 b2_im = +i12_im-i22_im; 00857 00858 #ifdef MULTI_GPU 00859 } else { 00860 00861 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 00862 00863 // read half spinor from device memory 00864 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 00865 00866 a0_re = i00_re; a0_im = i00_im; 00867 a1_re = i01_re; a1_im = i01_im; 00868 a2_re = i02_re; a2_im = i02_im; 00869 b0_re = i10_re; b0_im = i10_im; 00870 b1_re = i11_re; b1_im = i11_im; 00871 b2_re = i12_re; b2_im = i12_im; 00872 00873 } 00874 #endif // MULTI_GPU 00875 00876 // read gauge matrix from device memory 00877 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00878 00879 // reconstruct gauge matrix 00880 RECONSTRUCT_GAUGE_MATRIX(2); 00881 00882 // multiply row 0 00883 spinorFloat A0_re = 0; 00884 A0_re += g00_re * a0_re; 00885 A0_re -= g00_im * a0_im; 00886 A0_re += g01_re * a1_re; 00887 A0_re -= g01_im * a1_im; 00888 A0_re += g02_re * a2_re; 00889 A0_re -= g02_im * a2_im; 00890 spinorFloat A0_im = 0; 00891 A0_im += g00_re * a0_im; 00892 A0_im += g00_im * a0_re; 00893 A0_im += g01_re * a1_im; 00894 A0_im += g01_im * a1_re; 00895 A0_im += g02_re * a2_im; 00896 A0_im += g02_im * a2_re; 00897 spinorFloat B0_re = 0; 00898 B0_re += g00_re * b0_re; 00899 B0_re -= g00_im * b0_im; 00900 B0_re += g01_re * b1_re; 00901 B0_re -= g01_im * b1_im; 00902 B0_re += g02_re * b2_re; 00903 B0_re -= g02_im * b2_im; 00904 spinorFloat B0_im = 0; 00905 B0_im += g00_re * b0_im; 00906 B0_im += g00_im * b0_re; 00907 B0_im += g01_re * b1_im; 00908 B0_im += g01_im * b1_re; 00909 B0_im += g02_re * b2_im; 00910 B0_im += g02_im * b2_re; 00911 00912 // multiply row 1 00913 spinorFloat A1_re = 0; 00914 A1_re += g10_re * a0_re; 00915 A1_re -= g10_im * a0_im; 00916 A1_re += g11_re * a1_re; 00917 A1_re -= g11_im * a1_im; 00918 A1_re += g12_re * a2_re; 00919 A1_re -= g12_im * a2_im; 00920 spinorFloat A1_im = 0; 00921 A1_im += g10_re * a0_im; 00922 A1_im += g10_im * a0_re; 00923 A1_im += g11_re * a1_im; 00924 A1_im += g11_im * a1_re; 00925 A1_im += g12_re * a2_im; 00926 A1_im += g12_im * a2_re; 00927 spinorFloat B1_re = 0; 00928 B1_re += g10_re * b0_re; 00929 B1_re -= g10_im * b0_im; 00930 B1_re += g11_re * b1_re; 00931 B1_re -= g11_im * b1_im; 00932 B1_re += g12_re * b2_re; 00933 B1_re -= g12_im * b2_im; 00934 spinorFloat B1_im = 0; 00935 B1_im += g10_re * b0_im; 00936 B1_im += g10_im * b0_re; 00937 B1_im += g11_re * b1_im; 00938 B1_im += g11_im * b1_re; 00939 B1_im += g12_re * b2_im; 00940 B1_im += g12_im * b2_re; 00941 00942 // multiply row 2 00943 spinorFloat A2_re = 0; 00944 A2_re += g20_re * a0_re; 00945 A2_re -= g20_im * a0_im; 00946 A2_re += g21_re * a1_re; 00947 A2_re -= g21_im * a1_im; 00948 A2_re += g22_re * a2_re; 00949 A2_re -= g22_im * a2_im; 00950 spinorFloat A2_im = 0; 00951 A2_im += g20_re * a0_im; 00952 A2_im += g20_im * a0_re; 00953 A2_im += g21_re * a1_im; 00954 A2_im += g21_im * a1_re; 00955 A2_im += g22_re * a2_im; 00956 A2_im += g22_im * a2_re; 00957 spinorFloat B2_re = 0; 00958 B2_re += g20_re * b0_re; 00959 B2_re -= g20_im * b0_im; 00960 B2_re += g21_re * b1_re; 00961 B2_re -= g21_im * b1_im; 00962 B2_re += g22_re * b2_re; 00963 B2_re -= g22_im * b2_im; 00964 spinorFloat B2_im = 0; 00965 B2_im += g20_re * b0_im; 00966 B2_im += g20_im * b0_re; 00967 B2_im += g21_re * b1_im; 00968 B2_im += g21_im * b1_re; 00969 B2_im += g22_re * b2_im; 00970 B2_im += g22_im * b2_re; 00971 00972 o00_re += A0_re; 00973 o00_im += A0_im; 00974 o10_re += B0_re; 00975 o10_im += B0_im; 00976 o20_re -= B0_re; 00977 o20_im -= B0_im; 00978 o30_re += A0_re; 00979 o30_im += A0_im; 00980 00981 o01_re += A1_re; 00982 o01_im += A1_im; 00983 o11_re += B1_re; 00984 o11_im += B1_im; 00985 o21_re -= B1_re; 00986 o21_im -= B1_im; 00987 o31_re += A1_re; 00988 o31_im += A1_im; 00989 00990 o02_re += A2_re; 00991 o02_im += A2_im; 00992 o12_re += B2_re; 00993 o12_im += B2_im; 00994 o22_re -= B2_re; 00995 o22_im -= B2_im; 00996 o32_re += A2_re; 00997 o32_im += A2_im; 00998 00999 } 01000 01001 #ifdef MULTI_GPU 01002 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) || 01003 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) ) 01004 #endif 01005 { 01006 // Projector P1- 01007 // 1 0 0 -1 01008 // 0 1 1 0 01009 // 0 1 1 0 01010 // -1 0 0 1 01011 01012 #ifdef MULTI_GPU 01013 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 : 01014 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01015 #else 01016 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1; 01017 #endif 01018 01019 #ifdef MULTI_GPU 01020 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01021 #else 01022 const int ga_idx = sp_idx; 01023 #endif 01024 01025 spinorFloat a0_re, a0_im; 01026 spinorFloat a1_re, a1_im; 01027 spinorFloat a2_re, a2_im; 01028 spinorFloat b0_re, b0_im; 01029 spinorFloat b1_re, b1_im; 01030 spinorFloat b2_re, b2_im; 01031 01032 #ifdef MULTI_GPU 01033 if (kernel_type == INTERIOR_KERNEL) { 01034 #endif 01035 01036 // read spinor from device memory 01037 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01038 01039 // project spinor into half spinors 01040 a0_re = +i00_re-i30_re; 01041 a0_im = +i00_im-i30_im; 01042 a1_re = +i01_re-i31_re; 01043 a1_im = +i01_im-i31_im; 01044 a2_re = +i02_re-i32_re; 01045 a2_im = +i02_im-i32_im; 01046 b0_re = +i10_re+i20_re; 01047 b0_im = +i10_im+i20_im; 01048 b1_re = +i11_re+i21_re; 01049 b1_im = +i11_im+i21_im; 01050 b2_re = +i12_re+i22_re; 01051 b2_im = +i12_im+i22_im; 01052 01053 #ifdef MULTI_GPU 01054 } else { 01055 01056 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01057 01058 // read half spinor from device memory 01059 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01060 01061 a0_re = i00_re; a0_im = i00_im; 01062 a1_re = i01_re; a1_im = i01_im; 01063 a2_re = i02_re; a2_im = i02_im; 01064 b0_re = i10_re; b0_im = i10_im; 01065 b1_re = i11_re; b1_im = i11_im; 01066 b2_re = i12_re; b2_im = i12_im; 01067 01068 } 01069 #endif // MULTI_GPU 01070 01071 // read gauge matrix from device memory 01072 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 01073 01074 // reconstruct gauge matrix 01075 RECONSTRUCT_GAUGE_MATRIX(3); 01076 01077 // multiply row 0 01078 spinorFloat A0_re = 0; 01079 A0_re += gT00_re * a0_re; 01080 A0_re -= gT00_im * a0_im; 01081 A0_re += gT01_re * a1_re; 01082 A0_re -= gT01_im * a1_im; 01083 A0_re += gT02_re * a2_re; 01084 A0_re -= gT02_im * a2_im; 01085 spinorFloat A0_im = 0; 01086 A0_im += gT00_re * a0_im; 01087 A0_im += gT00_im * a0_re; 01088 A0_im += gT01_re * a1_im; 01089 A0_im += gT01_im * a1_re; 01090 A0_im += gT02_re * a2_im; 01091 A0_im += gT02_im * a2_re; 01092 spinorFloat B0_re = 0; 01093 B0_re += gT00_re * b0_re; 01094 B0_re -= gT00_im * b0_im; 01095 B0_re += gT01_re * b1_re; 01096 B0_re -= gT01_im * b1_im; 01097 B0_re += gT02_re * b2_re; 01098 B0_re -= gT02_im * b2_im; 01099 spinorFloat B0_im = 0; 01100 B0_im += gT00_re * b0_im; 01101 B0_im += gT00_im * b0_re; 01102 B0_im += gT01_re * b1_im; 01103 B0_im += gT01_im * b1_re; 01104 B0_im += gT02_re * b2_im; 01105 B0_im += gT02_im * b2_re; 01106 01107 // multiply row 1 01108 spinorFloat A1_re = 0; 01109 A1_re += gT10_re * a0_re; 01110 A1_re -= gT10_im * a0_im; 01111 A1_re += gT11_re * a1_re; 01112 A1_re -= gT11_im * a1_im; 01113 A1_re += gT12_re * a2_re; 01114 A1_re -= gT12_im * a2_im; 01115 spinorFloat A1_im = 0; 01116 A1_im += gT10_re * a0_im; 01117 A1_im += gT10_im * a0_re; 01118 A1_im += gT11_re * a1_im; 01119 A1_im += gT11_im * a1_re; 01120 A1_im += gT12_re * a2_im; 01121 A1_im += gT12_im * a2_re; 01122 spinorFloat B1_re = 0; 01123 B1_re += gT10_re * b0_re; 01124 B1_re -= gT10_im * b0_im; 01125 B1_re += gT11_re * b1_re; 01126 B1_re -= gT11_im * b1_im; 01127 B1_re += gT12_re * b2_re; 01128 B1_re -= gT12_im * b2_im; 01129 spinorFloat B1_im = 0; 01130 B1_im += gT10_re * b0_im; 01131 B1_im += gT10_im * b0_re; 01132 B1_im += gT11_re * b1_im; 01133 B1_im += gT11_im * b1_re; 01134 B1_im += gT12_re * b2_im; 01135 B1_im += gT12_im * b2_re; 01136 01137 // multiply row 2 01138 spinorFloat A2_re = 0; 01139 A2_re += gT20_re * a0_re; 01140 A2_re -= gT20_im * a0_im; 01141 A2_re += gT21_re * a1_re; 01142 A2_re -= gT21_im * a1_im; 01143 A2_re += gT22_re * a2_re; 01144 A2_re -= gT22_im * a2_im; 01145 spinorFloat A2_im = 0; 01146 A2_im += gT20_re * a0_im; 01147 A2_im += gT20_im * a0_re; 01148 A2_im += gT21_re * a1_im; 01149 A2_im += gT21_im * a1_re; 01150 A2_im += gT22_re * a2_im; 01151 A2_im += gT22_im * a2_re; 01152 spinorFloat B2_re = 0; 01153 B2_re += gT20_re * b0_re; 01154 B2_re -= gT20_im * b0_im; 01155 B2_re += gT21_re * b1_re; 01156 B2_re -= gT21_im * b1_im; 01157 B2_re += gT22_re * b2_re; 01158 B2_re -= gT22_im * b2_im; 01159 spinorFloat B2_im = 0; 01160 B2_im += gT20_re * b0_im; 01161 B2_im += gT20_im * b0_re; 01162 B2_im += gT21_re * b1_im; 01163 B2_im += gT21_im * b1_re; 01164 B2_im += gT22_re * b2_im; 01165 B2_im += gT22_im * b2_re; 01166 01167 o00_re += A0_re; 01168 o00_im += A0_im; 01169 o10_re += B0_re; 01170 o10_im += B0_im; 01171 o20_re += B0_re; 01172 o20_im += B0_im; 01173 o30_re -= A0_re; 01174 o30_im -= A0_im; 01175 01176 o01_re += A1_re; 01177 o01_im += A1_im; 01178 o11_re += B1_re; 01179 o11_im += B1_im; 01180 o21_re += B1_re; 01181 o21_im += B1_im; 01182 o31_re -= A1_re; 01183 o31_im -= A1_im; 01184 01185 o02_re += A2_re; 01186 o02_im += A2_im; 01187 o12_re += B2_re; 01188 o12_im += B2_im; 01189 o22_re += B2_re; 01190 o22_im += B2_im; 01191 o32_re -= A2_re; 01192 o32_im -= A2_im; 01193 01194 } 01195 01196 #ifdef MULTI_GPU 01197 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) || 01198 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) ) 01199 #endif 01200 { 01201 // Projector P2+ 01202 // 1 0 i 0 01203 // 0 1 0 -i 01204 // -i 0 1 0 01205 // 0 i 0 1 01206 01207 #ifdef MULTI_GPU 01208 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 : 01209 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01210 #else 01211 const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1; 01212 #endif 01213 01214 const int ga_idx = sid; 01215 01216 spinorFloat a0_re, a0_im; 01217 spinorFloat a1_re, a1_im; 01218 spinorFloat a2_re, a2_im; 01219 spinorFloat b0_re, b0_im; 01220 spinorFloat b1_re, b1_im; 01221 spinorFloat b2_re, b2_im; 01222 01223 #ifdef MULTI_GPU 01224 if (kernel_type == INTERIOR_KERNEL) { 01225 #endif 01226 01227 // read spinor from device memory 01228 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01229 01230 // project spinor into half spinors 01231 a0_re = +i00_re-i20_im; 01232 a0_im = +i00_im+i20_re; 01233 a1_re = +i01_re-i21_im; 01234 a1_im = +i01_im+i21_re; 01235 a2_re = +i02_re-i22_im; 01236 a2_im = +i02_im+i22_re; 01237 b0_re = +i10_re+i30_im; 01238 b0_im = +i10_im-i30_re; 01239 b1_re = +i11_re+i31_im; 01240 b1_im = +i11_im-i31_re; 01241 b2_re = +i12_re+i32_im; 01242 b2_im = +i12_im-i32_re; 01243 01244 #ifdef MULTI_GPU 01245 } else { 01246 01247 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01248 01249 // read half spinor from device memory 01250 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01251 01252 a0_re = i00_re; a0_im = i00_im; 01253 a1_re = i01_re; a1_im = i01_im; 01254 a2_re = i02_re; a2_im = i02_im; 01255 b0_re = i10_re; b0_im = i10_im; 01256 b1_re = i11_re; b1_im = i11_im; 01257 b2_re = i12_re; b2_im = i12_im; 01258 01259 } 01260 #endif // MULTI_GPU 01261 01262 // read gauge matrix from device memory 01263 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 01264 01265 // reconstruct gauge matrix 01266 RECONSTRUCT_GAUGE_MATRIX(4); 01267 01268 // multiply row 0 01269 spinorFloat A0_re = 0; 01270 A0_re += g00_re * a0_re; 01271 A0_re -= g00_im * a0_im; 01272 A0_re += g01_re * a1_re; 01273 A0_re -= g01_im * a1_im; 01274 A0_re += g02_re * a2_re; 01275 A0_re -= g02_im * a2_im; 01276 spinorFloat A0_im = 0; 01277 A0_im += g00_re * a0_im; 01278 A0_im += g00_im * a0_re; 01279 A0_im += g01_re * a1_im; 01280 A0_im += g01_im * a1_re; 01281 A0_im += g02_re * a2_im; 01282 A0_im += g02_im * a2_re; 01283 spinorFloat B0_re = 0; 01284 B0_re += g00_re * b0_re; 01285 B0_re -= g00_im * b0_im; 01286 B0_re += g01_re * b1_re; 01287 B0_re -= g01_im * b1_im; 01288 B0_re += g02_re * b2_re; 01289 B0_re -= g02_im * b2_im; 01290 spinorFloat B0_im = 0; 01291 B0_im += g00_re * b0_im; 01292 B0_im += g00_im * b0_re; 01293 B0_im += g01_re * b1_im; 01294 B0_im += g01_im * b1_re; 01295 B0_im += g02_re * b2_im; 01296 B0_im += g02_im * b2_re; 01297 01298 // multiply row 1 01299 spinorFloat A1_re = 0; 01300 A1_re += g10_re * a0_re; 01301 A1_re -= g10_im * a0_im; 01302 A1_re += g11_re * a1_re; 01303 A1_re -= g11_im * a1_im; 01304 A1_re += g12_re * a2_re; 01305 A1_re -= g12_im * a2_im; 01306 spinorFloat A1_im = 0; 01307 A1_im += g10_re * a0_im; 01308 A1_im += g10_im * a0_re; 01309 A1_im += g11_re * a1_im; 01310 A1_im += g11_im * a1_re; 01311 A1_im += g12_re * a2_im; 01312 A1_im += g12_im * a2_re; 01313 spinorFloat B1_re = 0; 01314 B1_re += g10_re * b0_re; 01315 B1_re -= g10_im * b0_im; 01316 B1_re += g11_re * b1_re; 01317 B1_re -= g11_im * b1_im; 01318 B1_re += g12_re * b2_re; 01319 B1_re -= g12_im * b2_im; 01320 spinorFloat B1_im = 0; 01321 B1_im += g10_re * b0_im; 01322 B1_im += g10_im * b0_re; 01323 B1_im += g11_re * b1_im; 01324 B1_im += g11_im * b1_re; 01325 B1_im += g12_re * b2_im; 01326 B1_im += g12_im * b2_re; 01327 01328 // multiply row 2 01329 spinorFloat A2_re = 0; 01330 A2_re += g20_re * a0_re; 01331 A2_re -= g20_im * a0_im; 01332 A2_re += g21_re * a1_re; 01333 A2_re -= g21_im * a1_im; 01334 A2_re += g22_re * a2_re; 01335 A2_re -= g22_im * a2_im; 01336 spinorFloat A2_im = 0; 01337 A2_im += g20_re * a0_im; 01338 A2_im += g20_im * a0_re; 01339 A2_im += g21_re * a1_im; 01340 A2_im += g21_im * a1_re; 01341 A2_im += g22_re * a2_im; 01342 A2_im += g22_im * a2_re; 01343 spinorFloat B2_re = 0; 01344 B2_re += g20_re * b0_re; 01345 B2_re -= g20_im * b0_im; 01346 B2_re += g21_re * b1_re; 01347 B2_re -= g21_im * b1_im; 01348 B2_re += g22_re * b2_re; 01349 B2_re -= g22_im * b2_im; 01350 spinorFloat B2_im = 0; 01351 B2_im += g20_re * b0_im; 01352 B2_im += g20_im * b0_re; 01353 B2_im += g21_re * b1_im; 01354 B2_im += g21_im * b1_re; 01355 B2_im += g22_re * b2_im; 01356 B2_im += g22_im * b2_re; 01357 01358 o00_re += A0_re; 01359 o00_im += A0_im; 01360 o10_re += B0_re; 01361 o10_im += B0_im; 01362 o20_re += A0_im; 01363 o20_im -= A0_re; 01364 o30_re -= B0_im; 01365 o30_im += B0_re; 01366 01367 o01_re += A1_re; 01368 o01_im += A1_im; 01369 o11_re += B1_re; 01370 o11_im += B1_im; 01371 o21_re += A1_im; 01372 o21_im -= A1_re; 01373 o31_re -= B1_im; 01374 o31_im += B1_re; 01375 01376 o02_re += A2_re; 01377 o02_im += A2_im; 01378 o12_re += B2_re; 01379 o12_im += B2_im; 01380 o22_re += A2_im; 01381 o22_im -= A2_re; 01382 o32_re -= B2_im; 01383 o32_im += B2_re; 01384 01385 } 01386 01387 #ifdef MULTI_GPU 01388 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01389 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01390 #endif 01391 { 01392 // Projector P2- 01393 // 1 0 -i 0 01394 // 0 1 0 i 01395 // i 0 1 0 01396 // 0 -i 0 1 01397 01398 #ifdef MULTI_GPU 01399 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01400 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01401 #else 01402 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01403 #endif 01404 01405 #ifdef MULTI_GPU 01406 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01407 #else 01408 const int ga_idx = sp_idx; 01409 #endif 01410 01411 spinorFloat a0_re, a0_im; 01412 spinorFloat a1_re, a1_im; 01413 spinorFloat a2_re, a2_im; 01414 spinorFloat b0_re, b0_im; 01415 spinorFloat b1_re, b1_im; 01416 spinorFloat b2_re, b2_im; 01417 01418 #ifdef MULTI_GPU 01419 if (kernel_type == INTERIOR_KERNEL) { 01420 #endif 01421 01422 // read spinor from device memory 01423 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01424 01425 // project spinor into half spinors 01426 a0_re = +i00_re+i20_im; 01427 a0_im = +i00_im-i20_re; 01428 a1_re = +i01_re+i21_im; 01429 a1_im = +i01_im-i21_re; 01430 a2_re = +i02_re+i22_im; 01431 a2_im = +i02_im-i22_re; 01432 b0_re = +i10_re-i30_im; 01433 b0_im = +i10_im+i30_re; 01434 b1_re = +i11_re-i31_im; 01435 b1_im = +i11_im+i31_re; 01436 b2_re = +i12_re-i32_im; 01437 b2_im = +i12_im+i32_re; 01438 01439 #ifdef MULTI_GPU 01440 } else { 01441 01442 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01443 01444 // read half spinor from device memory 01445 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01446 01447 a0_re = i00_re; a0_im = i00_im; 01448 a1_re = i01_re; a1_im = i01_im; 01449 a2_re = i02_re; a2_im = i02_im; 01450 b0_re = i10_re; b0_im = i10_im; 01451 b1_re = i11_re; b1_im = i11_im; 01452 b2_re = i12_re; b2_im = i12_im; 01453 01454 } 01455 #endif // MULTI_GPU 01456 01457 // read gauge matrix from device memory 01458 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01459 01460 // reconstruct gauge matrix 01461 RECONSTRUCT_GAUGE_MATRIX(5); 01462 01463 // multiply row 0 01464 spinorFloat A0_re = 0; 01465 A0_re += gT00_re * a0_re; 01466 A0_re -= gT00_im * a0_im; 01467 A0_re += gT01_re * a1_re; 01468 A0_re -= gT01_im * a1_im; 01469 A0_re += gT02_re * a2_re; 01470 A0_re -= gT02_im * a2_im; 01471 spinorFloat A0_im = 0; 01472 A0_im += gT00_re * a0_im; 01473 A0_im += gT00_im * a0_re; 01474 A0_im += gT01_re * a1_im; 01475 A0_im += gT01_im * a1_re; 01476 A0_im += gT02_re * a2_im; 01477 A0_im += gT02_im * a2_re; 01478 spinorFloat B0_re = 0; 01479 B0_re += gT00_re * b0_re; 01480 B0_re -= gT00_im * b0_im; 01481 B0_re += gT01_re * b1_re; 01482 B0_re -= gT01_im * b1_im; 01483 B0_re += gT02_re * b2_re; 01484 B0_re -= gT02_im * b2_im; 01485 spinorFloat B0_im = 0; 01486 B0_im += gT00_re * b0_im; 01487 B0_im += gT00_im * b0_re; 01488 B0_im += gT01_re * b1_im; 01489 B0_im += gT01_im * b1_re; 01490 B0_im += gT02_re * b2_im; 01491 B0_im += gT02_im * b2_re; 01492 01493 // multiply row 1 01494 spinorFloat A1_re = 0; 01495 A1_re += gT10_re * a0_re; 01496 A1_re -= gT10_im * a0_im; 01497 A1_re += gT11_re * a1_re; 01498 A1_re -= gT11_im * a1_im; 01499 A1_re += gT12_re * a2_re; 01500 A1_re -= gT12_im * a2_im; 01501 spinorFloat A1_im = 0; 01502 A1_im += gT10_re * a0_im; 01503 A1_im += gT10_im * a0_re; 01504 A1_im += gT11_re * a1_im; 01505 A1_im += gT11_im * a1_re; 01506 A1_im += gT12_re * a2_im; 01507 A1_im += gT12_im * a2_re; 01508 spinorFloat B1_re = 0; 01509 B1_re += gT10_re * b0_re; 01510 B1_re -= gT10_im * b0_im; 01511 B1_re += gT11_re * b1_re; 01512 B1_re -= gT11_im * b1_im; 01513 B1_re += gT12_re * b2_re; 01514 B1_re -= gT12_im * b2_im; 01515 spinorFloat B1_im = 0; 01516 B1_im += gT10_re * b0_im; 01517 B1_im += gT10_im * b0_re; 01518 B1_im += gT11_re * b1_im; 01519 B1_im += gT11_im * b1_re; 01520 B1_im += gT12_re * b2_im; 01521 B1_im += gT12_im * b2_re; 01522 01523 // multiply row 2 01524 spinorFloat A2_re = 0; 01525 A2_re += gT20_re * a0_re; 01526 A2_re -= gT20_im * a0_im; 01527 A2_re += gT21_re * a1_re; 01528 A2_re -= gT21_im * a1_im; 01529 A2_re += gT22_re * a2_re; 01530 A2_re -= gT22_im * a2_im; 01531 spinorFloat A2_im = 0; 01532 A2_im += gT20_re * a0_im; 01533 A2_im += gT20_im * a0_re; 01534 A2_im += gT21_re * a1_im; 01535 A2_im += gT21_im * a1_re; 01536 A2_im += gT22_re * a2_im; 01537 A2_im += gT22_im * a2_re; 01538 spinorFloat B2_re = 0; 01539 B2_re += gT20_re * b0_re; 01540 B2_re -= gT20_im * b0_im; 01541 B2_re += gT21_re * b1_re; 01542 B2_re -= gT21_im * b1_im; 01543 B2_re += gT22_re * b2_re; 01544 B2_re -= gT22_im * b2_im; 01545 spinorFloat B2_im = 0; 01546 B2_im += gT20_re * b0_im; 01547 B2_im += gT20_im * b0_re; 01548 B2_im += gT21_re * b1_im; 01549 B2_im += gT21_im * b1_re; 01550 B2_im += gT22_re * b2_im; 01551 B2_im += gT22_im * b2_re; 01552 01553 o00_re += A0_re; 01554 o00_im += A0_im; 01555 o10_re += B0_re; 01556 o10_im += B0_im; 01557 o20_re -= A0_im; 01558 o20_im += A0_re; 01559 o30_re += B0_im; 01560 o30_im -= B0_re; 01561 01562 o01_re += A1_re; 01563 o01_im += A1_im; 01564 o11_re += B1_re; 01565 o11_im += B1_im; 01566 o21_re -= A1_im; 01567 o21_im += A1_re; 01568 o31_re += B1_im; 01569 o31_im -= B1_re; 01570 01571 o02_re += A2_re; 01572 o02_im += A2_im; 01573 o12_re += B2_re; 01574 o12_im += B2_im; 01575 o22_re -= A2_im; 01576 o22_im += A2_re; 01577 o32_re += B2_im; 01578 o32_im -= B2_re; 01579 01580 } 01581 01582 #ifdef MULTI_GPU 01583 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01584 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01585 #endif 01586 { 01587 // Projector P3+ 01588 // 2 0 0 0 01589 // 0 2 0 0 01590 // 0 0 0 0 01591 // 0 0 0 0 01592 01593 #ifdef MULTI_GPU 01594 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01595 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01596 #else 01597 const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01598 #endif 01599 01600 const int ga_idx = sid; 01601 01602 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01603 { 01604 spinorFloat a0_re, a0_im; 01605 spinorFloat a1_re, a1_im; 01606 spinorFloat a2_re, a2_im; 01607 spinorFloat b0_re, b0_im; 01608 spinorFloat b1_re, b1_im; 01609 spinorFloat b2_re, b2_im; 01610 01611 #ifdef MULTI_GPU 01612 if (kernel_type == INTERIOR_KERNEL) { 01613 #endif 01614 01615 // read spinor from device memory 01616 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01617 01618 // project spinor into half spinors 01619 a0_re = +2*i00_re; 01620 a0_im = +2*i00_im; 01621 a1_re = +2*i01_re; 01622 a1_im = +2*i01_im; 01623 a2_re = +2*i02_re; 01624 a2_im = +2*i02_im; 01625 b0_re = +2*i10_re; 01626 b0_im = +2*i10_im; 01627 b1_re = +2*i11_re; 01628 b1_im = +2*i11_im; 01629 b2_re = +2*i12_re; 01630 b2_im = +2*i12_im; 01631 01632 #ifdef MULTI_GPU 01633 } else { 01634 01635 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01636 const int t_proj_scale = TPROJSCALE; 01637 01638 // read half spinor from device memory 01639 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01640 01641 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01642 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01643 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01644 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01645 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01646 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01647 01648 } 01649 #endif // MULTI_GPU 01650 01651 // identity gauge matrix 01652 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01653 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01654 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01655 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01656 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01657 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01658 01659 o00_re += A0_re; 01660 o00_im += A0_im; 01661 o10_re += B0_re; 01662 o10_im += B0_im; 01663 01664 o01_re += A1_re; 01665 o01_im += A1_im; 01666 o11_re += B1_re; 01667 o11_im += B1_im; 01668 01669 o02_re += A2_re; 01670 o02_im += A2_im; 01671 o12_re += B2_re; 01672 o12_im += B2_im; 01673 01674 } else { 01675 spinorFloat a0_re, a0_im; 01676 spinorFloat a1_re, a1_im; 01677 spinorFloat a2_re, a2_im; 01678 spinorFloat b0_re, b0_im; 01679 spinorFloat b1_re, b1_im; 01680 spinorFloat b2_re, b2_im; 01681 01682 #ifdef MULTI_GPU 01683 if (kernel_type == INTERIOR_KERNEL) { 01684 #endif 01685 01686 // read spinor from device memory 01687 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01688 01689 // project spinor into half spinors 01690 a0_re = +2*i00_re; 01691 a0_im = +2*i00_im; 01692 a1_re = +2*i01_re; 01693 a1_im = +2*i01_im; 01694 a2_re = +2*i02_re; 01695 a2_im = +2*i02_im; 01696 b0_re = +2*i10_re; 01697 b0_im = +2*i10_im; 01698 b1_re = +2*i11_re; 01699 b1_im = +2*i11_im; 01700 b2_re = +2*i12_re; 01701 b2_im = +2*i12_im; 01702 01703 #ifdef MULTI_GPU 01704 } else { 01705 01706 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01707 const int t_proj_scale = TPROJSCALE; 01708 01709 // read half spinor from device memory 01710 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01711 01712 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01713 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01714 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01715 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01716 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01717 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01718 01719 } 01720 #endif // MULTI_GPU 01721 01722 // read gauge matrix from device memory 01723 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01724 01725 // reconstruct gauge matrix 01726 RECONSTRUCT_GAUGE_MATRIX(6); 01727 01728 // multiply row 0 01729 spinorFloat A0_re = 0; 01730 A0_re += g00_re * a0_re; 01731 A0_re -= g00_im * a0_im; 01732 A0_re += g01_re * a1_re; 01733 A0_re -= g01_im * a1_im; 01734 A0_re += g02_re * a2_re; 01735 A0_re -= g02_im * a2_im; 01736 spinorFloat A0_im = 0; 01737 A0_im += g00_re * a0_im; 01738 A0_im += g00_im * a0_re; 01739 A0_im += g01_re * a1_im; 01740 A0_im += g01_im * a1_re; 01741 A0_im += g02_re * a2_im; 01742 A0_im += g02_im * a2_re; 01743 spinorFloat B0_re = 0; 01744 B0_re += g00_re * b0_re; 01745 B0_re -= g00_im * b0_im; 01746 B0_re += g01_re * b1_re; 01747 B0_re -= g01_im * b1_im; 01748 B0_re += g02_re * b2_re; 01749 B0_re -= g02_im * b2_im; 01750 spinorFloat B0_im = 0; 01751 B0_im += g00_re * b0_im; 01752 B0_im += g00_im * b0_re; 01753 B0_im += g01_re * b1_im; 01754 B0_im += g01_im * b1_re; 01755 B0_im += g02_re * b2_im; 01756 B0_im += g02_im * b2_re; 01757 01758 // multiply row 1 01759 spinorFloat A1_re = 0; 01760 A1_re += g10_re * a0_re; 01761 A1_re -= g10_im * a0_im; 01762 A1_re += g11_re * a1_re; 01763 A1_re -= g11_im * a1_im; 01764 A1_re += g12_re * a2_re; 01765 A1_re -= g12_im * a2_im; 01766 spinorFloat A1_im = 0; 01767 A1_im += g10_re * a0_im; 01768 A1_im += g10_im * a0_re; 01769 A1_im += g11_re * a1_im; 01770 A1_im += g11_im * a1_re; 01771 A1_im += g12_re * a2_im; 01772 A1_im += g12_im * a2_re; 01773 spinorFloat B1_re = 0; 01774 B1_re += g10_re * b0_re; 01775 B1_re -= g10_im * b0_im; 01776 B1_re += g11_re * b1_re; 01777 B1_re -= g11_im * b1_im; 01778 B1_re += g12_re * b2_re; 01779 B1_re -= g12_im * b2_im; 01780 spinorFloat B1_im = 0; 01781 B1_im += g10_re * b0_im; 01782 B1_im += g10_im * b0_re; 01783 B1_im += g11_re * b1_im; 01784 B1_im += g11_im * b1_re; 01785 B1_im += g12_re * b2_im; 01786 B1_im += g12_im * b2_re; 01787 01788 // multiply row 2 01789 spinorFloat A2_re = 0; 01790 A2_re += g20_re * a0_re; 01791 A2_re -= g20_im * a0_im; 01792 A2_re += g21_re * a1_re; 01793 A2_re -= g21_im * a1_im; 01794 A2_re += g22_re * a2_re; 01795 A2_re -= g22_im * a2_im; 01796 spinorFloat A2_im = 0; 01797 A2_im += g20_re * a0_im; 01798 A2_im += g20_im * a0_re; 01799 A2_im += g21_re * a1_im; 01800 A2_im += g21_im * a1_re; 01801 A2_im += g22_re * a2_im; 01802 A2_im += g22_im * a2_re; 01803 spinorFloat B2_re = 0; 01804 B2_re += g20_re * b0_re; 01805 B2_re -= g20_im * b0_im; 01806 B2_re += g21_re * b1_re; 01807 B2_re -= g21_im * b1_im; 01808 B2_re += g22_re * b2_re; 01809 B2_re -= g22_im * b2_im; 01810 spinorFloat B2_im = 0; 01811 B2_im += g20_re * b0_im; 01812 B2_im += g20_im * b0_re; 01813 B2_im += g21_re * b1_im; 01814 B2_im += g21_im * b1_re; 01815 B2_im += g22_re * b2_im; 01816 B2_im += g22_im * b2_re; 01817 01818 o00_re += A0_re; 01819 o00_im += A0_im; 01820 o10_re += B0_re; 01821 o10_im += B0_im; 01822 01823 o01_re += A1_re; 01824 o01_im += A1_im; 01825 o11_re += B1_re; 01826 o11_im += B1_im; 01827 01828 o02_re += A2_re; 01829 o02_im += A2_im; 01830 o12_re += B2_re; 01831 o12_im += B2_im; 01832 01833 } 01834 } 01835 01836 #ifdef MULTI_GPU 01837 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) || 01838 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01839 #endif 01840 { 01841 // Projector P3- 01842 // 0 0 0 0 01843 // 0 0 0 0 01844 // 0 0 2 0 01845 // 0 0 0 2 01846 01847 #ifdef MULTI_GPU 01848 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01849 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01850 #else 01851 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01852 #endif 01853 01854 #ifdef MULTI_GPU 01855 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx); 01856 #else 01857 const int ga_idx = sp_idx; 01858 #endif 01859 01860 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01861 { 01862 spinorFloat a0_re, a0_im; 01863 spinorFloat a1_re, a1_im; 01864 spinorFloat a2_re, a2_im; 01865 spinorFloat b0_re, b0_im; 01866 spinorFloat b1_re, b1_im; 01867 spinorFloat b2_re, b2_im; 01868 01869 #ifdef MULTI_GPU 01870 if (kernel_type == INTERIOR_KERNEL) { 01871 #endif 01872 01873 // read spinor from device memory 01874 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01875 01876 // project spinor into half spinors 01877 a0_re = +2*i20_re; 01878 a0_im = +2*i20_im; 01879 a1_re = +2*i21_re; 01880 a1_im = +2*i21_im; 01881 a2_re = +2*i22_re; 01882 a2_im = +2*i22_im; 01883 b0_re = +2*i30_re; 01884 b0_im = +2*i30_im; 01885 b1_re = +2*i31_re; 01886 b1_im = +2*i31_im; 01887 b2_re = +2*i32_re; 01888 b2_im = +2*i32_im; 01889 01890 #ifdef MULTI_GPU 01891 } else { 01892 01893 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01894 const int t_proj_scale = TPROJSCALE; 01895 01896 // read half spinor from device memory 01897 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01898 01899 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01900 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01901 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01902 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01903 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01904 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01905 01906 } 01907 #endif // MULTI_GPU 01908 01909 // identity gauge matrix 01910 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01911 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01912 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01913 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01914 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01915 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01916 01917 o20_re += A0_re; 01918 o20_im += A0_im; 01919 o30_re += B0_re; 01920 o30_im += B0_im; 01921 01922 o21_re += A1_re; 01923 o21_im += A1_im; 01924 o31_re += B1_re; 01925 o31_im += B1_im; 01926 01927 o22_re += A2_re; 01928 o22_im += A2_im; 01929 o32_re += B2_re; 01930 o32_im += B2_im; 01931 01932 } else { 01933 spinorFloat a0_re, a0_im; 01934 spinorFloat a1_re, a1_im; 01935 spinorFloat a2_re, a2_im; 01936 spinorFloat b0_re, b0_im; 01937 spinorFloat b1_re, b1_im; 01938 spinorFloat b2_re, b2_im; 01939 01940 #ifdef MULTI_GPU 01941 if (kernel_type == INTERIOR_KERNEL) { 01942 #endif 01943 01944 // read spinor from device memory 01945 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01946 01947 // project spinor into half spinors 01948 a0_re = +2*i20_re; 01949 a0_im = +2*i20_im; 01950 a1_re = +2*i21_re; 01951 a1_im = +2*i21_im; 01952 a2_re = +2*i22_re; 01953 a2_im = +2*i22_im; 01954 b0_re = +2*i30_re; 01955 b0_im = +2*i30_im; 01956 b1_re = +2*i31_re; 01957 b1_im = +2*i31_im; 01958 b2_re = +2*i32_re; 01959 b2_im = +2*i32_im; 01960 01961 #ifdef MULTI_GPU 01962 } else { 01963 01964 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01965 const int t_proj_scale = TPROJSCALE; 01966 01967 // read half spinor from device memory 01968 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01969 01970 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01971 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01972 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01973 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01974 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01975 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01976 01977 } 01978 #endif // MULTI_GPU 01979 01980 // read gauge matrix from device memory 01981 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01982 01983 // reconstruct gauge matrix 01984 RECONSTRUCT_GAUGE_MATRIX(7); 01985 01986 // multiply row 0 01987 spinorFloat A0_re = 0; 01988 A0_re += gT00_re * a0_re; 01989 A0_re -= gT00_im * a0_im; 01990 A0_re += gT01_re * a1_re; 01991 A0_re -= gT01_im * a1_im; 01992 A0_re += gT02_re * a2_re; 01993 A0_re -= gT02_im * a2_im; 01994 spinorFloat A0_im = 0; 01995 A0_im += gT00_re * a0_im; 01996 A0_im += gT00_im * a0_re; 01997 A0_im += gT01_re * a1_im; 01998 A0_im += gT01_im * a1_re; 01999 A0_im += gT02_re * a2_im; 02000 A0_im += gT02_im * a2_re; 02001 spinorFloat B0_re = 0; 02002 B0_re += gT00_re * b0_re; 02003 B0_re -= gT00_im * b0_im; 02004 B0_re += gT01_re * b1_re; 02005 B0_re -= gT01_im * b1_im; 02006 B0_re += gT02_re * b2_re; 02007 B0_re -= gT02_im * b2_im; 02008 spinorFloat B0_im = 0; 02009 B0_im += gT00_re * b0_im; 02010 B0_im += gT00_im * b0_re; 02011 B0_im += gT01_re * b1_im; 02012 B0_im += gT01_im * b1_re; 02013 B0_im += gT02_re * b2_im; 02014 B0_im += gT02_im * b2_re; 02015 02016 // multiply row 1 02017 spinorFloat A1_re = 0; 02018 A1_re += gT10_re * a0_re; 02019 A1_re -= gT10_im * a0_im; 02020 A1_re += gT11_re * a1_re; 02021 A1_re -= gT11_im * a1_im; 02022 A1_re += gT12_re * a2_re; 02023 A1_re -= gT12_im * a2_im; 02024 spinorFloat A1_im = 0; 02025 A1_im += gT10_re * a0_im; 02026 A1_im += gT10_im * a0_re; 02027 A1_im += gT11_re * a1_im; 02028 A1_im += gT11_im * a1_re; 02029 A1_im += gT12_re * a2_im; 02030 A1_im += gT12_im * a2_re; 02031 spinorFloat B1_re = 0; 02032 B1_re += gT10_re * b0_re; 02033 B1_re -= gT10_im * b0_im; 02034 B1_re += gT11_re * b1_re; 02035 B1_re -= gT11_im * b1_im; 02036 B1_re += gT12_re * b2_re; 02037 B1_re -= gT12_im * b2_im; 02038 spinorFloat B1_im = 0; 02039 B1_im += gT10_re * b0_im; 02040 B1_im += gT10_im * b0_re; 02041 B1_im += gT11_re * b1_im; 02042 B1_im += gT11_im * b1_re; 02043 B1_im += gT12_re * b2_im; 02044 B1_im += gT12_im * b2_re; 02045 02046 // multiply row 2 02047 spinorFloat A2_re = 0; 02048 A2_re += gT20_re * a0_re; 02049 A2_re -= gT20_im * a0_im; 02050 A2_re += gT21_re * a1_re; 02051 A2_re -= gT21_im * a1_im; 02052 A2_re += gT22_re * a2_re; 02053 A2_re -= gT22_im * a2_im; 02054 spinorFloat A2_im = 0; 02055 A2_im += gT20_re * a0_im; 02056 A2_im += gT20_im * a0_re; 02057 A2_im += gT21_re * a1_im; 02058 A2_im += gT21_im * a1_re; 02059 A2_im += gT22_re * a2_im; 02060 A2_im += gT22_im * a2_re; 02061 spinorFloat B2_re = 0; 02062 B2_re += gT20_re * b0_re; 02063 B2_re -= gT20_im * b0_im; 02064 B2_re += gT21_re * b1_re; 02065 B2_re -= gT21_im * b1_im; 02066 B2_re += gT22_re * b2_re; 02067 B2_re -= gT22_im * b2_im; 02068 spinorFloat B2_im = 0; 02069 B2_im += gT20_re * b0_im; 02070 B2_im += gT20_im * b0_re; 02071 B2_im += gT21_re * b1_im; 02072 B2_im += gT21_im * b1_re; 02073 B2_im += gT22_re * b2_im; 02074 B2_im += gT22_im * b2_re; 02075 02076 o20_re += A0_re; 02077 o20_im += A0_im; 02078 o30_re += B0_re; 02079 o30_im += B0_im; 02080 02081 o21_re += A1_re; 02082 o21_im += A1_im; 02083 o31_re += B1_re; 02084 o31_im += B1_im; 02085 02086 o22_re += A2_re; 02087 o22_im += A2_im; 02088 o32_re += B2_re; 02089 o32_im += B2_im; 02090 02091 } 02092 } 02093 02094 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER) 02095 02096 int incomplete = 0; // Have all 8 contributions been computed for this site? 02097 02098 switch(kernel_type) { // intentional fall-through 02099 case INTERIOR_KERNEL: 02100 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1)); 02101 case EXTERIOR_KERNEL_T: 02102 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1)); 02103 case EXTERIOR_KERNEL_Z: 02104 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1)); 02105 case EXTERIOR_KERNEL_Y: 02106 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1)); 02107 } 02108 02109 if (!incomplete) 02110 #endif // MULTI_GPU 02111 { 02112 02113 #ifdef DSLASH_CLOVER 02114 02115 // change to chiral basis 02116 { 02117 spinorFloat a00_re = -o10_re - o30_re; 02118 spinorFloat a00_im = -o10_im - o30_im; 02119 spinorFloat a10_re = o00_re + o20_re; 02120 spinorFloat a10_im = o00_im + o20_im; 02121 spinorFloat a20_re = -o10_re + o30_re; 02122 spinorFloat a20_im = -o10_im + o30_im; 02123 spinorFloat a30_re = o00_re - o20_re; 02124 spinorFloat a30_im = o00_im - o20_im; 02125 02126 o00_re = a00_re; o00_im = a00_im; 02127 o10_re = a10_re; o10_im = a10_im; 02128 o20_re = a20_re; o20_im = a20_im; 02129 o30_re = a30_re; o30_im = a30_im; 02130 } 02131 02132 { 02133 spinorFloat a01_re = -o11_re - o31_re; 02134 spinorFloat a01_im = -o11_im - o31_im; 02135 spinorFloat a11_re = o01_re + o21_re; 02136 spinorFloat a11_im = o01_im + o21_im; 02137 spinorFloat a21_re = -o11_re + o31_re; 02138 spinorFloat a21_im = -o11_im + o31_im; 02139 spinorFloat a31_re = o01_re - o21_re; 02140 spinorFloat a31_im = o01_im - o21_im; 02141 02142 o01_re = a01_re; o01_im = a01_im; 02143 o11_re = a11_re; o11_im = a11_im; 02144 o21_re = a21_re; o21_im = a21_im; 02145 o31_re = a31_re; o31_im = a31_im; 02146 } 02147 02148 { 02149 spinorFloat a02_re = -o12_re - o32_re; 02150 spinorFloat a02_im = -o12_im - o32_im; 02151 spinorFloat a12_re = o02_re + o22_re; 02152 spinorFloat a12_im = o02_im + o22_im; 02153 spinorFloat a22_re = -o12_re + o32_re; 02154 spinorFloat a22_im = -o12_im + o32_im; 02155 spinorFloat a32_re = o02_re - o22_re; 02156 spinorFloat a32_im = o02_im - o22_im; 02157 02158 o02_re = a02_re; o02_im = a02_im; 02159 o12_re = a12_re; o12_im = a12_im; 02160 o22_re = a22_re; o22_im = a22_im; 02161 o32_re = a32_re; o32_im = a32_im; 02162 } 02163 02164 // apply first chiral block 02165 { 02166 READ_CLOVER(CLOVERTEX, 0) 02167 02168 spinorFloat a00_re = 0; spinorFloat a00_im = 0; 02169 spinorFloat a01_re = 0; spinorFloat a01_im = 0; 02170 spinorFloat a02_re = 0; spinorFloat a02_im = 0; 02171 spinorFloat a10_re = 0; spinorFloat a10_im = 0; 02172 spinorFloat a11_re = 0; spinorFloat a11_im = 0; 02173 spinorFloat a12_re = 0; spinorFloat a12_im = 0; 02174 02175 a00_re += c00_00_re * o00_re; 02176 a00_im += c00_00_re * o00_im; 02177 a00_re += c00_01_re * o01_re; 02178 a00_re -= c00_01_im * o01_im; 02179 a00_im += c00_01_re * o01_im; 02180 a00_im += c00_01_im * o01_re; 02181 a00_re += c00_02_re * o02_re; 02182 a00_re -= c00_02_im * o02_im; 02183 a00_im += c00_02_re * o02_im; 02184 a00_im += c00_02_im * o02_re; 02185 a00_re += c00_10_re * o10_re; 02186 a00_re -= c00_10_im * o10_im; 02187 a00_im += c00_10_re * o10_im; 02188 a00_im += c00_10_im * o10_re; 02189 a00_re += c00_11_re * o11_re; 02190 a00_re -= c00_11_im * o11_im; 02191 a00_im += c00_11_re * o11_im; 02192 a00_im += c00_11_im * o11_re; 02193 a00_re += c00_12_re * o12_re; 02194 a00_re -= c00_12_im * o12_im; 02195 a00_im += c00_12_re * o12_im; 02196 a00_im += c00_12_im * o12_re; 02197 02198 a01_re += c01_00_re * o00_re; 02199 a01_re -= c01_00_im * o00_im; 02200 a01_im += c01_00_re * o00_im; 02201 a01_im += c01_00_im * o00_re; 02202 a01_re += c01_01_re * o01_re; 02203 a01_im += c01_01_re * o01_im; 02204 a01_re += c01_02_re * o02_re; 02205 a01_re -= c01_02_im * o02_im; 02206 a01_im += c01_02_re * o02_im; 02207 a01_im += c01_02_im * o02_re; 02208 a01_re += c01_10_re * o10_re; 02209 a01_re -= c01_10_im * o10_im; 02210 a01_im += c01_10_re * o10_im; 02211 a01_im += c01_10_im * o10_re; 02212 a01_re += c01_11_re * o11_re; 02213 a01_re -= c01_11_im * o11_im; 02214 a01_im += c01_11_re * o11_im; 02215 a01_im += c01_11_im * o11_re; 02216 a01_re += c01_12_re * o12_re; 02217 a01_re -= c01_12_im * o12_im; 02218 a01_im += c01_12_re * o12_im; 02219 a01_im += c01_12_im * o12_re; 02220 02221 a02_re += c02_00_re * o00_re; 02222 a02_re -= c02_00_im * o00_im; 02223 a02_im += c02_00_re * o00_im; 02224 a02_im += c02_00_im * o00_re; 02225 a02_re += c02_01_re * o01_re; 02226 a02_re -= c02_01_im * o01_im; 02227 a02_im += c02_01_re * o01_im; 02228 a02_im += c02_01_im * o01_re; 02229 a02_re += c02_02_re * o02_re; 02230 a02_im += c02_02_re * o02_im; 02231 a02_re += c02_10_re * o10_re; 02232 a02_re -= c02_10_im * o10_im; 02233 a02_im += c02_10_re * o10_im; 02234 a02_im += c02_10_im * o10_re; 02235 a02_re += c02_11_re * o11_re; 02236 a02_re -= c02_11_im * o11_im; 02237 a02_im += c02_11_re * o11_im; 02238 a02_im += c02_11_im * o11_re; 02239 a02_re += c02_12_re * o12_re; 02240 a02_re -= c02_12_im * o12_im; 02241 a02_im += c02_12_re * o12_im; 02242 a02_im += c02_12_im * o12_re; 02243 02244 a10_re += c10_00_re * o00_re; 02245 a10_re -= c10_00_im * o00_im; 02246 a10_im += c10_00_re * o00_im; 02247 a10_im += c10_00_im * o00_re; 02248 a10_re += c10_01_re * o01_re; 02249 a10_re -= c10_01_im * o01_im; 02250 a10_im += c10_01_re * o01_im; 02251 a10_im += c10_01_im * o01_re; 02252 a10_re += c10_02_re * o02_re; 02253 a10_re -= c10_02_im * o02_im; 02254 a10_im += c10_02_re * o02_im; 02255 a10_im += c10_02_im * o02_re; 02256 a10_re += c10_10_re * o10_re; 02257 a10_im += c10_10_re * o10_im; 02258 a10_re += c10_11_re * o11_re; 02259 a10_re -= c10_11_im * o11_im; 02260 a10_im += c10_11_re * o11_im; 02261 a10_im += c10_11_im * o11_re; 02262 a10_re += c10_12_re * o12_re; 02263 a10_re -= c10_12_im * o12_im; 02264 a10_im += c10_12_re * o12_im; 02265 a10_im += c10_12_im * o12_re; 02266 02267 a11_re += c11_00_re * o00_re; 02268 a11_re -= c11_00_im * o00_im; 02269 a11_im += c11_00_re * o00_im; 02270 a11_im += c11_00_im * o00_re; 02271 a11_re += c11_01_re * o01_re; 02272 a11_re -= c11_01_im * o01_im; 02273 a11_im += c11_01_re * o01_im; 02274 a11_im += c11_01_im * o01_re; 02275 a11_re += c11_02_re * o02_re; 02276 a11_re -= c11_02_im * o02_im; 02277 a11_im += c11_02_re * o02_im; 02278 a11_im += c11_02_im * o02_re; 02279 a11_re += c11_10_re * o10_re; 02280 a11_re -= c11_10_im * o10_im; 02281 a11_im += c11_10_re * o10_im; 02282 a11_im += c11_10_im * o10_re; 02283 a11_re += c11_11_re * o11_re; 02284 a11_im += c11_11_re * o11_im; 02285 a11_re += c11_12_re * o12_re; 02286 a11_re -= c11_12_im * o12_im; 02287 a11_im += c11_12_re * o12_im; 02288 a11_im += c11_12_im * o12_re; 02289 02290 a12_re += c12_00_re * o00_re; 02291 a12_re -= c12_00_im * o00_im; 02292 a12_im += c12_00_re * o00_im; 02293 a12_im += c12_00_im * o00_re; 02294 a12_re += c12_01_re * o01_re; 02295 a12_re -= c12_01_im * o01_im; 02296 a12_im += c12_01_re * o01_im; 02297 a12_im += c12_01_im * o01_re; 02298 a12_re += c12_02_re * o02_re; 02299 a12_re -= c12_02_im * o02_im; 02300 a12_im += c12_02_re * o02_im; 02301 a12_im += c12_02_im * o02_re; 02302 a12_re += c12_10_re * o10_re; 02303 a12_re -= c12_10_im * o10_im; 02304 a12_im += c12_10_re * o10_im; 02305 a12_im += c12_10_im * o10_re; 02306 a12_re += c12_11_re * o11_re; 02307 a12_re -= c12_11_im * o11_im; 02308 a12_im += c12_11_re * o11_im; 02309 a12_im += c12_11_im * o11_re; 02310 a12_re += c12_12_re * o12_re; 02311 a12_im += c12_12_re * o12_im; 02312 02313 o00_re = a00_re; o00_im = a00_im; 02314 o01_re = a01_re; o01_im = a01_im; 02315 o02_re = a02_re; o02_im = a02_im; 02316 o10_re = a10_re; o10_im = a10_im; 02317 o11_re = a11_re; o11_im = a11_im; 02318 o12_re = a12_re; o12_im = a12_im; 02319 02320 } 02321 02322 // apply second chiral block 02323 { 02324 READ_CLOVER(CLOVERTEX, 1) 02325 02326 spinorFloat a20_re = 0; spinorFloat a20_im = 0; 02327 spinorFloat a21_re = 0; spinorFloat a21_im = 0; 02328 spinorFloat a22_re = 0; spinorFloat a22_im = 0; 02329 spinorFloat a30_re = 0; spinorFloat a30_im = 0; 02330 spinorFloat a31_re = 0; spinorFloat a31_im = 0; 02331 spinorFloat a32_re = 0; spinorFloat a32_im = 0; 02332 02333 a20_re += c20_20_re * o20_re; 02334 a20_im += c20_20_re * o20_im; 02335 a20_re += c20_21_re * o21_re; 02336 a20_re -= c20_21_im * o21_im; 02337 a20_im += c20_21_re * o21_im; 02338 a20_im += c20_21_im * o21_re; 02339 a20_re += c20_22_re * o22_re; 02340 a20_re -= c20_22_im * o22_im; 02341 a20_im += c20_22_re * o22_im; 02342 a20_im += c20_22_im * o22_re; 02343 a20_re += c20_30_re * o30_re; 02344 a20_re -= c20_30_im * o30_im; 02345 a20_im += c20_30_re * o30_im; 02346 a20_im += c20_30_im * o30_re; 02347 a20_re += c20_31_re * o31_re; 02348 a20_re -= c20_31_im * o31_im; 02349 a20_im += c20_31_re * o31_im; 02350 a20_im += c20_31_im * o31_re; 02351 a20_re += c20_32_re * o32_re; 02352 a20_re -= c20_32_im * o32_im; 02353 a20_im += c20_32_re * o32_im; 02354 a20_im += c20_32_im * o32_re; 02355 02356 a21_re += c21_20_re * o20_re; 02357 a21_re -= c21_20_im * o20_im; 02358 a21_im += c21_20_re * o20_im; 02359 a21_im += c21_20_im * o20_re; 02360 a21_re += c21_21_re * o21_re; 02361 a21_im += c21_21_re * o21_im; 02362 a21_re += c21_22_re * o22_re; 02363 a21_re -= c21_22_im * o22_im; 02364 a21_im += c21_22_re * o22_im; 02365 a21_im += c21_22_im * o22_re; 02366 a21_re += c21_30_re * o30_re; 02367 a21_re -= c21_30_im * o30_im; 02368 a21_im += c21_30_re * o30_im; 02369 a21_im += c21_30_im * o30_re; 02370 a21_re += c21_31_re * o31_re; 02371 a21_re -= c21_31_im * o31_im; 02372 a21_im += c21_31_re * o31_im; 02373 a21_im += c21_31_im * o31_re; 02374 a21_re += c21_32_re * o32_re; 02375 a21_re -= c21_32_im * o32_im; 02376 a21_im += c21_32_re * o32_im; 02377 a21_im += c21_32_im * o32_re; 02378 02379 a22_re += c22_20_re * o20_re; 02380 a22_re -= c22_20_im * o20_im; 02381 a22_im += c22_20_re * o20_im; 02382 a22_im += c22_20_im * o20_re; 02383 a22_re += c22_21_re * o21_re; 02384 a22_re -= c22_21_im * o21_im; 02385 a22_im += c22_21_re * o21_im; 02386 a22_im += c22_21_im * o21_re; 02387 a22_re += c22_22_re * o22_re; 02388 a22_im += c22_22_re * o22_im; 02389 a22_re += c22_30_re * o30_re; 02390 a22_re -= c22_30_im * o30_im; 02391 a22_im += c22_30_re * o30_im; 02392 a22_im += c22_30_im * o30_re; 02393 a22_re += c22_31_re * o31_re; 02394 a22_re -= c22_31_im * o31_im; 02395 a22_im += c22_31_re * o31_im; 02396 a22_im += c22_31_im * o31_re; 02397 a22_re += c22_32_re * o32_re; 02398 a22_re -= c22_32_im * o32_im; 02399 a22_im += c22_32_re * o32_im; 02400 a22_im += c22_32_im * o32_re; 02401 02402 a30_re += c30_20_re * o20_re; 02403 a30_re -= c30_20_im * o20_im; 02404 a30_im += c30_20_re * o20_im; 02405 a30_im += c30_20_im * o20_re; 02406 a30_re += c30_21_re * o21_re; 02407 a30_re -= c30_21_im * o21_im; 02408 a30_im += c30_21_re * o21_im; 02409 a30_im += c30_21_im * o21_re; 02410 a30_re += c30_22_re * o22_re; 02411 a30_re -= c30_22_im * o22_im; 02412 a30_im += c30_22_re * o22_im; 02413 a30_im += c30_22_im * o22_re; 02414 a30_re += c30_30_re * o30_re; 02415 a30_im += c30_30_re * o30_im; 02416 a30_re += c30_31_re * o31_re; 02417 a30_re -= c30_31_im * o31_im; 02418 a30_im += c30_31_re * o31_im; 02419 a30_im += c30_31_im * o31_re; 02420 a30_re += c30_32_re * o32_re; 02421 a30_re -= c30_32_im * o32_im; 02422 a30_im += c30_32_re * o32_im; 02423 a30_im += c30_32_im * o32_re; 02424 02425 a31_re += c31_20_re * o20_re; 02426 a31_re -= c31_20_im * o20_im; 02427 a31_im += c31_20_re * o20_im; 02428 a31_im += c31_20_im * o20_re; 02429 a31_re += c31_21_re * o21_re; 02430 a31_re -= c31_21_im * o21_im; 02431 a31_im += c31_21_re * o21_im; 02432 a31_im += c31_21_im * o21_re; 02433 a31_re += c31_22_re * o22_re; 02434 a31_re -= c31_22_im * o22_im; 02435 a31_im += c31_22_re * o22_im; 02436 a31_im += c31_22_im * o22_re; 02437 a31_re += c31_30_re * o30_re; 02438 a31_re -= c31_30_im * o30_im; 02439 a31_im += c31_30_re * o30_im; 02440 a31_im += c31_30_im * o30_re; 02441 a31_re += c31_31_re * o31_re; 02442 a31_im += c31_31_re * o31_im; 02443 a31_re += c31_32_re * o32_re; 02444 a31_re -= c31_32_im * o32_im; 02445 a31_im += c31_32_re * o32_im; 02446 a31_im += c31_32_im * o32_re; 02447 02448 a32_re += c32_20_re * o20_re; 02449 a32_re -= c32_20_im * o20_im; 02450 a32_im += c32_20_re * o20_im; 02451 a32_im += c32_20_im * o20_re; 02452 a32_re += c32_21_re * o21_re; 02453 a32_re -= c32_21_im * o21_im; 02454 a32_im += c32_21_re * o21_im; 02455 a32_im += c32_21_im * o21_re; 02456 a32_re += c32_22_re * o22_re; 02457 a32_re -= c32_22_im * o22_im; 02458 a32_im += c32_22_re * o22_im; 02459 a32_im += c32_22_im * o22_re; 02460 a32_re += c32_30_re * o30_re; 02461 a32_re -= c32_30_im * o30_im; 02462 a32_im += c32_30_re * o30_im; 02463 a32_im += c32_30_im * o30_re; 02464 a32_re += c32_31_re * o31_re; 02465 a32_re -= c32_31_im * o31_im; 02466 a32_im += c32_31_re * o31_im; 02467 a32_im += c32_31_im * o31_re; 02468 a32_re += c32_32_re * o32_re; 02469 a32_im += c32_32_re * o32_im; 02470 02471 o20_re = a20_re; o20_im = a20_im; 02472 o21_re = a21_re; o21_im = a21_im; 02473 o22_re = a22_re; o22_im = a22_im; 02474 o30_re = a30_re; o30_im = a30_im; 02475 o31_re = a31_re; o31_im = a31_im; 02476 o32_re = a32_re; o32_im = a32_im; 02477 02478 } 02479 02480 // change back from chiral basis 02481 // (note: required factor of 1/2 is included in clover term normalization) 02482 { 02483 spinorFloat a00_re = o10_re + o30_re; 02484 spinorFloat a00_im = o10_im + o30_im; 02485 spinorFloat a10_re = -o00_re - o20_re; 02486 spinorFloat a10_im = -o00_im - o20_im; 02487 spinorFloat a20_re = o10_re - o30_re; 02488 spinorFloat a20_im = o10_im - o30_im; 02489 spinorFloat a30_re = -o00_re + o20_re; 02490 spinorFloat a30_im = -o00_im + o20_im; 02491 02492 o00_re = a00_re; o00_im = a00_im; 02493 o10_re = a10_re; o10_im = a10_im; 02494 o20_re = a20_re; o20_im = a20_im; 02495 o30_re = a30_re; o30_im = a30_im; 02496 } 02497 02498 { 02499 spinorFloat a01_re = o11_re + o31_re; 02500 spinorFloat a01_im = o11_im + o31_im; 02501 spinorFloat a11_re = -o01_re - o21_re; 02502 spinorFloat a11_im = -o01_im - o21_im; 02503 spinorFloat a21_re = o11_re - o31_re; 02504 spinorFloat a21_im = o11_im - o31_im; 02505 spinorFloat a31_re = -o01_re + o21_re; 02506 spinorFloat a31_im = -o01_im + o21_im; 02507 02508 o01_re = a01_re; o01_im = a01_im; 02509 o11_re = a11_re; o11_im = a11_im; 02510 o21_re = a21_re; o21_im = a21_im; 02511 o31_re = a31_re; o31_im = a31_im; 02512 } 02513 02514 { 02515 spinorFloat a02_re = o12_re + o32_re; 02516 spinorFloat a02_im = o12_im + o32_im; 02517 spinorFloat a12_re = -o02_re - o22_re; 02518 spinorFloat a12_im = -o02_im - o22_im; 02519 spinorFloat a22_re = o12_re - o32_re; 02520 spinorFloat a22_im = o12_im - o32_im; 02521 spinorFloat a32_re = -o02_re + o22_re; 02522 spinorFloat a32_im = -o02_im + o22_im; 02523 02524 o02_re = a02_re; o02_im = a02_im; 02525 o12_re = a12_re; o12_im = a12_im; 02526 o22_re = a22_re; o22_im = a22_im; 02527 o32_re = a32_re; o32_im = a32_im; 02528 } 02529 02530 #endif // DSLASH_CLOVER 02531 02532 #ifdef DSLASH_XPAY 02533 02534 READ_ACCUM(ACCUMTEX, sp_stride) 02535 02536 #ifdef SPINOR_DOUBLE 02537 o00_re = a*o00_re + accum0.x; 02538 o00_im = a*o00_im + accum0.y; 02539 o01_re = a*o01_re + accum1.x; 02540 o01_im = a*o01_im + accum1.y; 02541 o02_re = a*o02_re + accum2.x; 02542 o02_im = a*o02_im + accum2.y; 02543 o10_re = a*o10_re + accum3.x; 02544 o10_im = a*o10_im + accum3.y; 02545 o11_re = a*o11_re + accum4.x; 02546 o11_im = a*o11_im + accum4.y; 02547 o12_re = a*o12_re + accum5.x; 02548 o12_im = a*o12_im + accum5.y; 02549 o20_re = a*o20_re + accum6.x; 02550 o20_im = a*o20_im + accum6.y; 02551 o21_re = a*o21_re + accum7.x; 02552 o21_im = a*o21_im + accum7.y; 02553 o22_re = a*o22_re + accum8.x; 02554 o22_im = a*o22_im + accum8.y; 02555 o30_re = a*o30_re + accum9.x; 02556 o30_im = a*o30_im + accum9.y; 02557 o31_re = a*o31_re + accum10.x; 02558 o31_im = a*o31_im + accum10.y; 02559 o32_re = a*o32_re + accum11.x; 02560 o32_im = a*o32_im + accum11.y; 02561 #else 02562 o00_re = a*o00_re + accum0.x; 02563 o00_im = a*o00_im + accum0.y; 02564 o01_re = a*o01_re + accum0.z; 02565 o01_im = a*o01_im + accum0.w; 02566 o02_re = a*o02_re + accum1.x; 02567 o02_im = a*o02_im + accum1.y; 02568 o10_re = a*o10_re + accum1.z; 02569 o10_im = a*o10_im + accum1.w; 02570 o11_re = a*o11_re + accum2.x; 02571 o11_im = a*o11_im + accum2.y; 02572 o12_re = a*o12_re + accum2.z; 02573 o12_im = a*o12_im + accum2.w; 02574 o20_re = a*o20_re + accum3.x; 02575 o20_im = a*o20_im + accum3.y; 02576 o21_re = a*o21_re + accum3.z; 02577 o21_im = a*o21_im + accum3.w; 02578 o22_re = a*o22_re + accum4.x; 02579 o22_im = a*o22_im + accum4.y; 02580 o30_re = a*o30_re + accum4.z; 02581 o30_im = a*o30_im + accum4.w; 02582 o31_re = a*o31_re + accum5.x; 02583 o31_im = a*o31_im + accum5.y; 02584 o32_re = a*o32_re + accum5.z; 02585 o32_im = a*o32_im + accum5.w; 02586 #endif // SPINOR_DOUBLE 02587 02588 #endif // DSLASH_XPAY 02589 } 02590 02591 // write spinor field back to device memory 02592 WRITE_SPINOR(sp_stride); 02593 02594 // undefine to prevent warning when precision is changed 02595 #undef spinorFloat 02596 #undef A_re 02597 #undef A_im 02598 02599 #undef g00_re 02600 #undef g00_im 02601 #undef g01_re 02602 #undef g01_im 02603 #undef g02_re 02604 #undef g02_im 02605 #undef g10_re 02606 #undef g10_im 02607 #undef g11_re 02608 #undef g11_im 02609 #undef g12_re 02610 #undef g12_im 02611 #undef g20_re 02612 #undef g20_im 02613 #undef g21_re 02614 #undef g21_im 02615 #undef g22_re 02616 #undef g22_im 02617 02618 #undef i00_re 02619 #undef i00_im 02620 #undef i01_re 02621 #undef i01_im 02622 #undef i02_re 02623 #undef i02_im 02624 #undef i10_re 02625 #undef i10_im 02626 #undef i11_re 02627 #undef i11_im 02628 #undef i12_re 02629 #undef i12_im 02630 #undef i20_re 02631 #undef i20_im 02632 #undef i21_re 02633 #undef i21_im 02634 #undef i22_re 02635 #undef i22_im 02636 #undef i30_re 02637 #undef i30_im 02638 #undef i31_re 02639 #undef i31_im 02640 #undef i32_re 02641 #undef i32_im 02642 02643 #undef c00_00_re 02644 #undef c01_01_re 02645 #undef c02_02_re 02646 #undef c10_10_re 02647 #undef c11_11_re 02648 #undef c12_12_re 02649 #undef c01_00_re 02650 #undef c01_00_im 02651 #undef c02_00_re 02652 #undef c02_00_im 02653 #undef c10_00_re 02654 #undef c10_00_im 02655 #undef c11_00_re 02656 #undef c11_00_im 02657 #undef c12_00_re 02658 #undef c12_00_im 02659 #undef c02_01_re 02660 #undef c02_01_im 02661 #undef c10_01_re 02662 #undef c10_01_im 02663 #undef c11_01_re 02664 #undef c11_01_im 02665 #undef c12_01_re 02666 #undef c12_01_im 02667 #undef c10_02_re 02668 #undef c10_02_im 02669 #undef c11_02_re 02670 #undef c11_02_im 02671 #undef c12_02_re 02672 #undef c12_02_im 02673 #undef c11_10_re 02674 #undef c11_10_im 02675 #undef c12_10_re 02676 #undef c12_10_im 02677 #undef c12_11_re 02678 #undef c12_11_im 02679 02680 02681 #undef VOLATILE