QUDA v0.4.0
A library for QCD on GPUs
00001 // *** CUDA DSLASH DAGGER *** 00002 00003 #define DSLASH_SHARED_FLOATS_PER_THREAD 24 00004 00005 00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 00007 #define VOLATILE 00008 #else // Open64 compiler 00009 #define VOLATILE volatile 00010 #endif 00011 // input spinor 00012 #ifdef SPINOR_DOUBLE 00013 #define spinorFloat double 00014 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2 00015 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2 00016 #define i00_re I0.x 00017 #define i00_im I0.y 00018 #define i01_re I1.x 00019 #define i01_im I1.y 00020 #define i02_re I2.x 00021 #define i02_im I2.y 00022 #define i10_re I3.x 00023 #define i10_im I3.y 00024 #define i11_re I4.x 00025 #define i11_im I4.y 00026 #define i12_re I5.x 00027 #define i12_im I5.y 00028 #define i20_re I6.x 00029 #define i20_im I6.y 00030 #define i21_re I7.x 00031 #define i21_im I7.y 00032 #define i22_re I8.x 00033 #define i22_im I8.y 00034 #define i30_re I9.x 00035 #define i30_im I9.y 00036 #define i31_re I10.x 00037 #define i31_im I10.y 00038 #define i32_re I11.x 00039 #define i32_im I11.y 00040 #else 00041 #define spinorFloat float 00042 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4 00043 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4 00044 #define i00_re I0.x 00045 #define i00_im I0.y 00046 #define i01_re I0.z 00047 #define i01_im I0.w 00048 #define i02_re I1.x 00049 #define i02_im I1.y 00050 #define i10_re I1.z 00051 #define i10_im I1.w 00052 #define i11_re I2.x 00053 #define i11_im I2.y 00054 #define i12_re I2.z 00055 #define i12_im I2.w 00056 #define i20_re I3.x 00057 #define i20_im I3.y 00058 #define i21_re I3.z 00059 #define i21_im I3.w 00060 #define i22_re I4.x 00061 #define i22_im I4.y 00062 #define i30_re I4.z 00063 #define i30_im I4.w 00064 #define i31_re I5.x 00065 #define i31_im I5.y 00066 #define i32_re I5.z 00067 #define i32_im I5.w 00068 #endif // SPINOR_DOUBLE 00069 00070 // gauge link 00071 #ifdef GAUGE_FLOAT2 00072 
#define g00_re G0.x 00073 #define g00_im G0.y 00074 #define g01_re G1.x 00075 #define g01_im G1.y 00076 #define g02_re G2.x 00077 #define g02_im G2.y 00078 #define g10_re G3.x 00079 #define g10_im G3.y 00080 #define g11_re G4.x 00081 #define g11_im G4.y 00082 #define g12_re G5.x 00083 #define g12_im G5.y 00084 #define g20_re G6.x 00085 #define g20_im G6.y 00086 #define g21_re G7.x 00087 #define g21_im G7.y 00088 #define g22_re G8.x 00089 #define g22_im G8.y 00090 // temporaries 00091 #define A_re G9.x 00092 #define A_im G9.y 00093 00094 #else 00095 #define g00_re G0.x 00096 #define g00_im G0.y 00097 #define g01_re G0.z 00098 #define g01_im G0.w 00099 #define g02_re G1.x 00100 #define g02_im G1.y 00101 #define g10_re G1.z 00102 #define g10_im G1.w 00103 #define g11_re G2.x 00104 #define g11_im G2.y 00105 #define g12_re G2.z 00106 #define g12_im G2.w 00107 #define g20_re G3.x 00108 #define g20_im G3.y 00109 #define g21_re G3.z 00110 #define g21_im G3.w 00111 #define g22_re G4.x 00112 #define g22_im G4.y 00113 // temporaries 00114 #define A_re G4.z 00115 #define A_im G4.w 00116 00117 #endif // GAUGE_DOUBLE 00118 00119 // conjugated gauge link 00120 #define gT00_re (+g00_re) 00121 #define gT00_im (-g00_im) 00122 #define gT01_re (+g10_re) 00123 #define gT01_im (-g10_im) 00124 #define gT02_re (+g20_re) 00125 #define gT02_im (-g20_im) 00126 #define gT10_re (+g01_re) 00127 #define gT10_im (-g01_im) 00128 #define gT11_re (+g11_re) 00129 #define gT11_im (-g11_im) 00130 #define gT12_re (+g21_re) 00131 #define gT12_im (-g21_im) 00132 #define gT20_re (+g02_re) 00133 #define gT20_im (-g02_im) 00134 #define gT21_re (+g12_re) 00135 #define gT21_im (-g12_im) 00136 #define gT22_re (+g22_re) 00137 #define gT22_im (-g22_im) 00138 00139 // first chiral block of inverted clover term 00140 #ifdef CLOVER_DOUBLE 00141 #define c00_00_re C0.x 00142 #define c01_01_re C0.y 00143 #define c02_02_re C1.x 00144 #define c10_10_re C1.y 00145 #define c11_11_re C2.x 00146 #define c12_12_re C2.y 00147 
#define c01_00_re C3.x 00148 #define c01_00_im C3.y 00149 #define c02_00_re C4.x 00150 #define c02_00_im C4.y 00151 #define c10_00_re C5.x 00152 #define c10_00_im C5.y 00153 #define c11_00_re C6.x 00154 #define c11_00_im C6.y 00155 #define c12_00_re C7.x 00156 #define c12_00_im C7.y 00157 #define c02_01_re C8.x 00158 #define c02_01_im C8.y 00159 #define c10_01_re C9.x 00160 #define c10_01_im C9.y 00161 #define c11_01_re C10.x 00162 #define c11_01_im C10.y 00163 #define c12_01_re C11.x 00164 #define c12_01_im C11.y 00165 #define c10_02_re C12.x 00166 #define c10_02_im C12.y 00167 #define c11_02_re C13.x 00168 #define c11_02_im C13.y 00169 #define c12_02_re C14.x 00170 #define c12_02_im C14.y 00171 #define c11_10_re C15.x 00172 #define c11_10_im C15.y 00173 #define c12_10_re C16.x 00174 #define c12_10_im C16.y 00175 #define c12_11_re C17.x 00176 #define c12_11_im C17.y 00177 #else 00178 #define c00_00_re C0.x 00179 #define c01_01_re C0.y 00180 #define c02_02_re C0.z 00181 #define c10_10_re C0.w 00182 #define c11_11_re C1.x 00183 #define c12_12_re C1.y 00184 #define c01_00_re C1.z 00185 #define c01_00_im C1.w 00186 #define c02_00_re C2.x 00187 #define c02_00_im C2.y 00188 #define c10_00_re C2.z 00189 #define c10_00_im C2.w 00190 #define c11_00_re C3.x 00191 #define c11_00_im C3.y 00192 #define c12_00_re C3.z 00193 #define c12_00_im C3.w 00194 #define c02_01_re C4.x 00195 #define c02_01_im C4.y 00196 #define c10_01_re C4.z 00197 #define c10_01_im C4.w 00198 #define c11_01_re C5.x 00199 #define c11_01_im C5.y 00200 #define c12_01_re C5.z 00201 #define c12_01_im C5.w 00202 #define c10_02_re C6.x 00203 #define c10_02_im C6.y 00204 #define c11_02_re C6.z 00205 #define c11_02_im C6.w 00206 #define c12_02_re C7.x 00207 #define c12_02_im C7.y 00208 #define c11_10_re C7.z 00209 #define c11_10_im C7.w 00210 #define c12_10_re C8.x 00211 #define c12_10_im C8.y 00212 #define c12_11_re C8.z 00213 #define c12_11_im C8.w 00214 #endif // CLOVER_DOUBLE 00215 00216 #define c00_01_re 
(+c01_00_re) 00217 #define c00_01_im (-c01_00_im) 00218 #define c00_02_re (+c02_00_re) 00219 #define c00_02_im (-c02_00_im) 00220 #define c01_02_re (+c02_01_re) 00221 #define c01_02_im (-c02_01_im) 00222 #define c00_10_re (+c10_00_re) 00223 #define c00_10_im (-c10_00_im) 00224 #define c01_10_re (+c10_01_re) 00225 #define c01_10_im (-c10_01_im) 00226 #define c02_10_re (+c10_02_re) 00227 #define c02_10_im (-c10_02_im) 00228 #define c00_11_re (+c11_00_re) 00229 #define c00_11_im (-c11_00_im) 00230 #define c01_11_re (+c11_01_re) 00231 #define c01_11_im (-c11_01_im) 00232 #define c02_11_re (+c11_02_re) 00233 #define c02_11_im (-c11_02_im) 00234 #define c10_11_re (+c11_10_re) 00235 #define c10_11_im (-c11_10_im) 00236 #define c00_12_re (+c12_00_re) 00237 #define c00_12_im (-c12_00_im) 00238 #define c01_12_re (+c12_01_re) 00239 #define c01_12_im (-c12_01_im) 00240 #define c02_12_re (+c12_02_re) 00241 #define c02_12_im (-c12_02_im) 00242 #define c10_12_re (+c12_10_re) 00243 #define c10_12_im (-c12_10_im) 00244 #define c11_12_re (+c12_11_re) 00245 #define c11_12_im (-c12_11_im) 00246 00247 // second chiral block of inverted clover term (reuses C0,...,C9) 00248 #define c20_20_re c00_00_re 00249 #define c21_20_re c01_00_re 00250 #define c21_20_im c01_00_im 00251 #define c22_20_re c02_00_re 00252 #define c22_20_im c02_00_im 00253 #define c30_20_re c10_00_re 00254 #define c30_20_im c10_00_im 00255 #define c31_20_re c11_00_re 00256 #define c31_20_im c11_00_im 00257 #define c32_20_re c12_00_re 00258 #define c32_20_im c12_00_im 00259 #define c20_21_re c00_01_re 00260 #define c20_21_im c00_01_im 00261 #define c21_21_re c01_01_re 00262 #define c22_21_re c02_01_re 00263 #define c22_21_im c02_01_im 00264 #define c30_21_re c10_01_re 00265 #define c30_21_im c10_01_im 00266 #define c31_21_re c11_01_re 00267 #define c31_21_im c11_01_im 00268 #define c32_21_re c12_01_re 00269 #define c32_21_im c12_01_im 00270 #define c20_22_re c00_02_re 00271 #define c20_22_im c00_02_im 00272 #define 
c21_22_re c01_02_re 00273 #define c21_22_im c01_02_im 00274 #define c22_22_re c02_02_re 00275 #define c30_22_re c10_02_re 00276 #define c30_22_im c10_02_im 00277 #define c31_22_re c11_02_re 00278 #define c31_22_im c11_02_im 00279 #define c32_22_re c12_02_re 00280 #define c32_22_im c12_02_im 00281 #define c20_30_re c00_10_re 00282 #define c20_30_im c00_10_im 00283 #define c21_30_re c01_10_re 00284 #define c21_30_im c01_10_im 00285 #define c22_30_re c02_10_re 00286 #define c22_30_im c02_10_im 00287 #define c30_30_re c10_10_re 00288 #define c31_30_re c11_10_re 00289 #define c31_30_im c11_10_im 00290 #define c32_30_re c12_10_re 00291 #define c32_30_im c12_10_im 00292 #define c20_31_re c00_11_re 00293 #define c20_31_im c00_11_im 00294 #define c21_31_re c01_11_re 00295 #define c21_31_im c01_11_im 00296 #define c22_31_re c02_11_re 00297 #define c22_31_im c02_11_im 00298 #define c30_31_re c10_11_re 00299 #define c30_31_im c10_11_im 00300 #define c31_31_re c11_11_re 00301 #define c32_31_re c12_11_re 00302 #define c32_31_im c12_11_im 00303 #define c20_32_re c00_12_re 00304 #define c20_32_im c00_12_im 00305 #define c21_32_re c01_12_re 00306 #define c21_32_im c01_12_im 00307 #define c22_32_re c02_12_re 00308 #define c22_32_im c02_12_im 00309 #define c30_32_re c10_12_re 00310 #define c30_32_im c10_12_im 00311 #define c31_32_re c11_12_re 00312 #define c31_32_im c11_12_im 00313 #define c32_32_re c12_12_re 00314 00315 // output spinor 00316 VOLATILE spinorFloat o00_re; 00317 VOLATILE spinorFloat o00_im; 00318 VOLATILE spinorFloat o01_re; 00319 VOLATILE spinorFloat o01_im; 00320 VOLATILE spinorFloat o02_re; 00321 VOLATILE spinorFloat o02_im; 00322 VOLATILE spinorFloat o10_re; 00323 VOLATILE spinorFloat o10_im; 00324 VOLATILE spinorFloat o11_re; 00325 VOLATILE spinorFloat o11_im; 00326 VOLATILE spinorFloat o12_re; 00327 VOLATILE spinorFloat o12_im; 00328 VOLATILE spinorFloat o20_re; 00329 VOLATILE spinorFloat o20_im; 00330 VOLATILE spinorFloat o21_re; 00331 VOLATILE spinorFloat 
o21_im; 00332 VOLATILE spinorFloat o22_re; 00333 VOLATILE spinorFloat o22_im; 00334 VOLATILE spinorFloat o30_re; 00335 VOLATILE spinorFloat o30_im; 00336 VOLATILE spinorFloat o31_re; 00337 VOLATILE spinorFloat o31_im; 00338 VOLATILE spinorFloat o32_re; 00339 VOLATILE spinorFloat o32_im; 00340 00341 #ifdef SPINOR_DOUBLE 00342 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 00343 #else 00344 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 00345 #endif 00346 00347 #include "read_gauge.h" 00348 #include "read_clover.h" 00349 #include "io_spinor.h" 00350 00351 int x1, x2, x3, x4; 00352 int X; 00353 00354 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision 00355 int sp_norm_idx; 00356 #endif // MULTI_GPU half precision 00357 00358 int sid; 00359 00360 #ifdef MULTI_GPU 00361 int face_idx; 00362 if (kernel_type == INTERIOR_KERNEL) { 00363 #endif 00364 00365 // Inline by hand for the moment and assume even dimensions 00366 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity); 00367 00368 int xt = blockIdx.x*blockDim.x + threadIdx.x; 00369 int aux = xt+xt; 00370 if (aux >= X1*X4) return; 00371 00372 x4 = aux / X1; 00373 x1 = aux - x4*X1; 00374 00375 x2 = blockIdx.y*blockDim.y + threadIdx.y; 00376 if (x2 >= X2) return; 00377 00378 x3 = blockIdx.z*blockDim.z + threadIdx.z; 00379 if (x3 >= X3) return; 00380 00381 x1 += (param.parity + x4 + x3 + x2) &1; 00382 X = ((x4*X3 + x3)*X2 + x2)*X1 + x1; 00383 sid = X >> 1; 00384 00385 o00_re = 0; o00_im = 0; 00386 o01_re = 0; o01_im = 0; 00387 o02_re = 0; o02_im = 0; 00388 o10_re = 0; o10_im = 0; 00389 o11_re = 0; o11_im = 0; 00390 o12_re = 0; o12_im = 0; 00391 o20_re = 0; o20_im = 0; 00392 o21_re = 0; o21_im = 0; 00393 o22_re = 0; o22_im = 0; 00394 o30_re = 0; o30_im = 0; 00395 o31_re = 0; o31_im = 0; 00396 o32_re = 0; o32_im = 0; 00397 00398 #ifdef MULTI_GPU 00399 } else { // exterior kernel 00400 00401 sid = blockIdx.x*blockDim.x + threadIdx.x; 00402 if (sid >= param.threads) return; 00403 00404 
const int dim = static_cast<int>(kernel_type);
const int face_volume = (param.threads >> 1); // volume of one face
const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
face_idx = sid - face_num*face_volume; // index into the respective face

// ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
// face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
//sp_idx = face_idx + param.ghostOffset[dim];

#if (DD_PREC==2) // half precision
sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
#endif

coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity);

// exterior kernel: start from the partial result written by the interior pass
READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid);

o00_re = i00_re; o00_im = i00_im;
o01_re = i01_re; o01_im = i01_im;
o02_re = i02_re; o02_im = i02_im;
o10_re = i10_re; o10_im = i10_im;
o11_re = i11_re; o11_im = i11_im;
o12_re = i12_re; o12_im = i12_im;
o20_re = i20_re; o20_im = i20_im;
o21_re = i21_re; o21_im = i21_im;
o22_re = i22_re; o22_im = i22_im;
o30_re = i30_re; o30_im = i30_im;
o31_re = i31_re; o31_im = i31_im;
o32_re = i32_re; o32_im = i32_im;
}
#endif // MULTI_GPU


// ----- forward hop in x (+1 direction) -----
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
     (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
#endif
{
// Projector P0+
// 1 0 0 i
// 0 1 i 0
// 0 -i 1 0
// -i 0 0 1

// neighbor site index (wraps at the x1 boundary in the single-GPU case)
#ifdef MULTI_GPU
const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
#endif

const int ga_idx = sid;

// half-spinor after spin projection: two spin components x three colors
spinorFloat a0_re, a0_im;
spinorFloat a1_re, a1_im;
spinorFloat a2_re, a2_im;
spinorFloat b0_re, b0_im;
spinorFloat b1_re, b1_im;
spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
if (kernel_type == INTERIOR_KERNEL) {
#endif

// read spinor from device memory
READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

// store spinor into shared memory (re-read by the backward-hop blocks below)
WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);

// project spinor into half spinors
a0_re = +i00_re-i30_im;
a0_im = +i00_im+i30_re;
a1_re = +i01_re-i31_im;
a1_im = +i01_im+i31_re;
a2_re = +i02_re-i32_im;
a2_im = +i02_im+i32_re;
b0_re = +i10_re-i20_im;
b0_im = +i10_im+i20_re;
b1_re = +i11_re-i21_im;
b1_im = +i11_im+i21_re;
b2_re = +i12_re-i22_im;
b2_im = +i12_im+i22_re;

#ifdef MULTI_GPU
} else {

const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

// read half spinor from device memory
// NOTE(review): the (SPINOR_HOP/2)*sp_stride_pad offset presumably selects
// the second (already-projected) half of the ghost buffer — confirm against
// the face-packing code
READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

a0_re = i00_re; a0_im = i00_im;
a1_re = i01_re; a1_im = i01_im;
a2_re = i02_re; a2_im = i02_im;
b0_re = i10_re; b0_im = i10_im;
b1_re = i11_re; b1_im = i11_im;
b2_re = i12_re; b2_im = i12_im;

}
#endif // MULTI_GPU

// read gauge matrix from device memory
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);

// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(0);

// complex 3x3 link times both half-spinor columns: A = U*a, B = U*b
// multiply row 0
spinorFloat A0_re = 0;
A0_re += g00_re * a0_re;
A0_re -= g00_im * a0_im;
A0_re += g01_re * a1_re;
A0_re -= g01_im * a1_im;
A0_re += g02_re * a2_re;
A0_re -= g02_im * a2_im;
spinorFloat A0_im = 0;
A0_im += g00_re * a0_im;
A0_im += g00_im * a0_re;
A0_im += g01_re * a1_im;
A0_im += g01_im * a1_re;
A0_im += g02_re * a2_im;
A0_im += g02_im * a2_re;
spinorFloat B0_re = 0;
B0_re += g00_re * b0_re;
B0_re -= g00_im * b0_im;
B0_re += g01_re * b1_re;
B0_re -= g01_im * b1_im;
B0_re += g02_re * b2_re;
B0_re -= g02_im * b2_im;
spinorFloat B0_im = 0;
B0_im += g00_re * b0_im;
B0_im += g00_im * b0_re;
B0_im += g01_re * b1_im;
B0_im += g01_im * b1_re;
B0_im += g02_re * b2_im;
B0_im += g02_im * b2_re;

// multiply row 1
spinorFloat A1_re = 0;
A1_re += g10_re * a0_re;
A1_re -= g10_im * a0_im;
A1_re += g11_re * a1_re;
A1_re -= g11_im * a1_im;
A1_re += g12_re * a2_re;
A1_re -= g12_im * a2_im;
spinorFloat A1_im = 0;
A1_im += g10_re * a0_im;
A1_im += g10_im * a0_re;
A1_im += g11_re * a1_im;
A1_im += g11_im * a1_re;
A1_im += g12_re * a2_im;
A1_im += g12_im * a2_re;
spinorFloat B1_re = 0;
B1_re += g10_re * b0_re;
B1_re -= g10_im * b0_im;
B1_re += g11_re * b1_re;
B1_re -= g11_im * b1_im;
B1_re += g12_re * b2_re;
B1_re -= g12_im * b2_im;
spinorFloat B1_im = 0;
B1_im += g10_re * b0_im;
B1_im += g10_im * b0_re;
B1_im += g11_re * b1_im;
B1_im += g11_im * b1_re;
B1_im += g12_re * b2_im;
B1_im += g12_im * b2_re;

// multiply row 2
spinorFloat A2_re = 0;
A2_re += g20_re * a0_re;
A2_re -= g20_im * a0_im;
A2_re += g21_re * a1_re;
A2_re -= g21_im * a1_im;
A2_re += g22_re * a2_re;
A2_re -= g22_im * a2_im;
spinorFloat A2_im = 0;
A2_im += g20_re * a0_im;
A2_im += g20_im * a0_re;
A2_im += g21_re * a1_im;
A2_im += g21_im * a1_re;
A2_im += g22_re * a2_im;
A2_im += g22_im * a2_re;
spinorFloat B2_re = 0;
B2_re += g20_re * b0_re;
B2_re -= g20_im * b0_im;
B2_re += g21_re * b1_re;
B2_re -= g21_im * b1_im;
B2_re += g22_re * b2_re;
B2_re -= g22_im * b2_im;
spinorFloat B2_im = 0;
B2_im += g20_re * b0_im;
B2_im += g20_im * b0_re;
B2_im += g21_re * b1_im;
B2_im += g21_im * b1_re;
B2_im += g22_re * b2_im;
B2_im += g22_im * b2_re;

// reconstruct the four spin components from the two projected ones (P0+)
o00_re += A0_re;
o00_im += A0_im;
o10_re += B0_re;
o10_im += B0_im;
o20_re += B0_im;
o20_im -= B0_re;
o30_re += A0_im;
o30_im -= A0_re;

o01_re += A1_re;
o01_im += A1_im;
o11_re += B1_re;
o11_im += B1_im;
o21_re += B1_im;
o21_im -= B1_re;
o31_re += A1_im;
o31_im -= A1_re;

o02_re += A2_re;
o02_im += A2_im;
o12_re += B2_re;
o12_im += B2_im;
o22_re += B2_im;
o22_im -= B2_re;
o32_re += A2_im;
o32_im -= A2_re;

}

// ----- backward hop in x (-1 direction) -----
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
     (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
#endif
{
// Projector P0-
// 1 0 0 -i
// 0 1 -i 0
// 0 i 1 0
// i 0 0 1

#ifdef MULTI_GPU
const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
#endif

// backward hop uses the link stored at the neighbor site
#ifdef MULTI_GPU
const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
#else
const int ga_idx = sp_idx;
#endif

spinorFloat a0_re, a0_im;
spinorFloat a1_re, a1_im;
spinorFloat a2_re, a2_im;
spinorFloat b0_re, b0_im;
spinorFloat b1_re, b1_im;
spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
if (kernel_type == INTERIOR_KERNEL) {
#endif

// load spinor from shared memory: the -x neighbor was staged by the adjacent
// thread's WRITE_SPINOR_SHARED in the forward-x block above
int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
// NOTE(review): this __syncthreads() sits under a branch that, in the
// MULTI_GPU interior kernel with ghostDim[0] set, depends on x1 — confirm
// that all threads of a block reach it
__syncthreads();
READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);

// project spinor into half spinors
a0_re = +i00_re+i30_im;
a0_im = +i00_im-i30_re;
a1_re = +i01_re+i31_im;
a1_im = +i01_im-i31_re;
a2_re = +i02_re+i32_im;
a2_im = +i02_im-i32_re;
b0_re = +i10_re+i20_im;
b0_im = +i10_im-i20_re;
b1_re = +i11_re+i21_im;
b1_im = +i11_im-i21_re;
b2_re = +i12_re+i22_im;
b2_im = +i12_im-i22_re;

#ifdef MULTI_GPU
} else {

const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

// read half spinor from device memory
READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

a0_re = i00_re; a0_im = i00_im;
a1_re = i01_re; a1_im = i01_im;
a2_re = i02_re; a2_im = i02_im;
b0_re = i10_re; b0_im = i10_im;
b1_re = i11_re; b1_im = i11_im;
b2_re = i12_re; b2_im = i12_im;

}
#endif // MULTI_GPU

// read gauge matrix from device memory
READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);

// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(1);

// backward hop multiplies by the conjugated link gT = U^dagger
// multiply row 0
spinorFloat A0_re = 0;
A0_re += gT00_re * a0_re;
A0_re -= gT00_im * a0_im;
A0_re += gT01_re * a1_re;
A0_re -= gT01_im * a1_im;
A0_re += gT02_re * a2_re;
A0_re -= gT02_im * a2_im;
spinorFloat A0_im = 0;
A0_im += gT00_re * a0_im;
A0_im += gT00_im * a0_re;
A0_im += gT01_re * a1_im;
A0_im += gT01_im * a1_re;
A0_im += gT02_re * a2_im;
A0_im += gT02_im * a2_re;
spinorFloat B0_re = 0;
B0_re += gT00_re * b0_re;
B0_re -= gT00_im * b0_im;
B0_re += gT01_re * b1_re;
B0_re -= gT01_im * b1_im;
B0_re += gT02_re * b2_re;
B0_re -= gT02_im * b2_im;
spinorFloat B0_im = 0;
B0_im += gT00_re * b0_im;
B0_im += gT00_im * b0_re;
B0_im += gT01_re * b1_im;
B0_im += gT01_im * b1_re;
B0_im += gT02_re * b2_im;
B0_im += gT02_im * b2_re;

// multiply row 1
spinorFloat A1_re = 0;
A1_re += gT10_re * a0_re;
A1_re -= gT10_im * a0_im;
A1_re += gT11_re * a1_re;
A1_re -= gT11_im * a1_im;
A1_re += gT12_re * a2_re;
A1_re -= gT12_im * a2_im;
spinorFloat A1_im = 0;
A1_im += gT10_re * a0_im;
A1_im += gT10_im * a0_re;
A1_im += gT11_re * a1_im;
A1_im += gT11_im * a1_re;
A1_im += gT12_re * a2_im;
A1_im += gT12_im * a2_re;
spinorFloat B1_re = 0;
B1_re += gT10_re * b0_re;
B1_re -= gT10_im * b0_im;
B1_re += gT11_re * b1_re;
B1_re -= gT11_im * b1_im;
B1_re += gT12_re * b2_re;
B1_re -= gT12_im * b2_im;
spinorFloat B1_im = 0;
B1_im += gT10_re * b0_im;
B1_im += gT10_im * b0_re;
B1_im += gT11_re * b1_im;
B1_im += gT11_im * b1_re;
B1_im += gT12_re * b2_im;
B1_im += gT12_im * b2_re;

// multiply row 2
spinorFloat A2_re = 0;
A2_re += gT20_re * a0_re;
A2_re -= gT20_im * a0_im;
A2_re += gT21_re * a1_re;
A2_re -= gT21_im * a1_im;
A2_re += gT22_re * a2_re;
A2_re -= gT22_im * a2_im;
spinorFloat A2_im = 0;
A2_im += gT20_re * a0_im;
A2_im += gT20_im * a0_re;
A2_im += gT21_re * a1_im;
A2_im += gT21_im * a1_re;
A2_im += gT22_re * a2_im;
A2_im += gT22_im * a2_re;
spinorFloat B2_re = 0;
B2_re += gT20_re * b0_re;
B2_re -= gT20_im * b0_im;
B2_re += gT21_re * b1_re;
B2_re -= gT21_im * b1_im;
B2_re += gT22_re * b2_re;
B2_re -= gT22_im * b2_im;
spinorFloat B2_im = 0;
B2_im += gT20_re * b0_im;
B2_im += gT20_im * b0_re;
B2_im += gT21_re * b1_im;
B2_im += gT21_im * b1_re;
B2_im += gT22_re * b2_im;
B2_im += gT22_im * b2_re;

// reconstruct the four spin components (P0-): signs opposite to P0+
o00_re += A0_re;
o00_im += A0_im;
o10_re += B0_re;
o10_im += B0_im;
o20_re -= B0_im;
o20_im += B0_re;
o30_re -= A0_im;
o30_im += A0_re;

o01_re += A1_re;
o01_im += A1_im;
o11_re += B1_re;
o11_im += B1_im;
o21_re -= B1_im;
o21_im += B1_re;
o31_re -= A1_im;
o31_im += A1_re;

o02_re += A2_re;
o02_im += A2_im;
o12_re += B2_re;
o12_im += B2_im;
o22_re -= B2_im;
o22_im += B2_re;
o32_re -= A2_im;
o32_im += A2_re;

}

// ----- forward hop in y (+2 direction) -----
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
     (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
#endif
{
// Projector P1+
// 1 0 0 1
// 0 1 -1 0
// 0 -1 1 0
// 1 0 0 1

#ifdef MULTI_GPU
const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
#endif

const int ga_idx = sid;

spinorFloat a0_re, a0_im;
spinorFloat a1_re, a1_im;
spinorFloat a2_re, a2_im;
spinorFloat b0_re, b0_im;
spinorFloat b1_re, b1_im;
spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
if (kernel_type == INTERIOR_KERNEL) {
#endif

// the +y neighbor is in shared memory unless this thread sits on the block's
// upper y edge (and the block does not span the whole x2 extent)
if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
// read spinor from device memory
READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

// project spinor into half spinors
a0_re = +i00_re+i30_re;
a0_im = +i00_im+i30_im;
a1_re = +i01_re+i31_re;
a1_im = +i01_im+i31_im;
a2_re = +i02_re+i32_re;
a2_im = +i02_im+i32_im;
b0_re = +i10_re-i20_re;
b0_im = +i10_im-i20_im;
b1_re = +i11_re-i21_re;
b1_im = +i11_im-i21_im;
b2_re = +i12_re-i22_re;
b2_im = +i12_im-i22_im;
} else {
// load spinor from shared memory; tx compensates for the checkerboard
// x1 shift between neighboring x2 rows
int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
READ_SPINOR_SHARED(tx, ty, threadIdx.z);

// project spinor into half spinors
a0_re = +i00_re+i30_re;
a0_im = +i00_im+i30_im;
a1_re = +i01_re+i31_re;
a1_im = +i01_im+i31_im;
a2_re = +i02_re+i32_re;
a2_im = +i02_im+i32_im;
b0_re = +i10_re-i20_re;
b0_im = +i10_im-i20_im;
b1_re = +i11_re-i21_re;
b1_im = +i11_im-i21_im;
b2_re = +i12_re-i22_re;
b2_im = +i12_im-i22_im;
}

#ifdef MULTI_GPU
} else {

const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

// read half spinor from device memory
READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

a0_re = i00_re; a0_im = i00_im;
a1_re = i01_re; a1_im = i01_im;
a2_re = i02_re; a2_im = i02_im;
b0_re = i10_re; b0_im = i10_im;
b1_re = i11_re; b1_im = i11_im;
b2_re = i12_re; b2_im = i12_im;

}
#endif // MULTI_GPU

// read gauge matrix from device memory
READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);

// reconstruct gauge matrix
RECONSTRUCT_GAUGE_MATRIX(2);

// multiply row 0
spinorFloat A0_re = 0;
A0_re += g00_re * a0_re;
A0_re -= g00_im * a0_im;
A0_re += g01_re * a1_re;
A0_re -= g01_im * a1_im;
A0_re += g02_re * a2_re;
A0_re -= g02_im * a2_im;
spinorFloat A0_im = 0;
A0_im += g00_re * a0_im;
A0_im += g00_im * a0_re;
A0_im += g01_re * a1_im;
A0_im += g01_im * a1_re;
A0_im += g02_re * a2_im;
A0_im += g02_im * a2_re;
spinorFloat B0_re = 0;
B0_re += g00_re * b0_re;
B0_re -= g00_im * b0_im;
B0_re += g01_re * b1_re;
B0_re -= g01_im * b1_im;
B0_re += g02_re * b2_re;
B0_re -= g02_im * b2_im;
spinorFloat B0_im = 0;
B0_im += g00_re * b0_im;
B0_im += g00_im * b0_re;
B0_im += g01_re * b1_im;
B0_im += g01_im * b1_re;
B0_im += g02_re * b2_im;
B0_im += g02_im * b2_re;

// multiply row 1
spinorFloat A1_re = 0;
A1_re += g10_re * a0_re;
A1_re -= g10_im * a0_im;
A1_re += g11_re * a1_re;
A1_re -= g11_im * a1_im;
A1_re += g12_re * a2_re;
A1_re -= g12_im * a2_im;
spinorFloat A1_im = 0;
A1_im += g10_re * a0_im;
A1_im += g10_im * a0_re;
A1_im += g11_re * a1_im;
A1_im += g11_im * a1_re;
A1_im += g12_re * a2_im;
A1_im += g12_im * a2_re;
spinorFloat B1_re = 0;
B1_re += g10_re * b0_re;
B1_re -= g10_im * b0_im;
B1_re += g11_re * b1_re;
B1_re -= g11_im * b1_im;
B1_re += g12_re * b2_re;
B1_re -= g12_im * b2_im;
spinorFloat B1_im = 0;
B1_im += g10_re * b0_im;
B1_im += g10_im * b0_re;
B1_im += g11_re * b1_im;
B1_im += g11_im * b1_re;
B1_im += g12_re * b2_im;
B1_im += g12_im * b2_re;

// multiply row 2
spinorFloat A2_re = 0;
A2_re += g20_re * a0_re;
A2_re -= g20_im * a0_im;
A2_re += g21_re * a1_re;
A2_re -= g21_im * a1_im;
A2_re += g22_re * a2_re;
A2_re -= g22_im * a2_im;
spinorFloat A2_im = 0;
A2_im += g20_re * a0_im;
A2_im += g20_im * a0_re;
A2_im += g21_re * a1_im;
A2_im += g21_im * a1_re;
A2_im += g22_re * a2_im;
A2_im += g22_im * a2_re;
spinorFloat B2_re = 0;
B2_re += g20_re * b0_re;
B2_re -= g20_im * b0_im;
B2_re += g21_re * b1_re;
B2_re -= g21_im * b1_im;
B2_re += g22_re * b2_re;
B2_re -= g22_im * b2_im;
spinorFloat B2_im = 0;
B2_im += g20_re * b0_im;
B2_im += g20_im * b0_re;
B2_im += g21_re * b1_im;
B2_im += g21_im * b1_re;
B2_im += g22_re * b2_im;
B2_im += g22_im * b2_re;

// reconstruct the four spin components (P1+)
o00_re += A0_re;
o00_im += A0_im;
o10_re += B0_re;
o10_im += B0_im;
01015 o20_re -= B0_re; 01016 o20_im -= B0_im; 01017 o30_re += A0_re; 01018 o30_im += A0_im; 01019 01020 o01_re += A1_re; 01021 o01_im += A1_im; 01022 o11_re += B1_re; 01023 o11_im += B1_im; 01024 o21_re -= B1_re; 01025 o21_im -= B1_im; 01026 o31_re += A1_re; 01027 o31_im += A1_im; 01028 01029 o02_re += A2_re; 01030 o02_im += A2_im; 01031 o12_re += B2_re; 01032 o12_im += B2_im; 01033 o22_re -= B2_re; 01034 o22_im -= B2_im; 01035 o32_re += A2_re; 01036 o32_im += A2_im; 01037 01038 } 01039 01040 #ifdef MULTI_GPU 01041 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) || 01042 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) ) 01043 #endif 01044 { 01045 // Projector P1- 01046 // 1 0 0 -1 01047 // 0 1 1 0 01048 // 0 1 1 0 01049 // -1 0 0 1 01050 01051 #ifdef MULTI_GPU 01052 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 : 01053 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01054 #else 01055 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1; 01056 #endif 01057 01058 #ifdef MULTI_GPU 01059 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01060 #else 01061 const int ga_idx = sp_idx; 01062 #endif 01063 01064 spinorFloat a0_re, a0_im; 01065 spinorFloat a1_re, a1_im; 01066 spinorFloat a2_re, a2_im; 01067 spinorFloat b0_re, b0_im; 01068 spinorFloat b1_re, b1_im; 01069 spinorFloat b2_re, b2_im; 01070 01071 #ifdef MULTI_GPU 01072 if (kernel_type == INTERIOR_KERNEL) { 01073 #endif 01074 01075 if (threadIdx.y == 0 && blockDim.y < X2) { 01076 // read spinor from device memory 01077 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01078 01079 // project spinor into half spinors 01080 a0_re = +i00_re-i30_re; 01081 a0_im = +i00_im-i30_im; 01082 a1_re = +i01_re-i31_re; 01083 a1_im = +i01_im-i31_im; 01084 a2_re = +i02_re-i32_re; 01085 a2_im = +i02_im-i32_im; 01086 b0_re = +i10_re+i20_re; 01087 b0_im = +i10_im+i20_im; 01088 b1_re = +i11_re+i21_re; 01089 b1_im = +i11_im+i21_im; 01090 b2_re = +i12_re+i22_re; 01091 b2_im = +i12_im+i22_im; 01092 } else { 01093 // load spinor from shared memory 01094 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x; 01095 int ty = (threadIdx.y > 0) ? 
threadIdx.y - 1 : blockDim.y - 1; 01096 READ_SPINOR_SHARED(tx, ty, threadIdx.z); 01097 01098 // project spinor into half spinors 01099 a0_re = +i00_re-i30_re; 01100 a0_im = +i00_im-i30_im; 01101 a1_re = +i01_re-i31_re; 01102 a1_im = +i01_im-i31_im; 01103 a2_re = +i02_re-i32_re; 01104 a2_im = +i02_im-i32_im; 01105 b0_re = +i10_re+i20_re; 01106 b0_im = +i10_im+i20_im; 01107 b1_re = +i11_re+i21_re; 01108 b1_im = +i11_im+i21_im; 01109 b2_re = +i12_re+i22_re; 01110 b2_im = +i12_im+i22_im; 01111 } 01112 01113 #ifdef MULTI_GPU 01114 } else { 01115 01116 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01117 01118 // read half spinor from device memory 01119 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01120 01121 a0_re = i00_re; a0_im = i00_im; 01122 a1_re = i01_re; a1_im = i01_im; 01123 a2_re = i02_re; a2_im = i02_im; 01124 b0_re = i10_re; b0_im = i10_im; 01125 b1_re = i11_re; b1_im = i11_im; 01126 b2_re = i12_re; b2_im = i12_im; 01127 01128 } 01129 #endif // MULTI_GPU 01130 01131 // read gauge matrix from device memory 01132 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 01133 01134 // reconstruct gauge matrix 01135 RECONSTRUCT_GAUGE_MATRIX(3); 01136 01137 // multiply row 0 01138 spinorFloat A0_re = 0; 01139 A0_re += gT00_re * a0_re; 01140 A0_re -= gT00_im * a0_im; 01141 A0_re += gT01_re * a1_re; 01142 A0_re -= gT01_im * a1_im; 01143 A0_re += gT02_re * a2_re; 01144 A0_re -= gT02_im * a2_im; 01145 spinorFloat A0_im = 0; 01146 A0_im += gT00_re * a0_im; 01147 A0_im += gT00_im * a0_re; 01148 A0_im += gT01_re * a1_im; 01149 A0_im += gT01_im * a1_re; 01150 A0_im += gT02_re * a2_im; 01151 A0_im += gT02_im * a2_re; 01152 spinorFloat B0_re = 0; 01153 B0_re += gT00_re * b0_re; 01154 B0_re -= gT00_im * b0_im; 01155 B0_re += gT01_re * b1_re; 01156 B0_re -= gT01_im * b1_im; 01157 B0_re += gT02_re * b2_re; 01158 B0_re -= gT02_im * b2_im; 01159 spinorFloat B0_im = 0; 01160 B0_im += gT00_re * b0_im; 01161 B0_im += gT00_im * b0_re; 01162 
B0_im += gT01_re * b1_im; 01163 B0_im += gT01_im * b1_re; 01164 B0_im += gT02_re * b2_im; 01165 B0_im += gT02_im * b2_re; 01166 01167 // multiply row 1 01168 spinorFloat A1_re = 0; 01169 A1_re += gT10_re * a0_re; 01170 A1_re -= gT10_im * a0_im; 01171 A1_re += gT11_re * a1_re; 01172 A1_re -= gT11_im * a1_im; 01173 A1_re += gT12_re * a2_re; 01174 A1_re -= gT12_im * a2_im; 01175 spinorFloat A1_im = 0; 01176 A1_im += gT10_re * a0_im; 01177 A1_im += gT10_im * a0_re; 01178 A1_im += gT11_re * a1_im; 01179 A1_im += gT11_im * a1_re; 01180 A1_im += gT12_re * a2_im; 01181 A1_im += gT12_im * a2_re; 01182 spinorFloat B1_re = 0; 01183 B1_re += gT10_re * b0_re; 01184 B1_re -= gT10_im * b0_im; 01185 B1_re += gT11_re * b1_re; 01186 B1_re -= gT11_im * b1_im; 01187 B1_re += gT12_re * b2_re; 01188 B1_re -= gT12_im * b2_im; 01189 spinorFloat B1_im = 0; 01190 B1_im += gT10_re * b0_im; 01191 B1_im += gT10_im * b0_re; 01192 B1_im += gT11_re * b1_im; 01193 B1_im += gT11_im * b1_re; 01194 B1_im += gT12_re * b2_im; 01195 B1_im += gT12_im * b2_re; 01196 01197 // multiply row 2 01198 spinorFloat A2_re = 0; 01199 A2_re += gT20_re * a0_re; 01200 A2_re -= gT20_im * a0_im; 01201 A2_re += gT21_re * a1_re; 01202 A2_re -= gT21_im * a1_im; 01203 A2_re += gT22_re * a2_re; 01204 A2_re -= gT22_im * a2_im; 01205 spinorFloat A2_im = 0; 01206 A2_im += gT20_re * a0_im; 01207 A2_im += gT20_im * a0_re; 01208 A2_im += gT21_re * a1_im; 01209 A2_im += gT21_im * a1_re; 01210 A2_im += gT22_re * a2_im; 01211 A2_im += gT22_im * a2_re; 01212 spinorFloat B2_re = 0; 01213 B2_re += gT20_re * b0_re; 01214 B2_re -= gT20_im * b0_im; 01215 B2_re += gT21_re * b1_re; 01216 B2_re -= gT21_im * b1_im; 01217 B2_re += gT22_re * b2_re; 01218 B2_re -= gT22_im * b2_im; 01219 spinorFloat B2_im = 0; 01220 B2_im += gT20_re * b0_im; 01221 B2_im += gT20_im * b0_re; 01222 B2_im += gT21_re * b1_im; 01223 B2_im += gT21_im * b1_re; 01224 B2_im += gT22_re * b2_im; 01225 B2_im += gT22_im * b2_re; 01226 01227 o00_re += A0_re; 01228 o00_im += 
A0_im; 01229 o10_re += B0_re; 01230 o10_im += B0_im; 01231 o20_re += B0_re; 01232 o20_im += B0_im; 01233 o30_re -= A0_re; 01234 o30_im -= A0_im; 01235 01236 o01_re += A1_re; 01237 o01_im += A1_im; 01238 o11_re += B1_re; 01239 o11_im += B1_im; 01240 o21_re += B1_re; 01241 o21_im += B1_im; 01242 o31_re -= A1_re; 01243 o31_im -= A1_im; 01244 01245 o02_re += A2_re; 01246 o02_im += A2_im; 01247 o12_re += B2_re; 01248 o12_im += B2_im; 01249 o22_re += B2_re; 01250 o22_im += B2_im; 01251 o32_re -= A2_re; 01252 o32_im -= A2_im; 01253 01254 } 01255 01256 #ifdef MULTI_GPU 01257 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) || 01258 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) ) 01259 #endif 01260 { 01261 // Projector P2+ 01262 // 1 0 i 0 01263 // 0 1 0 -i 01264 // -i 0 1 0 01265 // 0 i 0 1 01266 01267 #ifdef MULTI_GPU 01268 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 : 01269 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01270 #else 01271 const int sp_idx = (x3==X3m1 ? 
X-X3X2X1mX2X1 : X+X2X1) >> 1; 01272 #endif 01273 01274 const int ga_idx = sid; 01275 01276 spinorFloat a0_re, a0_im; 01277 spinorFloat a1_re, a1_im; 01278 spinorFloat a2_re, a2_im; 01279 spinorFloat b0_re, b0_im; 01280 spinorFloat b1_re, b1_im; 01281 spinorFloat b2_re, b2_im; 01282 01283 #ifdef MULTI_GPU 01284 if (kernel_type == INTERIOR_KERNEL) { 01285 #endif 01286 01287 if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) { 01288 // read spinor from device memory 01289 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01290 01291 // project spinor into half spinors 01292 a0_re = +i00_re-i20_im; 01293 a0_im = +i00_im+i20_re; 01294 a1_re = +i01_re-i21_im; 01295 a1_im = +i01_im+i21_re; 01296 a2_re = +i02_re-i22_im; 01297 a2_im = +i02_im+i22_re; 01298 b0_re = +i10_re+i30_im; 01299 b0_im = +i10_im-i30_re; 01300 b1_re = +i11_re+i31_im; 01301 b1_im = +i11_im-i31_re; 01302 b2_re = +i12_re+i32_im; 01303 b2_im = +i12_im-i32_re; 01304 } else { 01305 // load spinor from shared memory 01306 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x; 01307 int tz = (threadIdx.z < blockDim.z - 1) ? 
threadIdx.z + 1 : 0; 01308 READ_SPINOR_SHARED(tx, threadIdx.y, tz); 01309 01310 // project spinor into half spinors 01311 a0_re = +i00_re-i20_im; 01312 a0_im = +i00_im+i20_re; 01313 a1_re = +i01_re-i21_im; 01314 a1_im = +i01_im+i21_re; 01315 a2_re = +i02_re-i22_im; 01316 a2_im = +i02_im+i22_re; 01317 b0_re = +i10_re+i30_im; 01318 b0_im = +i10_im-i30_re; 01319 b1_re = +i11_re+i31_im; 01320 b1_im = +i11_im-i31_re; 01321 b2_re = +i12_re+i32_im; 01322 b2_im = +i12_im-i32_re; 01323 } 01324 01325 #ifdef MULTI_GPU 01326 } else { 01327 01328 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01329 01330 // read half spinor from device memory 01331 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01332 01333 a0_re = i00_re; a0_im = i00_im; 01334 a1_re = i01_re; a1_im = i01_im; 01335 a2_re = i02_re; a2_im = i02_im; 01336 b0_re = i10_re; b0_im = i10_im; 01337 b1_re = i11_re; b1_im = i11_im; 01338 b2_re = i12_re; b2_im = i12_im; 01339 01340 } 01341 #endif // MULTI_GPU 01342 01343 // read gauge matrix from device memory 01344 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 01345 01346 // reconstruct gauge matrix 01347 RECONSTRUCT_GAUGE_MATRIX(4); 01348 01349 // multiply row 0 01350 spinorFloat A0_re = 0; 01351 A0_re += g00_re * a0_re; 01352 A0_re -= g00_im * a0_im; 01353 A0_re += g01_re * a1_re; 01354 A0_re -= g01_im * a1_im; 01355 A0_re += g02_re * a2_re; 01356 A0_re -= g02_im * a2_im; 01357 spinorFloat A0_im = 0; 01358 A0_im += g00_re * a0_im; 01359 A0_im += g00_im * a0_re; 01360 A0_im += g01_re * a1_im; 01361 A0_im += g01_im * a1_re; 01362 A0_im += g02_re * a2_im; 01363 A0_im += g02_im * a2_re; 01364 spinorFloat B0_re = 0; 01365 B0_re += g00_re * b0_re; 01366 B0_re -= g00_im * b0_im; 01367 B0_re += g01_re * b1_re; 01368 B0_re -= g01_im * b1_im; 01369 B0_re += g02_re * b2_re; 01370 B0_re -= g02_im * b2_im; 01371 spinorFloat B0_im = 0; 01372 B0_im += g00_re * b0_im; 01373 B0_im += g00_im * b0_re; 01374 
B0_im += g01_re * b1_im; 01375 B0_im += g01_im * b1_re; 01376 B0_im += g02_re * b2_im; 01377 B0_im += g02_im * b2_re; 01378 01379 // multiply row 1 01380 spinorFloat A1_re = 0; 01381 A1_re += g10_re * a0_re; 01382 A1_re -= g10_im * a0_im; 01383 A1_re += g11_re * a1_re; 01384 A1_re -= g11_im * a1_im; 01385 A1_re += g12_re * a2_re; 01386 A1_re -= g12_im * a2_im; 01387 spinorFloat A1_im = 0; 01388 A1_im += g10_re * a0_im; 01389 A1_im += g10_im * a0_re; 01390 A1_im += g11_re * a1_im; 01391 A1_im += g11_im * a1_re; 01392 A1_im += g12_re * a2_im; 01393 A1_im += g12_im * a2_re; 01394 spinorFloat B1_re = 0; 01395 B1_re += g10_re * b0_re; 01396 B1_re -= g10_im * b0_im; 01397 B1_re += g11_re * b1_re; 01398 B1_re -= g11_im * b1_im; 01399 B1_re += g12_re * b2_re; 01400 B1_re -= g12_im * b2_im; 01401 spinorFloat B1_im = 0; 01402 B1_im += g10_re * b0_im; 01403 B1_im += g10_im * b0_re; 01404 B1_im += g11_re * b1_im; 01405 B1_im += g11_im * b1_re; 01406 B1_im += g12_re * b2_im; 01407 B1_im += g12_im * b2_re; 01408 01409 // multiply row 2 01410 spinorFloat A2_re = 0; 01411 A2_re += g20_re * a0_re; 01412 A2_re -= g20_im * a0_im; 01413 A2_re += g21_re * a1_re; 01414 A2_re -= g21_im * a1_im; 01415 A2_re += g22_re * a2_re; 01416 A2_re -= g22_im * a2_im; 01417 spinorFloat A2_im = 0; 01418 A2_im += g20_re * a0_im; 01419 A2_im += g20_im * a0_re; 01420 A2_im += g21_re * a1_im; 01421 A2_im += g21_im * a1_re; 01422 A2_im += g22_re * a2_im; 01423 A2_im += g22_im * a2_re; 01424 spinorFloat B2_re = 0; 01425 B2_re += g20_re * b0_re; 01426 B2_re -= g20_im * b0_im; 01427 B2_re += g21_re * b1_re; 01428 B2_re -= g21_im * b1_im; 01429 B2_re += g22_re * b2_re; 01430 B2_re -= g22_im * b2_im; 01431 spinorFloat B2_im = 0; 01432 B2_im += g20_re * b0_im; 01433 B2_im += g20_im * b0_re; 01434 B2_im += g21_re * b1_im; 01435 B2_im += g21_im * b1_re; 01436 B2_im += g22_re * b2_im; 01437 B2_im += g22_im * b2_re; 01438 01439 o00_re += A0_re; 01440 o00_im += A0_im; 01441 o10_re += B0_re; 01442 o10_im += B0_im; 
01443 o20_re += A0_im; 01444 o20_im -= A0_re; 01445 o30_re -= B0_im; 01446 o30_im += B0_re; 01447 01448 o01_re += A1_re; 01449 o01_im += A1_im; 01450 o11_re += B1_re; 01451 o11_im += B1_im; 01452 o21_re += A1_im; 01453 o21_im -= A1_re; 01454 o31_re -= B1_im; 01455 o31_im += B1_re; 01456 01457 o02_re += A2_re; 01458 o02_im += A2_im; 01459 o12_re += B2_re; 01460 o12_im += B2_im; 01461 o22_re += A2_im; 01462 o22_im -= A2_re; 01463 o32_re -= B2_im; 01464 o32_im += B2_re; 01465 01466 } 01467 01468 #ifdef MULTI_GPU 01469 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01470 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01471 #endif 01472 { 01473 // Projector P2- 01474 // 1 0 -i 0 01475 // 0 1 0 i 01476 // i 0 1 0 01477 // 0 -i 0 1 01478 01479 #ifdef MULTI_GPU 01480 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01481 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01482 #else 01483 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01484 #endif 01485 01486 #ifdef MULTI_GPU 01487 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01488 #else 01489 const int ga_idx = sp_idx; 01490 #endif 01491 01492 spinorFloat a0_re, a0_im; 01493 spinorFloat a1_re, a1_im; 01494 spinorFloat a2_re, a2_im; 01495 spinorFloat b0_re, b0_im; 01496 spinorFloat b1_re, b1_im; 01497 spinorFloat b2_re, b2_im; 01498 01499 #ifdef MULTI_GPU 01500 if (kernel_type == INTERIOR_KERNEL) { 01501 #endif 01502 01503 if (threadIdx.z == 0 && blockDim.z < X3) { 01504 // read spinor from device memory 01505 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01506 01507 // project spinor into half spinors 01508 a0_re = +i00_re+i20_im; 01509 a0_im = +i00_im-i20_re; 01510 a1_re = +i01_re+i21_im; 01511 a1_im = +i01_im-i21_re; 01512 a2_re = +i02_re+i22_im; 01513 a2_im = +i02_im-i22_re; 01514 b0_re = +i10_re-i30_im; 01515 b0_im = +i10_im+i30_re; 01516 b1_re = +i11_re-i31_im; 01517 b1_im = +i11_im+i31_re; 01518 b2_re = +i12_re-i32_im; 01519 b2_im = +i12_im+i32_re; 01520 } else { 01521 // load spinor from shared memory 01522 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x; 01523 int tz = (threadIdx.z > 0) ? 
threadIdx.z - 1 : blockDim.z - 1; 01524 READ_SPINOR_SHARED(tx, threadIdx.y, tz); 01525 01526 // project spinor into half spinors 01527 a0_re = +i00_re+i20_im; 01528 a0_im = +i00_im-i20_re; 01529 a1_re = +i01_re+i21_im; 01530 a1_im = +i01_im-i21_re; 01531 a2_re = +i02_re+i22_im; 01532 a2_im = +i02_im-i22_re; 01533 b0_re = +i10_re-i30_im; 01534 b0_im = +i10_im+i30_re; 01535 b1_re = +i11_re-i31_im; 01536 b1_im = +i11_im+i31_re; 01537 b2_re = +i12_re-i32_im; 01538 b2_im = +i12_im+i32_re; 01539 } 01540 01541 #ifdef MULTI_GPU 01542 } else { 01543 01544 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01545 01546 // read half spinor from device memory 01547 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01548 01549 a0_re = i00_re; a0_im = i00_im; 01550 a1_re = i01_re; a1_im = i01_im; 01551 a2_re = i02_re; a2_im = i02_im; 01552 b0_re = i10_re; b0_im = i10_im; 01553 b1_re = i11_re; b1_im = i11_im; 01554 b2_re = i12_re; b2_im = i12_im; 01555 01556 } 01557 #endif // MULTI_GPU 01558 01559 // read gauge matrix from device memory 01560 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01561 01562 // reconstruct gauge matrix 01563 RECONSTRUCT_GAUGE_MATRIX(5); 01564 01565 // multiply row 0 01566 spinorFloat A0_re = 0; 01567 A0_re += gT00_re * a0_re; 01568 A0_re -= gT00_im * a0_im; 01569 A0_re += gT01_re * a1_re; 01570 A0_re -= gT01_im * a1_im; 01571 A0_re += gT02_re * a2_re; 01572 A0_re -= gT02_im * a2_im; 01573 spinorFloat A0_im = 0; 01574 A0_im += gT00_re * a0_im; 01575 A0_im += gT00_im * a0_re; 01576 A0_im += gT01_re * a1_im; 01577 A0_im += gT01_im * a1_re; 01578 A0_im += gT02_re * a2_im; 01579 A0_im += gT02_im * a2_re; 01580 spinorFloat B0_re = 0; 01581 B0_re += gT00_re * b0_re; 01582 B0_re -= gT00_im * b0_im; 01583 B0_re += gT01_re * b1_re; 01584 B0_re -= gT01_im * b1_im; 01585 B0_re += gT02_re * b2_re; 01586 B0_re -= gT02_im * b2_im; 01587 spinorFloat B0_im = 0; 01588 B0_im += gT00_re * b0_im; 01589 B0_im += gT00_im * b0_re; 01590 
B0_im += gT01_re * b1_im; 01591 B0_im += gT01_im * b1_re; 01592 B0_im += gT02_re * b2_im; 01593 B0_im += gT02_im * b2_re; 01594 01595 // multiply row 1 01596 spinorFloat A1_re = 0; 01597 A1_re += gT10_re * a0_re; 01598 A1_re -= gT10_im * a0_im; 01599 A1_re += gT11_re * a1_re; 01600 A1_re -= gT11_im * a1_im; 01601 A1_re += gT12_re * a2_re; 01602 A1_re -= gT12_im * a2_im; 01603 spinorFloat A1_im = 0; 01604 A1_im += gT10_re * a0_im; 01605 A1_im += gT10_im * a0_re; 01606 A1_im += gT11_re * a1_im; 01607 A1_im += gT11_im * a1_re; 01608 A1_im += gT12_re * a2_im; 01609 A1_im += gT12_im * a2_re; 01610 spinorFloat B1_re = 0; 01611 B1_re += gT10_re * b0_re; 01612 B1_re -= gT10_im * b0_im; 01613 B1_re += gT11_re * b1_re; 01614 B1_re -= gT11_im * b1_im; 01615 B1_re += gT12_re * b2_re; 01616 B1_re -= gT12_im * b2_im; 01617 spinorFloat B1_im = 0; 01618 B1_im += gT10_re * b0_im; 01619 B1_im += gT10_im * b0_re; 01620 B1_im += gT11_re * b1_im; 01621 B1_im += gT11_im * b1_re; 01622 B1_im += gT12_re * b2_im; 01623 B1_im += gT12_im * b2_re; 01624 01625 // multiply row 2 01626 spinorFloat A2_re = 0; 01627 A2_re += gT20_re * a0_re; 01628 A2_re -= gT20_im * a0_im; 01629 A2_re += gT21_re * a1_re; 01630 A2_re -= gT21_im * a1_im; 01631 A2_re += gT22_re * a2_re; 01632 A2_re -= gT22_im * a2_im; 01633 spinorFloat A2_im = 0; 01634 A2_im += gT20_re * a0_im; 01635 A2_im += gT20_im * a0_re; 01636 A2_im += gT21_re * a1_im; 01637 A2_im += gT21_im * a1_re; 01638 A2_im += gT22_re * a2_im; 01639 A2_im += gT22_im * a2_re; 01640 spinorFloat B2_re = 0; 01641 B2_re += gT20_re * b0_re; 01642 B2_re -= gT20_im * b0_im; 01643 B2_re += gT21_re * b1_re; 01644 B2_re -= gT21_im * b1_im; 01645 B2_re += gT22_re * b2_re; 01646 B2_re -= gT22_im * b2_im; 01647 spinorFloat B2_im = 0; 01648 B2_im += gT20_re * b0_im; 01649 B2_im += gT20_im * b0_re; 01650 B2_im += gT21_re * b1_im; 01651 B2_im += gT21_im * b1_re; 01652 B2_im += gT22_re * b2_im; 01653 B2_im += gT22_im * b2_re; 01654 01655 o00_re += A0_re; 01656 o00_im += 
A0_im; 01657 o10_re += B0_re; 01658 o10_im += B0_im; 01659 o20_re -= A0_im; 01660 o20_im += A0_re; 01661 o30_re += B0_im; 01662 o30_im -= B0_re; 01663 01664 o01_re += A1_re; 01665 o01_im += A1_im; 01666 o11_re += B1_re; 01667 o11_im += B1_im; 01668 o21_re -= A1_im; 01669 o21_im += A1_re; 01670 o31_re += B1_im; 01671 o31_im -= B1_re; 01672 01673 o02_re += A2_re; 01674 o02_im += A2_im; 01675 o12_re += B2_re; 01676 o12_im += B2_im; 01677 o22_re -= A2_im; 01678 o22_im += A2_re; 01679 o32_re += B2_im; 01680 o32_im -= B2_re; 01681 01682 } 01683 01684 #ifdef MULTI_GPU 01685 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01686 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01687 #endif 01688 { 01689 // Projector P3+ 01690 // 2 0 0 0 01691 // 0 2 0 0 01692 // 0 0 0 0 01693 // 0 0 0 0 01694 01695 #ifdef MULTI_GPU 01696 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01697 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01698 #else 01699 const int sp_idx = (x4==X4m1 ? 
X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01700 #endif 01701 01702 const int ga_idx = sid; 01703 01704 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01705 { 01706 spinorFloat a0_re, a0_im; 01707 spinorFloat a1_re, a1_im; 01708 spinorFloat a2_re, a2_im; 01709 spinorFloat b0_re, b0_im; 01710 spinorFloat b1_re, b1_im; 01711 spinorFloat b2_re, b2_im; 01712 01713 #ifdef MULTI_GPU 01714 if (kernel_type == INTERIOR_KERNEL) { 01715 #endif 01716 01717 // read spinor from device memory 01718 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01719 01720 // project spinor into half spinors 01721 a0_re = +2*i00_re; 01722 a0_im = +2*i00_im; 01723 a1_re = +2*i01_re; 01724 a1_im = +2*i01_im; 01725 a2_re = +2*i02_re; 01726 a2_im = +2*i02_im; 01727 b0_re = +2*i10_re; 01728 b0_im = +2*i10_im; 01729 b1_re = +2*i11_re; 01730 b1_im = +2*i11_im; 01731 b2_re = +2*i12_re; 01732 b2_im = +2*i12_im; 01733 01734 #ifdef MULTI_GPU 01735 } else { 01736 01737 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01738 const int t_proj_scale = TPROJSCALE; 01739 01740 // read half spinor from device memory 01741 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01742 01743 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01744 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01745 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01746 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01747 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01748 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01749 01750 } 01751 #endif // MULTI_GPU 01752 01753 // identity gauge matrix 01754 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01755 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01756 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01757 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01758 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01759 spinorFloat B2_re = 
b2_re; spinorFloat B2_im = b2_im; 01760 01761 o00_re += A0_re; 01762 o00_im += A0_im; 01763 o10_re += B0_re; 01764 o10_im += B0_im; 01765 01766 o01_re += A1_re; 01767 o01_im += A1_im; 01768 o11_re += B1_re; 01769 o11_im += B1_im; 01770 01771 o02_re += A2_re; 01772 o02_im += A2_im; 01773 o12_re += B2_re; 01774 o12_im += B2_im; 01775 01776 } else { 01777 spinorFloat a0_re, a0_im; 01778 spinorFloat a1_re, a1_im; 01779 spinorFloat a2_re, a2_im; 01780 spinorFloat b0_re, b0_im; 01781 spinorFloat b1_re, b1_im; 01782 spinorFloat b2_re, b2_im; 01783 01784 #ifdef MULTI_GPU 01785 if (kernel_type == INTERIOR_KERNEL) { 01786 #endif 01787 01788 // read spinor from device memory 01789 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01790 01791 // project spinor into half spinors 01792 a0_re = +2*i00_re; 01793 a0_im = +2*i00_im; 01794 a1_re = +2*i01_re; 01795 a1_im = +2*i01_im; 01796 a2_re = +2*i02_re; 01797 a2_im = +2*i02_im; 01798 b0_re = +2*i10_re; 01799 b0_im = +2*i10_im; 01800 b1_re = +2*i11_re; 01801 b1_im = +2*i11_im; 01802 b2_re = +2*i12_re; 01803 b2_im = +2*i12_im; 01804 01805 #ifdef MULTI_GPU 01806 } else { 01807 01808 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01809 const int t_proj_scale = TPROJSCALE; 01810 01811 // read half spinor from device memory 01812 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01813 01814 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01815 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01816 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01817 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01818 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01819 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01820 01821 } 01822 #endif // MULTI_GPU 01823 01824 // read gauge matrix from device memory 01825 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01826 01827 // reconstruct gauge matrix 01828 
RECONSTRUCT_GAUGE_MATRIX(6); 01829 01830 // multiply row 0 01831 spinorFloat A0_re = 0; 01832 A0_re += g00_re * a0_re; 01833 A0_re -= g00_im * a0_im; 01834 A0_re += g01_re * a1_re; 01835 A0_re -= g01_im * a1_im; 01836 A0_re += g02_re * a2_re; 01837 A0_re -= g02_im * a2_im; 01838 spinorFloat A0_im = 0; 01839 A0_im += g00_re * a0_im; 01840 A0_im += g00_im * a0_re; 01841 A0_im += g01_re * a1_im; 01842 A0_im += g01_im * a1_re; 01843 A0_im += g02_re * a2_im; 01844 A0_im += g02_im * a2_re; 01845 spinorFloat B0_re = 0; 01846 B0_re += g00_re * b0_re; 01847 B0_re -= g00_im * b0_im; 01848 B0_re += g01_re * b1_re; 01849 B0_re -= g01_im * b1_im; 01850 B0_re += g02_re * b2_re; 01851 B0_re -= g02_im * b2_im; 01852 spinorFloat B0_im = 0; 01853 B0_im += g00_re * b0_im; 01854 B0_im += g00_im * b0_re; 01855 B0_im += g01_re * b1_im; 01856 B0_im += g01_im * b1_re; 01857 B0_im += g02_re * b2_im; 01858 B0_im += g02_im * b2_re; 01859 01860 // multiply row 1 01861 spinorFloat A1_re = 0; 01862 A1_re += g10_re * a0_re; 01863 A1_re -= g10_im * a0_im; 01864 A1_re += g11_re * a1_re; 01865 A1_re -= g11_im * a1_im; 01866 A1_re += g12_re * a2_re; 01867 A1_re -= g12_im * a2_im; 01868 spinorFloat A1_im = 0; 01869 A1_im += g10_re * a0_im; 01870 A1_im += g10_im * a0_re; 01871 A1_im += g11_re * a1_im; 01872 A1_im += g11_im * a1_re; 01873 A1_im += g12_re * a2_im; 01874 A1_im += g12_im * a2_re; 01875 spinorFloat B1_re = 0; 01876 B1_re += g10_re * b0_re; 01877 B1_re -= g10_im * b0_im; 01878 B1_re += g11_re * b1_re; 01879 B1_re -= g11_im * b1_im; 01880 B1_re += g12_re * b2_re; 01881 B1_re -= g12_im * b2_im; 01882 spinorFloat B1_im = 0; 01883 B1_im += g10_re * b0_im; 01884 B1_im += g10_im * b0_re; 01885 B1_im += g11_re * b1_im; 01886 B1_im += g11_im * b1_re; 01887 B1_im += g12_re * b2_im; 01888 B1_im += g12_im * b2_re; 01889 01890 // multiply row 2 01891 spinorFloat A2_re = 0; 01892 A2_re += g20_re * a0_re; 01893 A2_re -= g20_im * a0_im; 01894 A2_re += g21_re * a1_re; 01895 A2_re -= g21_im * a1_im; 01896 
A2_re += g22_re * a2_re; 01897 A2_re -= g22_im * a2_im; 01898 spinorFloat A2_im = 0; 01899 A2_im += g20_re * a0_im; 01900 A2_im += g20_im * a0_re; 01901 A2_im += g21_re * a1_im; 01902 A2_im += g21_im * a1_re; 01903 A2_im += g22_re * a2_im; 01904 A2_im += g22_im * a2_re; 01905 spinorFloat B2_re = 0; 01906 B2_re += g20_re * b0_re; 01907 B2_re -= g20_im * b0_im; 01908 B2_re += g21_re * b1_re; 01909 B2_re -= g21_im * b1_im; 01910 B2_re += g22_re * b2_re; 01911 B2_re -= g22_im * b2_im; 01912 spinorFloat B2_im = 0; 01913 B2_im += g20_re * b0_im; 01914 B2_im += g20_im * b0_re; 01915 B2_im += g21_re * b1_im; 01916 B2_im += g21_im * b1_re; 01917 B2_im += g22_re * b2_im; 01918 B2_im += g22_im * b2_re; 01919 01920 o00_re += A0_re; 01921 o00_im += A0_im; 01922 o10_re += B0_re; 01923 o10_im += B0_im; 01924 01925 o01_re += A1_re; 01926 o01_im += A1_im; 01927 o11_re += B1_re; 01928 o11_im += B1_im; 01929 01930 o02_re += A2_re; 01931 o02_im += A2_im; 01932 o12_re += B2_re; 01933 o12_im += B2_im; 01934 01935 } 01936 } 01937 01938 #ifdef MULTI_GPU 01939 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) || 01940 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01941 #endif 01942 { 01943 // Projector P3- 01944 // 0 0 0 0 01945 // 0 0 0 0 01946 // 0 0 2 0 01947 // 0 0 0 2 01948 01949 #ifdef MULTI_GPU 01950 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01951 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01952 #else 01953 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01954 #endif 01955 01956 #ifdef MULTI_GPU 01957 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01958 #else 01959 const int ga_idx = sp_idx; 01960 #endif 01961 01962 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01963 { 01964 spinorFloat a0_re, a0_im; 01965 spinorFloat a1_re, a1_im; 01966 spinorFloat a2_re, a2_im; 01967 spinorFloat b0_re, b0_im; 01968 spinorFloat b1_re, b1_im; 01969 spinorFloat b2_re, b2_im; 01970 01971 #ifdef MULTI_GPU 01972 if (kernel_type == INTERIOR_KERNEL) { 01973 #endif 01974 01975 // read spinor from device memory 01976 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01977 01978 // project spinor into half spinors 01979 a0_re = +2*i20_re; 01980 a0_im = +2*i20_im; 01981 a1_re = +2*i21_re; 01982 a1_im = +2*i21_im; 01983 a2_re = +2*i22_re; 01984 a2_im = +2*i22_im; 01985 b0_re = +2*i30_re; 01986 b0_im = +2*i30_im; 01987 b1_re = +2*i31_re; 01988 b1_im = +2*i31_im; 01989 b2_re = +2*i32_re; 01990 b2_im = +2*i32_im; 01991 01992 #ifdef MULTI_GPU 01993 } else { 01994 01995 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01996 const int t_proj_scale = TPROJSCALE; 01997 01998 // read half spinor from device memory 01999 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 02000 02001 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 02002 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 02003 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 02004 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 02005 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 02006 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 02007 02008 } 02009 #endif // MULTI_GPU 02010 02011 // identity gauge matrix 02012 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 02013 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 02014 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 02015 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 02016 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 02017 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 
02018 02019 o20_re += A0_re; 02020 o20_im += A0_im; 02021 o30_re += B0_re; 02022 o30_im += B0_im; 02023 02024 o21_re += A1_re; 02025 o21_im += A1_im; 02026 o31_re += B1_re; 02027 o31_im += B1_im; 02028 02029 o22_re += A2_re; 02030 o22_im += A2_im; 02031 o32_re += B2_re; 02032 o32_im += B2_im; 02033 02034 } else { 02035 spinorFloat a0_re, a0_im; 02036 spinorFloat a1_re, a1_im; 02037 spinorFloat a2_re, a2_im; 02038 spinorFloat b0_re, b0_im; 02039 spinorFloat b1_re, b1_im; 02040 spinorFloat b2_re, b2_im; 02041 02042 #ifdef MULTI_GPU 02043 if (kernel_type == INTERIOR_KERNEL) { 02044 #endif 02045 02046 // read spinor from device memory 02047 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 02048 02049 // project spinor into half spinors 02050 a0_re = +2*i20_re; 02051 a0_im = +2*i20_im; 02052 a1_re = +2*i21_re; 02053 a1_im = +2*i21_im; 02054 a2_re = +2*i22_re; 02055 a2_im = +2*i22_im; 02056 b0_re = +2*i30_re; 02057 b0_im = +2*i30_im; 02058 b1_re = +2*i31_re; 02059 b1_im = +2*i31_im; 02060 b2_re = +2*i32_re; 02061 b2_im = +2*i32_im; 02062 02063 #ifdef MULTI_GPU 02064 } else { 02065 02066 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 02067 const int t_proj_scale = TPROJSCALE; 02068 02069 // read half spinor from device memory 02070 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 02071 02072 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 02073 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 02074 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 02075 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 02076 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 02077 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 02078 02079 } 02080 #endif // MULTI_GPU 02081 02082 // read gauge matrix from device memory 02083 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 02084 02085 // reconstruct gauge matrix 02086 RECONSTRUCT_GAUGE_MATRIX(7); 02087 02088 // multiply row 0 02089 
spinorFloat A0_re = 0; 02090 A0_re += gT00_re * a0_re; 02091 A0_re -= gT00_im * a0_im; 02092 A0_re += gT01_re * a1_re; 02093 A0_re -= gT01_im * a1_im; 02094 A0_re += gT02_re * a2_re; 02095 A0_re -= gT02_im * a2_im; 02096 spinorFloat A0_im = 0; 02097 A0_im += gT00_re * a0_im; 02098 A0_im += gT00_im * a0_re; 02099 A0_im += gT01_re * a1_im; 02100 A0_im += gT01_im * a1_re; 02101 A0_im += gT02_re * a2_im; 02102 A0_im += gT02_im * a2_re; 02103 spinorFloat B0_re = 0; 02104 B0_re += gT00_re * b0_re; 02105 B0_re -= gT00_im * b0_im; 02106 B0_re += gT01_re * b1_re; 02107 B0_re -= gT01_im * b1_im; 02108 B0_re += gT02_re * b2_re; 02109 B0_re -= gT02_im * b2_im; 02110 spinorFloat B0_im = 0; 02111 B0_im += gT00_re * b0_im; 02112 B0_im += gT00_im * b0_re; 02113 B0_im += gT01_re * b1_im; 02114 B0_im += gT01_im * b1_re; 02115 B0_im += gT02_re * b2_im; 02116 B0_im += gT02_im * b2_re; 02117 02118 // multiply row 1 02119 spinorFloat A1_re = 0; 02120 A1_re += gT10_re * a0_re; 02121 A1_re -= gT10_im * a0_im; 02122 A1_re += gT11_re * a1_re; 02123 A1_re -= gT11_im * a1_im; 02124 A1_re += gT12_re * a2_re; 02125 A1_re -= gT12_im * a2_im; 02126 spinorFloat A1_im = 0; 02127 A1_im += gT10_re * a0_im; 02128 A1_im += gT10_im * a0_re; 02129 A1_im += gT11_re * a1_im; 02130 A1_im += gT11_im * a1_re; 02131 A1_im += gT12_re * a2_im; 02132 A1_im += gT12_im * a2_re; 02133 spinorFloat B1_re = 0; 02134 B1_re += gT10_re * b0_re; 02135 B1_re -= gT10_im * b0_im; 02136 B1_re += gT11_re * b1_re; 02137 B1_re -= gT11_im * b1_im; 02138 B1_re += gT12_re * b2_re; 02139 B1_re -= gT12_im * b2_im; 02140 spinorFloat B1_im = 0; 02141 B1_im += gT10_re * b0_im; 02142 B1_im += gT10_im * b0_re; 02143 B1_im += gT11_re * b1_im; 02144 B1_im += gT11_im * b1_re; 02145 B1_im += gT12_re * b2_im; 02146 B1_im += gT12_im * b2_re; 02147 02148 // multiply row 2 02149 spinorFloat A2_re = 0; 02150 A2_re += gT20_re * a0_re; 02151 A2_re -= gT20_im * a0_im; 02152 A2_re += gT21_re * a1_re; 02153 A2_re -= gT21_im * a1_im; 02154 A2_re += 
gT22_re * a2_re; 02155 A2_re -= gT22_im * a2_im; 02156 spinorFloat A2_im = 0; 02157 A2_im += gT20_re * a0_im; 02158 A2_im += gT20_im * a0_re; 02159 A2_im += gT21_re * a1_im; 02160 A2_im += gT21_im * a1_re; 02161 A2_im += gT22_re * a2_im; 02162 A2_im += gT22_im * a2_re; 02163 spinorFloat B2_re = 0; 02164 B2_re += gT20_re * b0_re; 02165 B2_re -= gT20_im * b0_im; 02166 B2_re += gT21_re * b1_re; 02167 B2_re -= gT21_im * b1_im; 02168 B2_re += gT22_re * b2_re; 02169 B2_re -= gT22_im * b2_im; 02170 spinorFloat B2_im = 0; 02171 B2_im += gT20_re * b0_im; 02172 B2_im += gT20_im * b0_re; 02173 B2_im += gT21_re * b1_im; 02174 B2_im += gT21_im * b1_re; 02175 B2_im += gT22_re * b2_im; 02176 B2_im += gT22_im * b2_re; 02177 02178 o20_re += A0_re; 02179 o20_im += A0_im; 02180 o30_re += B0_re; 02181 o30_im += B0_im; 02182 02183 o21_re += A1_re; 02184 o21_im += A1_im; 02185 o31_re += B1_re; 02186 o31_im += B1_im; 02187 02188 o22_re += A2_re; 02189 o22_im += A2_im; 02190 o32_re += B2_re; 02191 o32_im += B2_im; 02192 02193 } 02194 } 02195 02196 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER) 02197 02198 int incomplete = 0; // Have all 8 contributions been computed for this site? 
02199 02200 switch(kernel_type) { // intentional fall-through 02201 case INTERIOR_KERNEL: 02202 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1)); 02203 case EXTERIOR_KERNEL_T: 02204 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1)); 02205 case EXTERIOR_KERNEL_Z: 02206 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1)); 02207 case EXTERIOR_KERNEL_Y: 02208 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1)); 02209 } 02210 02211 if (!incomplete) 02212 #endif // MULTI_GPU 02213 { 02214 02215 #ifdef DSLASH_CLOVER 02216 02217 // change to chiral basis 02218 { 02219 spinorFloat a00_re = -o10_re - o30_re; 02220 spinorFloat a00_im = -o10_im - o30_im; 02221 spinorFloat a10_re = o00_re + o20_re; 02222 spinorFloat a10_im = o00_im + o20_im; 02223 spinorFloat a20_re = -o10_re + o30_re; 02224 spinorFloat a20_im = -o10_im + o30_im; 02225 spinorFloat a30_re = o00_re - o20_re; 02226 spinorFloat a30_im = o00_im - o20_im; 02227 02228 o00_re = a00_re; o00_im = a00_im; 02229 o10_re = a10_re; o10_im = a10_im; 02230 o20_re = a20_re; o20_im = a20_im; 02231 o30_re = a30_re; o30_im = a30_im; 02232 } 02233 02234 { 02235 spinorFloat a01_re = -o11_re - o31_re; 02236 spinorFloat a01_im = -o11_im - o31_im; 02237 spinorFloat a11_re = o01_re + o21_re; 02238 spinorFloat a11_im = o01_im + o21_im; 02239 spinorFloat a21_re = -o11_re + o31_re; 02240 spinorFloat a21_im = -o11_im + o31_im; 02241 spinorFloat a31_re = o01_re - o21_re; 02242 spinorFloat a31_im = o01_im - o21_im; 02243 02244 o01_re = a01_re; o01_im = a01_im; 02245 o11_re = a11_re; o11_im = a11_im; 02246 o21_re = a21_re; o21_im = a21_im; 02247 o31_re = a31_re; o31_im = a31_im; 02248 } 02249 02250 { 02251 spinorFloat a02_re = -o12_re - o32_re; 02252 spinorFloat a02_im = -o12_im - o32_im; 02253 spinorFloat a12_re = o02_re + o22_re; 02254 spinorFloat a12_im = o02_im + o22_im; 02255 spinorFloat a22_re = -o12_re + o32_re; 02256 spinorFloat a22_im = -o12_im + o32_im; 02257 
spinorFloat a32_re = o02_re - o22_re; 02258 spinorFloat a32_im = o02_im - o22_im; 02259 02260 o02_re = a02_re; o02_im = a02_im; 02261 o12_re = a12_re; o12_im = a12_im; 02262 o22_re = a22_re; o22_im = a22_im; 02263 o32_re = a32_re; o32_im = a32_im; 02264 } 02265 02266 // apply first chiral block 02267 { 02268 READ_CLOVER(CLOVERTEX, 0) 02269 02270 spinorFloat a00_re = 0; spinorFloat a00_im = 0; 02271 spinorFloat a01_re = 0; spinorFloat a01_im = 0; 02272 spinorFloat a02_re = 0; spinorFloat a02_im = 0; 02273 spinorFloat a10_re = 0; spinorFloat a10_im = 0; 02274 spinorFloat a11_re = 0; spinorFloat a11_im = 0; 02275 spinorFloat a12_re = 0; spinorFloat a12_im = 0; 02276 02277 a00_re += c00_00_re * o00_re; 02278 a00_im += c00_00_re * o00_im; 02279 a00_re += c00_01_re * o01_re; 02280 a00_re -= c00_01_im * o01_im; 02281 a00_im += c00_01_re * o01_im; 02282 a00_im += c00_01_im * o01_re; 02283 a00_re += c00_02_re * o02_re; 02284 a00_re -= c00_02_im * o02_im; 02285 a00_im += c00_02_re * o02_im; 02286 a00_im += c00_02_im * o02_re; 02287 a00_re += c00_10_re * o10_re; 02288 a00_re -= c00_10_im * o10_im; 02289 a00_im += c00_10_re * o10_im; 02290 a00_im += c00_10_im * o10_re; 02291 a00_re += c00_11_re * o11_re; 02292 a00_re -= c00_11_im * o11_im; 02293 a00_im += c00_11_re * o11_im; 02294 a00_im += c00_11_im * o11_re; 02295 a00_re += c00_12_re * o12_re; 02296 a00_re -= c00_12_im * o12_im; 02297 a00_im += c00_12_re * o12_im; 02298 a00_im += c00_12_im * o12_re; 02299 02300 a01_re += c01_00_re * o00_re; 02301 a01_re -= c01_00_im * o00_im; 02302 a01_im += c01_00_re * o00_im; 02303 a01_im += c01_00_im * o00_re; 02304 a01_re += c01_01_re * o01_re; 02305 a01_im += c01_01_re * o01_im; 02306 a01_re += c01_02_re * o02_re; 02307 a01_re -= c01_02_im * o02_im; 02308 a01_im += c01_02_re * o02_im; 02309 a01_im += c01_02_im * o02_re; 02310 a01_re += c01_10_re * o10_re; 02311 a01_re -= c01_10_im * o10_im; 02312 a01_im += c01_10_re * o10_im; 02313 a01_im += c01_10_im * o10_re; 02314 a01_re += c01_11_re 
* o11_re; 02315 a01_re -= c01_11_im * o11_im; 02316 a01_im += c01_11_re * o11_im; 02317 a01_im += c01_11_im * o11_re; 02318 a01_re += c01_12_re * o12_re; 02319 a01_re -= c01_12_im * o12_im; 02320 a01_im += c01_12_re * o12_im; 02321 a01_im += c01_12_im * o12_re; 02322 02323 a02_re += c02_00_re * o00_re; 02324 a02_re -= c02_00_im * o00_im; 02325 a02_im += c02_00_re * o00_im; 02326 a02_im += c02_00_im * o00_re; 02327 a02_re += c02_01_re * o01_re; 02328 a02_re -= c02_01_im * o01_im; 02329 a02_im += c02_01_re * o01_im; 02330 a02_im += c02_01_im * o01_re; 02331 a02_re += c02_02_re * o02_re; 02332 a02_im += c02_02_re * o02_im; 02333 a02_re += c02_10_re * o10_re; 02334 a02_re -= c02_10_im * o10_im; 02335 a02_im += c02_10_re * o10_im; 02336 a02_im += c02_10_im * o10_re; 02337 a02_re += c02_11_re * o11_re; 02338 a02_re -= c02_11_im * o11_im; 02339 a02_im += c02_11_re * o11_im; 02340 a02_im += c02_11_im * o11_re; 02341 a02_re += c02_12_re * o12_re; 02342 a02_re -= c02_12_im * o12_im; 02343 a02_im += c02_12_re * o12_im; 02344 a02_im += c02_12_im * o12_re; 02345 02346 a10_re += c10_00_re * o00_re; 02347 a10_re -= c10_00_im * o00_im; 02348 a10_im += c10_00_re * o00_im; 02349 a10_im += c10_00_im * o00_re; 02350 a10_re += c10_01_re * o01_re; 02351 a10_re -= c10_01_im * o01_im; 02352 a10_im += c10_01_re * o01_im; 02353 a10_im += c10_01_im * o01_re; 02354 a10_re += c10_02_re * o02_re; 02355 a10_re -= c10_02_im * o02_im; 02356 a10_im += c10_02_re * o02_im; 02357 a10_im += c10_02_im * o02_re; 02358 a10_re += c10_10_re * o10_re; 02359 a10_im += c10_10_re * o10_im; 02360 a10_re += c10_11_re * o11_re; 02361 a10_re -= c10_11_im * o11_im; 02362 a10_im += c10_11_re * o11_im; 02363 a10_im += c10_11_im * o11_re; 02364 a10_re += c10_12_re * o12_re; 02365 a10_re -= c10_12_im * o12_im; 02366 a10_im += c10_12_re * o12_im; 02367 a10_im += c10_12_im * o12_re; 02368 02369 a11_re += c11_00_re * o00_re; 02370 a11_re -= c11_00_im * o00_im; 02371 a11_im += c11_00_re * o00_im; 02372 a11_im += c11_00_im * 
o00_re; 02373 a11_re += c11_01_re * o01_re; 02374 a11_re -= c11_01_im * o01_im; 02375 a11_im += c11_01_re * o01_im; 02376 a11_im += c11_01_im * o01_re; 02377 a11_re += c11_02_re * o02_re; 02378 a11_re -= c11_02_im * o02_im; 02379 a11_im += c11_02_re * o02_im; 02380 a11_im += c11_02_im * o02_re; 02381 a11_re += c11_10_re * o10_re; 02382 a11_re -= c11_10_im * o10_im; 02383 a11_im += c11_10_re * o10_im; 02384 a11_im += c11_10_im * o10_re; 02385 a11_re += c11_11_re * o11_re; 02386 a11_im += c11_11_re * o11_im; 02387 a11_re += c11_12_re * o12_re; 02388 a11_re -= c11_12_im * o12_im; 02389 a11_im += c11_12_re * o12_im; 02390 a11_im += c11_12_im * o12_re; 02391 02392 a12_re += c12_00_re * o00_re; 02393 a12_re -= c12_00_im * o00_im; 02394 a12_im += c12_00_re * o00_im; 02395 a12_im += c12_00_im * o00_re; 02396 a12_re += c12_01_re * o01_re; 02397 a12_re -= c12_01_im * o01_im; 02398 a12_im += c12_01_re * o01_im; 02399 a12_im += c12_01_im * o01_re; 02400 a12_re += c12_02_re * o02_re; 02401 a12_re -= c12_02_im * o02_im; 02402 a12_im += c12_02_re * o02_im; 02403 a12_im += c12_02_im * o02_re; 02404 a12_re += c12_10_re * o10_re; 02405 a12_re -= c12_10_im * o10_im; 02406 a12_im += c12_10_re * o10_im; 02407 a12_im += c12_10_im * o10_re; 02408 a12_re += c12_11_re * o11_re; 02409 a12_re -= c12_11_im * o11_im; 02410 a12_im += c12_11_re * o11_im; 02411 a12_im += c12_11_im * o11_re; 02412 a12_re += c12_12_re * o12_re; 02413 a12_im += c12_12_re * o12_im; 02414 02415 o00_re = a00_re; o00_im = a00_im; 02416 o01_re = a01_re; o01_im = a01_im; 02417 o02_re = a02_re; o02_im = a02_im; 02418 o10_re = a10_re; o10_im = a10_im; 02419 o11_re = a11_re; o11_im = a11_im; 02420 o12_re = a12_re; o12_im = a12_im; 02421 02422 } 02423 02424 // apply second chiral block 02425 { 02426 READ_CLOVER(CLOVERTEX, 1) 02427 02428 spinorFloat a20_re = 0; spinorFloat a20_im = 0; 02429 spinorFloat a21_re = 0; spinorFloat a21_im = 0; 02430 spinorFloat a22_re = 0; spinorFloat a22_im = 0; 02431 spinorFloat a30_re = 0; 
spinorFloat a30_im = 0; 02432 spinorFloat a31_re = 0; spinorFloat a31_im = 0; 02433 spinorFloat a32_re = 0; spinorFloat a32_im = 0; 02434 02435 a20_re += c20_20_re * o20_re; 02436 a20_im += c20_20_re * o20_im; 02437 a20_re += c20_21_re * o21_re; 02438 a20_re -= c20_21_im * o21_im; 02439 a20_im += c20_21_re * o21_im; 02440 a20_im += c20_21_im * o21_re; 02441 a20_re += c20_22_re * o22_re; 02442 a20_re -= c20_22_im * o22_im; 02443 a20_im += c20_22_re * o22_im; 02444 a20_im += c20_22_im * o22_re; 02445 a20_re += c20_30_re * o30_re; 02446 a20_re -= c20_30_im * o30_im; 02447 a20_im += c20_30_re * o30_im; 02448 a20_im += c20_30_im * o30_re; 02449 a20_re += c20_31_re * o31_re; 02450 a20_re -= c20_31_im * o31_im; 02451 a20_im += c20_31_re * o31_im; 02452 a20_im += c20_31_im * o31_re; 02453 a20_re += c20_32_re * o32_re; 02454 a20_re -= c20_32_im * o32_im; 02455 a20_im += c20_32_re * o32_im; 02456 a20_im += c20_32_im * o32_re; 02457 02458 a21_re += c21_20_re * o20_re; 02459 a21_re -= c21_20_im * o20_im; 02460 a21_im += c21_20_re * o20_im; 02461 a21_im += c21_20_im * o20_re; 02462 a21_re += c21_21_re * o21_re; 02463 a21_im += c21_21_re * o21_im; 02464 a21_re += c21_22_re * o22_re; 02465 a21_re -= c21_22_im * o22_im; 02466 a21_im += c21_22_re * o22_im; 02467 a21_im += c21_22_im * o22_re; 02468 a21_re += c21_30_re * o30_re; 02469 a21_re -= c21_30_im * o30_im; 02470 a21_im += c21_30_re * o30_im; 02471 a21_im += c21_30_im * o30_re; 02472 a21_re += c21_31_re * o31_re; 02473 a21_re -= c21_31_im * o31_im; 02474 a21_im += c21_31_re * o31_im; 02475 a21_im += c21_31_im * o31_re; 02476 a21_re += c21_32_re * o32_re; 02477 a21_re -= c21_32_im * o32_im; 02478 a21_im += c21_32_re * o32_im; 02479 a21_im += c21_32_im * o32_re; 02480 02481 a22_re += c22_20_re * o20_re; 02482 a22_re -= c22_20_im * o20_im; 02483 a22_im += c22_20_re * o20_im; 02484 a22_im += c22_20_im * o20_re; 02485 a22_re += c22_21_re * o21_re; 02486 a22_re -= c22_21_im * o21_im; 02487 a22_im += c22_21_re * o21_im; 02488 a22_im 
+= c22_21_im * o21_re; 02489 a22_re += c22_22_re * o22_re; 02490 a22_im += c22_22_re * o22_im; 02491 a22_re += c22_30_re * o30_re; 02492 a22_re -= c22_30_im * o30_im; 02493 a22_im += c22_30_re * o30_im; 02494 a22_im += c22_30_im * o30_re; 02495 a22_re += c22_31_re * o31_re; 02496 a22_re -= c22_31_im * o31_im; 02497 a22_im += c22_31_re * o31_im; 02498 a22_im += c22_31_im * o31_re; 02499 a22_re += c22_32_re * o32_re; 02500 a22_re -= c22_32_im * o32_im; 02501 a22_im += c22_32_re * o32_im; 02502 a22_im += c22_32_im * o32_re; 02503 02504 a30_re += c30_20_re * o20_re; 02505 a30_re -= c30_20_im * o20_im; 02506 a30_im += c30_20_re * o20_im; 02507 a30_im += c30_20_im * o20_re; 02508 a30_re += c30_21_re * o21_re; 02509 a30_re -= c30_21_im * o21_im; 02510 a30_im += c30_21_re * o21_im; 02511 a30_im += c30_21_im * o21_re; 02512 a30_re += c30_22_re * o22_re; 02513 a30_re -= c30_22_im * o22_im; 02514 a30_im += c30_22_re * o22_im; 02515 a30_im += c30_22_im * o22_re; 02516 a30_re += c30_30_re * o30_re; 02517 a30_im += c30_30_re * o30_im; 02518 a30_re += c30_31_re * o31_re; 02519 a30_re -= c30_31_im * o31_im; 02520 a30_im += c30_31_re * o31_im; 02521 a30_im += c30_31_im * o31_re; 02522 a30_re += c30_32_re * o32_re; 02523 a30_re -= c30_32_im * o32_im; 02524 a30_im += c30_32_re * o32_im; 02525 a30_im += c30_32_im * o32_re; 02526 02527 a31_re += c31_20_re * o20_re; 02528 a31_re -= c31_20_im * o20_im; 02529 a31_im += c31_20_re * o20_im; 02530 a31_im += c31_20_im * o20_re; 02531 a31_re += c31_21_re * o21_re; 02532 a31_re -= c31_21_im * o21_im; 02533 a31_im += c31_21_re * o21_im; 02534 a31_im += c31_21_im * o21_re; 02535 a31_re += c31_22_re * o22_re; 02536 a31_re -= c31_22_im * o22_im; 02537 a31_im += c31_22_re * o22_im; 02538 a31_im += c31_22_im * o22_re; 02539 a31_re += c31_30_re * o30_re; 02540 a31_re -= c31_30_im * o30_im; 02541 a31_im += c31_30_re * o30_im; 02542 a31_im += c31_30_im * o30_re; 02543 a31_re += c31_31_re * o31_re; 02544 a31_im += c31_31_re * o31_im; 02545 a31_re += 
c31_32_re * o32_re; 02546 a31_re -= c31_32_im * o32_im; 02547 a31_im += c31_32_re * o32_im; 02548 a31_im += c31_32_im * o32_re; 02549 02550 a32_re += c32_20_re * o20_re; 02551 a32_re -= c32_20_im * o20_im; 02552 a32_im += c32_20_re * o20_im; 02553 a32_im += c32_20_im * o20_re; 02554 a32_re += c32_21_re * o21_re; 02555 a32_re -= c32_21_im * o21_im; 02556 a32_im += c32_21_re * o21_im; 02557 a32_im += c32_21_im * o21_re; 02558 a32_re += c32_22_re * o22_re; 02559 a32_re -= c32_22_im * o22_im; 02560 a32_im += c32_22_re * o22_im; 02561 a32_im += c32_22_im * o22_re; 02562 a32_re += c32_30_re * o30_re; 02563 a32_re -= c32_30_im * o30_im; 02564 a32_im += c32_30_re * o30_im; 02565 a32_im += c32_30_im * o30_re; 02566 a32_re += c32_31_re * o31_re; 02567 a32_re -= c32_31_im * o31_im; 02568 a32_im += c32_31_re * o31_im; 02569 a32_im += c32_31_im * o31_re; 02570 a32_re += c32_32_re * o32_re; 02571 a32_im += c32_32_re * o32_im; 02572 02573 o20_re = a20_re; o20_im = a20_im; 02574 o21_re = a21_re; o21_im = a21_im; 02575 o22_re = a22_re; o22_im = a22_im; 02576 o30_re = a30_re; o30_im = a30_im; 02577 o31_re = a31_re; o31_im = a31_im; 02578 o32_re = a32_re; o32_im = a32_im; 02579 02580 } 02581 02582 // change back from chiral basis 02583 // (note: required factor of 1/2 is included in clover term normalization) 02584 { 02585 spinorFloat a00_re = o10_re + o30_re; 02586 spinorFloat a00_im = o10_im + o30_im; 02587 spinorFloat a10_re = -o00_re - o20_re; 02588 spinorFloat a10_im = -o00_im - o20_im; 02589 spinorFloat a20_re = o10_re - o30_re; 02590 spinorFloat a20_im = o10_im - o30_im; 02591 spinorFloat a30_re = -o00_re + o20_re; 02592 spinorFloat a30_im = -o00_im + o20_im; 02593 02594 o00_re = a00_re; o00_im = a00_im; 02595 o10_re = a10_re; o10_im = a10_im; 02596 o20_re = a20_re; o20_im = a20_im; 02597 o30_re = a30_re; o30_im = a30_im; 02598 } 02599 02600 { 02601 spinorFloat a01_re = o11_re + o31_re; 02602 spinorFloat a01_im = o11_im + o31_im; 02603 spinorFloat a11_re = -o01_re - o21_re; 
02604 spinorFloat a11_im = -o01_im - o21_im; 02605 spinorFloat a21_re = o11_re - o31_re; 02606 spinorFloat a21_im = o11_im - o31_im; 02607 spinorFloat a31_re = -o01_re + o21_re; 02608 spinorFloat a31_im = -o01_im + o21_im; 02609 02610 o01_re = a01_re; o01_im = a01_im; 02611 o11_re = a11_re; o11_im = a11_im; 02612 o21_re = a21_re; o21_im = a21_im; 02613 o31_re = a31_re; o31_im = a31_im; 02614 } 02615 02616 { 02617 spinorFloat a02_re = o12_re + o32_re; 02618 spinorFloat a02_im = o12_im + o32_im; 02619 spinorFloat a12_re = -o02_re - o22_re; 02620 spinorFloat a12_im = -o02_im - o22_im; 02621 spinorFloat a22_re = o12_re - o32_re; 02622 spinorFloat a22_im = o12_im - o32_im; 02623 spinorFloat a32_re = -o02_re + o22_re; 02624 spinorFloat a32_im = -o02_im + o22_im; 02625 02626 o02_re = a02_re; o02_im = a02_im; 02627 o12_re = a12_re; o12_im = a12_im; 02628 o22_re = a22_re; o22_im = a22_im; 02629 o32_re = a32_re; o32_im = a32_im; 02630 } 02631 02632 #endif // DSLASH_CLOVER 02633 02634 #ifdef DSLASH_XPAY 02635 02636 READ_ACCUM(ACCUMTEX, sp_stride) 02637 02638 #ifdef SPINOR_DOUBLE 02639 o00_re = a*o00_re + accum0.x; 02640 o00_im = a*o00_im + accum0.y; 02641 o01_re = a*o01_re + accum1.x; 02642 o01_im = a*o01_im + accum1.y; 02643 o02_re = a*o02_re + accum2.x; 02644 o02_im = a*o02_im + accum2.y; 02645 o10_re = a*o10_re + accum3.x; 02646 o10_im = a*o10_im + accum3.y; 02647 o11_re = a*o11_re + accum4.x; 02648 o11_im = a*o11_im + accum4.y; 02649 o12_re = a*o12_re + accum5.x; 02650 o12_im = a*o12_im + accum5.y; 02651 o20_re = a*o20_re + accum6.x; 02652 o20_im = a*o20_im + accum6.y; 02653 o21_re = a*o21_re + accum7.x; 02654 o21_im = a*o21_im + accum7.y; 02655 o22_re = a*o22_re + accum8.x; 02656 o22_im = a*o22_im + accum8.y; 02657 o30_re = a*o30_re + accum9.x; 02658 o30_im = a*o30_im + accum9.y; 02659 o31_re = a*o31_re + accum10.x; 02660 o31_im = a*o31_im + accum10.y; 02661 o32_re = a*o32_re + accum11.x; 02662 o32_im = a*o32_im + accum11.y; 02663 #else 02664 o00_re = a*o00_re + 
accum0.x; 02665 o00_im = a*o00_im + accum0.y; 02666 o01_re = a*o01_re + accum0.z; 02667 o01_im = a*o01_im + accum0.w; 02668 o02_re = a*o02_re + accum1.x; 02669 o02_im = a*o02_im + accum1.y; 02670 o10_re = a*o10_re + accum1.z; 02671 o10_im = a*o10_im + accum1.w; 02672 o11_re = a*o11_re + accum2.x; 02673 o11_im = a*o11_im + accum2.y; 02674 o12_re = a*o12_re + accum2.z; 02675 o12_im = a*o12_im + accum2.w; 02676 o20_re = a*o20_re + accum3.x; 02677 o20_im = a*o20_im + accum3.y; 02678 o21_re = a*o21_re + accum3.z; 02679 o21_im = a*o21_im + accum3.w; 02680 o22_re = a*o22_re + accum4.x; 02681 o22_im = a*o22_im + accum4.y; 02682 o30_re = a*o30_re + accum4.z; 02683 o30_im = a*o30_im + accum4.w; 02684 o31_re = a*o31_re + accum5.x; 02685 o31_im = a*o31_im + accum5.y; 02686 o32_re = a*o32_re + accum5.z; 02687 o32_im = a*o32_im + accum5.w; 02688 #endif // SPINOR_DOUBLE 02689 02690 #endif // DSLASH_XPAY 02691 } 02692 02693 // write spinor field back to device memory 02694 WRITE_SPINOR(sp_stride); 02695 02696 // undefine to prevent warning when precision is changed 02697 #undef spinorFloat 02698 #undef WRITE_SPINOR_SHARED 02699 #undef READ_SPINOR_SHARED 02700 #undef SHARED_STRIDE 02701 02702 #undef A_re 02703 #undef A_im 02704 02705 #undef g00_re 02706 #undef g00_im 02707 #undef g01_re 02708 #undef g01_im 02709 #undef g02_re 02710 #undef g02_im 02711 #undef g10_re 02712 #undef g10_im 02713 #undef g11_re 02714 #undef g11_im 02715 #undef g12_re 02716 #undef g12_im 02717 #undef g20_re 02718 #undef g20_im 02719 #undef g21_re 02720 #undef g21_im 02721 #undef g22_re 02722 #undef g22_im 02723 02724 #undef i00_re 02725 #undef i00_im 02726 #undef i01_re 02727 #undef i01_im 02728 #undef i02_re 02729 #undef i02_im 02730 #undef i10_re 02731 #undef i10_im 02732 #undef i11_re 02733 #undef i11_im 02734 #undef i12_re 02735 #undef i12_im 02736 #undef i20_re 02737 #undef i20_im 02738 #undef i21_re 02739 #undef i21_im 02740 #undef i22_re 02741 #undef i22_im 02742 #undef i30_re 02743 #undef i30_im 
02744 #undef i31_re 02745 #undef i31_im 02746 #undef i32_re 02747 #undef i32_im 02748 02749 #undef c00_00_re 02750 #undef c01_01_re 02751 #undef c02_02_re 02752 #undef c10_10_re 02753 #undef c11_11_re 02754 #undef c12_12_re 02755 #undef c01_00_re 02756 #undef c01_00_im 02757 #undef c02_00_re 02758 #undef c02_00_im 02759 #undef c10_00_re 02760 #undef c10_00_im 02761 #undef c11_00_re 02762 #undef c11_00_im 02763 #undef c12_00_re 02764 #undef c12_00_im 02765 #undef c02_01_re 02766 #undef c02_01_im 02767 #undef c10_01_re 02768 #undef c10_01_im 02769 #undef c11_01_re 02770 #undef c11_01_im 02771 #undef c12_01_re 02772 #undef c12_01_im 02773 #undef c10_02_re 02774 #undef c10_02_im 02775 #undef c11_02_re 02776 #undef c11_02_im 02777 #undef c12_02_re 02778 #undef c12_02_im 02779 #undef c11_10_re 02780 #undef c11_10_im 02781 #undef c12_10_re 02782 #undef c12_10_im 02783 #undef c12_11_re 02784 #undef c12_11_im 02785 02786 #undef o00_re 02787 #undef o00_im 02788 #undef o01_re 02789 #undef o01_im 02790 #undef o02_re 02791 #undef o02_im 02792 #undef o10_re 02793 #undef o10_im 02794 #undef o11_re 02795 #undef o11_im 02796 #undef o12_re 02797 #undef o12_im 02798 #undef o20_re 02799 #undef o20_im 02800 #undef o21_re 02801 #undef o21_im 02802 #undef o22_re 02803 #undef o22_im 02804 #undef o30_re 02805 #undef o30_im 02806 #undef o31_re 02807 #undef o31_im 02808 #undef o32_re 02809 #undef o32_im 02810 02811 #undef VOLATILE