// QUDA v0.4.0 — A library for QCD on GPUs
// *** CUDA DSLASH ***
// NOTE(review): this file looks machine-generated (repetitive macro-driven
// structure); prefer fixing the generator rather than hand-editing — TODO confirm.

#define DSLASH_SHARED_FLOATS_PER_THREAD 24


// VOLATILE expands to nothing for the NVVM compiler, to `volatile` for Open64.
#if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
#define VOLATILE
#else // Open64 compiler
#define VOLATILE volatile
#endif

// input spinor: the 12 complex components i{spin}{color}_{re,im} are mapped
// onto the raw load registers I0..In (double2 pairs for double precision,
// float4 quads otherwise)
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
#define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I1.x
#define i01_im I1.y
#define i02_re I2.x
#define i02_im I2.y
#define i10_re I3.x
#define i10_im I3.y
#define i11_re I4.x
#define i11_im I4.y
#define i12_re I5.x
#define i12_im I5.y
#define i20_re I6.x
#define i20_im I6.y
#define i21_re I7.x
#define i21_im I7.y
#define i22_re I8.x
#define i22_im I8.y
#define i30_re I9.x
#define i30_im I9.y
#define i31_re I10.x
#define i31_im I10.y
#define i32_re I11.x
#define i32_im I11.y
#else
#define spinorFloat float
#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
#define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I0.z
#define i01_im I0.w
#define i02_re I1.x
#define i02_im I1.y
#define i10_re I1.z
#define i10_im I1.w
#define i11_re I2.x
#define i11_im I2.y
#define i12_re I2.z
#define i12_im I2.w
#define i20_re I3.x
#define i20_im I3.y
#define i21_re I3.z
#define i21_im I3.w
#define i22_re I4.x
#define i22_im I4.y
#define i30_re I4.z
#define i30_im I4.w
#define i31_re I5.x
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif // SPINOR_DOUBLE

// gauge link: 3x3 complex matrix g{row}{col}_{re,im} mapped onto the load
// registers G0..Gn (two components per register for FLOAT2, four otherwise)
#ifdef GAUGE_FLOAT2
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
#define g01_im G1.y
#define g02_re G2.x
#define g02_im G2.y
#define g10_re G3.x
#define g10_im G3.y
#define g11_re G4.x
#define g11_im G4.y
#define g12_re G5.x
#define g12_im G5.y
#define g20_re G6.x
#define g20_im G6.y
#define g21_re G7.x
#define g21_im G7.y
#define g22_re G8.x
#define g22_im G8.y
// temporaries
#define A_re G9.x
#define A_im G9.y

#else
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G0.z
#define g01_im G0.w
#define g02_re G1.x
#define g02_im G1.y
#define g10_re G1.z
#define g10_im G1.w
#define g11_re G2.x
#define g11_im G2.y
#define g12_re G2.z
#define g12_im G2.w
#define g20_re G3.x
#define g20_im G3.y
#define g21_re G3.z
#define g21_im G3.w
#define g22_re G4.x
#define g22_im G4.y
// temporaries
#define A_re G4.z
#define A_im G4.w

#endif // GAUGE_FLOAT2

// conjugated gauge link: gT is the hermitian conjugate of g
// (transpose of indices plus sign flip on the imaginary parts)
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)

// output spinor: per-thread accumulator for the dslash result
VOLATILE spinorFloat o00_re;
VOLATILE spinorFloat o00_im;
VOLATILE spinorFloat o01_re;
VOLATILE spinorFloat o01_im;
VOLATILE spinorFloat o02_re;
VOLATILE spinorFloat o02_im;
VOLATILE spinorFloat o10_re;
VOLATILE spinorFloat o10_im;
VOLATILE spinorFloat o11_re;
VOLATILE spinorFloat o11_im;
VOLATILE spinorFloat o12_re;
VOLATILE spinorFloat o12_im;
VOLATILE spinorFloat o20_re;
VOLATILE spinorFloat o20_im;
VOLATILE spinorFloat o21_re;
VOLATILE spinorFloat o21_im;
VOLATILE spinorFloat o22_re;
VOLATILE spinorFloat o22_im;
VOLATILE spinorFloat o30_re;
VOLATILE spinorFloat o30_im;
VOLATILE spinorFloat o31_re;
VOLATILE spinorFloat o31_im;
VOLATILE spinorFloat o32_re;
VOLATILE spinorFloat o32_im;

#ifdef SPINOR_DOUBLE
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#endif

#include "read_gauge.h"
#include "read_clover.h"
#include "io_spinor.h"

// lattice coordinates of this site, and its full (lexicographic) index X
int x1, x2, x3, x4;
int X;

#if (defined MULTI_GPU) && (DD_PREC==2) // half precision
int sp_norm_idx; // index into the ghost-zone norm array
#endif // MULTI_GPU half precision

int sid; // checkerboarded site index

#ifdef MULTI_GPU
int face_idx;
if (kernel_type == INTERIOR_KERNEL) {
#endif

  // Inline by hand for the moment and assume even dimensions
  //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);

  // thread x covers (x1,x4) pairs — two sites per index because of the
  // checkerboard — while threads y/z map directly to x2/x3
  int xt = blockIdx.x*blockDim.x + threadIdx.x;
  int aux = xt+xt;
  if (aux >= X1*X4) return;

  x4 = aux / X1;
  x1 = aux - x4*X1;

  x2 = blockIdx.y*blockDim.y + threadIdx.y;
  if (x2 >= X2) return;

  x3 = blockIdx.z*blockDim.z + threadIdx.z;
  if (x3 >= X3) return;

  // shift x1 so the thread lands on a site of the requested parity
  x1 += (param.parity + x4 + x3 + x2) &1;
  X = ((x4*X3 + x3)*X2 + x2)*X1 + x1;
  sid = X >> 1;

  // interior kernel starts from a zeroed accumulator
  o00_re = 0; o00_im = 0;
  o01_re = 0; o01_im = 0;
  o02_re = 0; o02_im = 0;
  o10_re = 0; o10_im = 0;
  o11_re = 0; o11_im = 0;
  o12_re = 0; o12_im = 0;
  o20_re = 0; o20_im = 0;
  o21_re = 0; o21_im = 0;
  o22_re = 0; o22_im = 0;
  o30_re = 0; o30_im = 0;
  o31_re = 0; o31_im = 0;
  o32_re = 0; o32_im = 0;

#ifdef MULTI_GPU
} else { // exterior kernel

  sid = blockIdx.x*blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

  const int dim = static_cast<int>(kernel_type);
  const int face_volume = (param.threads >> 1); // volume of one face
  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
  face_idx = sid - face_num*face_volume; // index into the respective face

  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
  //sp_idx = face_idx + param.ghostOffset[dim];

#if (DD_PREC==2) // half precision
  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
#endif

  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity);

  // exterior kernel seeds the accumulator with the partial result that the
  // interior kernel already stored for this site
  READ_INTERMEDIATE_SPINOR(INTERTEX, sp_stride, sid, sid);

  o00_re = i00_re; o00_im = i00_im;
  o01_re = i01_re; o01_im = i01_im;
  o02_re = i02_re; o02_im = i02_im;
  o10_re = i10_re; o10_im = i10_im;
  o11_re = i11_re; o11_im = i11_im;
  o12_re = i12_re; o12_im = i12_im;
  o20_re = i20_re; o20_im = i20_im;
  o21_re = i21_re; o21_im = i21_im;
  o22_re = i22_re; o22_im = i22_im;
  o30_re = i30_re; o30_im = i30_im;
  o31_re = i31_re; o31_im = i31_im;
  o32_re = i32_re; o32_im = i32_im;
}
#endif // MULTI_GPU


// forward hop in X: skipped by the interior kernel at the boundary when the
// X ghost dimension is partitioned; handled there by EXTERIOR_KERNEL_X
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
     (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
#endif
{
  // Projector P0-
  //  1  0  0 -i
  //  0  1 -i  0
  //  0  i  1  0
  //  i  0  0  1

  // neighbor index at x1+1 (with periodic wrap), or ghost-zone index
#ifdef MULTI_GPU
  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
    face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
#endif

  const int ga_idx = sid;

  // half-spinor temporaries produced by the spin projection
  spinorFloat a0_re, a0_im;
  spinorFloat a1_re, a1_im;
  spinorFloat a2_re, a2_im;
  spinorFloat b0_re, b0_im;
  spinorFloat b1_re, b1_im;
  spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // read spinor from device memory
    READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

    // store spinor into shared memory so neighboring threads in this block
    // can re-read it for their own hops (see the later READ_SPINOR_SHARED calls)
    WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);

    // project spinor into half spinors
    a0_re = +i00_re+i30_im;
    a0_im = +i00_im-i30_re;
    a1_re = +i01_re+i31_im;
    a1_im = +i01_im-i31_re;
    a2_re = +i02_re+i32_im;
    a2_im = +i02_im-i32_re;
    b0_re = +i10_re+i20_im;
    b0_im = +i10_im-i20_re;
    b1_re = +i11_re+i21_im;
    b1_im = +i11_im-i21_re;
    b2_re = +i12_re+i22_im;
    b2_im = +i12_im-i22_re;

#ifdef MULTI_GPU
  } else {

    const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

    // read half spinor from device memory (already projected on the sender;
    // the (SPINOR_HOP/2) offset selects the forward-going half)
    READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // read gauge matrix from device memory
  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);

  // reconstruct gauge matrix
  RECONSTRUCT_GAUGE_MATRIX(0);

  // multiply row 0 (complex 3x3 matrix times the two half-spinor color vectors)
  spinorFloat A0_re = 0;
  A0_re += g00_re * a0_re;
  A0_re -= g00_im * a0_im;
  A0_re += g01_re * a1_re;
  A0_re -= g01_im * a1_im;
  A0_re += g02_re * a2_re;
  A0_re -= g02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += g00_re * a0_im;
  A0_im += g00_im * a0_re;
  A0_im += g01_re * a1_im;
  A0_im += g01_im * a1_re;
  A0_im += g02_re * a2_im;
  A0_im += g02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += g00_re * b0_re;
  B0_re -= g00_im * b0_im;
  B0_re += g01_re * b1_re;
  B0_re -= g01_im * b1_im;
  B0_re += g02_re * b2_re;
  B0_re -= g02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += g00_re * b0_im;
  B0_im += g00_im * b0_re;
  B0_im += g01_re * b1_im;
  B0_im += g01_im * b1_re;
  B0_im += g02_re * b2_im;
  B0_im += g02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += g10_re * a0_re;
  A1_re -= g10_im * a0_im;
  A1_re += g11_re * a1_re;
  A1_re -= g11_im * a1_im;
  A1_re += g12_re * a2_re;
  A1_re -= g12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += g10_re * a0_im;
  A1_im += g10_im * a0_re;
  A1_im += g11_re * a1_im;
  A1_im += g11_im * a1_re;
  A1_im += g12_re * a2_im;
  A1_im += g12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += g10_re * b0_re;
  B1_re -= g10_im * b0_im;
  B1_re += g11_re * b1_re;
  B1_re -= g11_im * b1_im;
  B1_re += g12_re * b2_re;
  B1_re -= g12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += g10_re * b0_im;
  B1_im += g10_im * b0_re;
  B1_im += g11_re * b1_im;
  B1_im += g11_im * b1_re;
  B1_im += g12_re * b2_im;
  B1_im += g12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += g20_re * a0_re;
  A2_re -= g20_im * a0_im;
  A2_re += g21_re * a1_re;
  A2_re -= g21_im * a1_im;
  A2_re += g22_re * a2_re;
  A2_re -= g22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += g20_re * a0_im;
  A2_im += g20_im * a0_re;
  A2_im += g21_re * a1_im;
  A2_im += g21_im * a1_re;
  A2_im += g22_re * a2_im;
  A2_im += g22_im * a2_re;
  spinorFloat B2_re = 0;
  B2_re += g20_re * b0_re;
  B2_re -= g20_im * b0_im;
  B2_re += g21_re * b1_re;
  B2_re -= g21_im * b1_im;
  B2_re += g22_re * b2_re;
  B2_re -= g22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += g20_re * b0_im;
  B2_im += g20_im * b0_re;
  B2_im += g21_re * b1_im;
  B2_im += g21_im * b1_re;
  B2_im += g22_re * b2_im;
  B2_im += g22_im * b2_re;

  // reconstruct the full spinor from the half spinor and accumulate
  // (lower-spin components carry the P0- sign structure)
  o00_re += A0_re;
  o00_im += A0_im;
  o10_re += B0_re;
  o10_im += B0_im;
  o20_re -= B0_im;
  o20_im += B0_re;
  o30_re -= A0_im;
  o30_im += A0_re;

  o01_re += A1_re;
  o01_im += A1_im;
  o11_re += B1_re;
  o11_im += B1_im;
  o21_re -= B1_im;
  o21_im += B1_re;
  o31_re -= A1_im;
  o31_im += A1_re;

  o02_re += A2_re;
  o02_im += A2_im;
  o12_re += B2_re;
  o12_im += B2_im;
  o22_re -= B2_im;
  o22_im += B2_re;
  o32_re -= A2_im;
  o32_im += A2_re;

}

// backward hop in X: neighbor at x1-1, multiplied by the conjugated link gT
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
     (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
#endif
{
  // Projector P0+
  //  1  0  0  i
  //  0  1  i  0
  //  0 -i  1  0
  // -i  0  0  1

#ifdef MULTI_GPU
  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
    face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
#endif

#ifdef MULTI_GPU
  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ?
    sp_idx : Vh+face_idx);
#else
  const int ga_idx = sp_idx;
#endif

  // half-spinor temporaries produced by the spin projection
  spinorFloat a0_re, a0_im;
  spinorFloat a1_re, a1_im;
  spinorFloat a2_re, a2_im;
  spinorFloat b0_re, b0_im;
  spinorFloat b1_re, b1_im;
  spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // load spinor from shared memory: the x1-1 site was staged there by the
    // neighboring thread; the barrier orders that write before this read
    int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
    __syncthreads();
    READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);

    // project spinor into half spinors
    a0_re = +i00_re-i30_im;
    a0_im = +i00_im+i30_re;
    a1_re = +i01_re-i31_im;
    a1_im = +i01_im+i31_re;
    a2_re = +i02_re-i32_im;
    a2_im = +i02_im+i32_re;
    b0_re = +i10_re-i20_im;
    b0_im = +i10_im+i20_re;
    b1_re = +i11_re-i21_im;
    b1_im = +i11_im+i21_re;
    b2_re = +i12_re-i22_im;
    b2_im = +i12_im+i22_re;

#ifdef MULTI_GPU
  } else {

    const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

    // read half spinor from device memory (backward-going half: no hop offset)
    READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // read gauge matrix from device memory
  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);

  // reconstruct gauge matrix
  RECONSTRUCT_GAUGE_MATRIX(1);

  // multiply row 0 (conjugated link gT for the backward hop)
  spinorFloat A0_re = 0;
  A0_re += gT00_re * a0_re;
  A0_re -= gT00_im * a0_im;
  A0_re += gT01_re * a1_re;
  A0_re -= gT01_im * a1_im;
  A0_re += gT02_re * a2_re;
  A0_re -= gT02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += gT00_re * a0_im;
  A0_im += gT00_im * a0_re;
  A0_im += gT01_re * a1_im;
  A0_im += gT01_im * a1_re;
  A0_im += gT02_re * a2_im;
  A0_im += gT02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += gT00_re * b0_re;
  B0_re -= gT00_im * b0_im;
  B0_re += gT01_re * b1_re;
  B0_re -= gT01_im * b1_im;
  B0_re += gT02_re * b2_re;
  B0_re -= gT02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += gT00_re * b0_im;
  B0_im += gT00_im * b0_re;
  B0_im += gT01_re * b1_im;
  B0_im += gT01_im * b1_re;
  B0_im += gT02_re * b2_im;
  B0_im += gT02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += gT10_re * a0_re;
  A1_re -= gT10_im * a0_im;
  A1_re += gT11_re * a1_re;
  A1_re -= gT11_im * a1_im;
  A1_re += gT12_re * a2_re;
  A1_re -= gT12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += gT10_re * a0_im;
  A1_im += gT10_im * a0_re;
  A1_im += gT11_re * a1_im;
  A1_im += gT11_im * a1_re;
  A1_im += gT12_re * a2_im;
  A1_im += gT12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += gT10_re * b0_re;
  B1_re -= gT10_im * b0_im;
  B1_re += gT11_re * b1_re;
  B1_re -= gT11_im * b1_im;
  B1_re += gT12_re * b2_re;
  B1_re -= gT12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += gT10_re * b0_im;
  B1_im += gT10_im * b0_re;
  B1_im += gT11_re * b1_im;
  B1_im += gT11_im * b1_re;
  B1_im += gT12_re * b2_im;
  B1_im += gT12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += gT20_re * a0_re;
  A2_re -= gT20_im * a0_im;
  A2_re += gT21_re * a1_re;
  A2_re -= gT21_im * a1_im;
  A2_re += gT22_re * a2_re;
  A2_re -= gT22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += gT20_re * a0_im;
  A2_im += gT20_im * a0_re;
  A2_im += gT21_re * a1_im;
  A2_im += gT21_im * a1_re;
  A2_im += gT22_re * a2_im;
  A2_im += gT22_im * a2_re;
  spinorFloat B2_re =
    0;
  B2_re += gT20_re * b0_re;
  B2_re -= gT20_im * b0_im;
  B2_re += gT21_re * b1_re;
  B2_re -= gT21_im * b1_im;
  B2_re += gT22_re * b2_re;
  B2_re -= gT22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += gT20_re * b0_im;
  B2_im += gT20_im * b0_re;
  B2_im += gT21_re * b1_im;
  B2_im += gT21_im * b1_re;
  B2_im += gT22_re * b2_im;
  B2_im += gT22_im * b2_re;

  // reconstruct and accumulate with the P0+ sign structure
  o00_re += A0_re;
  o00_im += A0_im;
  o10_re += B0_re;
  o10_im += B0_im;
  o20_re += B0_im;
  o20_im -= B0_re;
  o30_re += A0_im;
  o30_im -= A0_re;

  o01_re += A1_re;
  o01_im += A1_im;
  o11_re += B1_re;
  o11_im += B1_im;
  o21_re += B1_im;
  o21_im -= B1_re;
  o31_re += A1_im;
  o31_im -= A1_re;

  o02_re += A2_re;
  o02_im += A2_im;
  o12_re += B2_re;
  o12_im += B2_im;
  o22_re += B2_im;
  o22_im -= B2_re;
  o32_re += A2_im;
  o32_im -= A2_re;

}

// forward hop in Y
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
     (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
#endif
{
  // Projector P1-
  //  1  0  0 -1
  //  0  1  1  0
  //  0  1  1  0
  // -1  0  0  1

#ifdef MULTI_GPU
  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
    face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
  const int sp_idx = (x2==X2m1 ?
    X-X2X1mX1 : X+X1) >> 1;
#endif

  const int ga_idx = sid;

  // half-spinor temporaries produced by the spin projection
  spinorFloat a0_re, a0_im;
  spinorFloat a1_re, a1_im;
  spinorFloat a2_re, a2_im;
  spinorFloat b0_re, b0_im;
  spinorFloat b1_re, b1_im;
  spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // the y-neighbor is only in shared memory if it lives in this block;
    // at the block edge fall back to a device-memory load.  Note: no further
    // shared-memory writes occur after the single __syncthreads() above, so
    // these later shared reads need no additional barrier.
    if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
      // read spinor from device memory
      READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

      // project spinor into half spinors
      a0_re = +i00_re-i30_re;
      a0_im = +i00_im-i30_im;
      a1_re = +i01_re-i31_re;
      a1_im = +i01_im-i31_im;
      a2_re = +i02_re-i32_re;
      a2_im = +i02_im-i32_im;
      b0_re = +i10_re+i20_re;
      b0_im = +i10_im+i20_im;
      b1_re = +i11_re+i21_re;
      b1_im = +i11_im+i21_im;
      b2_re = +i12_re+i22_re;
      b2_im = +i12_im+i22_im;
    } else {
      // load spinor from shared memory (tx compensates for the parity shift of x1)
      int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
      int ty = (threadIdx.y < blockDim.y - 1) ?
        threadIdx.y + 1 : 0;
      READ_SPINOR_SHARED(tx, ty, threadIdx.z);

      // project spinor into half spinors
      a0_re = +i00_re-i30_re;
      a0_im = +i00_im-i30_im;
      a1_re = +i01_re-i31_re;
      a1_im = +i01_im-i31_im;
      a2_re = +i02_re-i32_re;
      a2_im = +i02_im-i32_im;
      b0_re = +i10_re+i20_re;
      b0_im = +i10_im+i20_im;
      b1_re = +i11_re+i21_re;
      b1_im = +i11_im+i21_im;
      b2_re = +i12_re+i22_re;
      b2_im = +i12_im+i22_im;
    }

#ifdef MULTI_GPU
  } else {

    const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

    // read half spinor from device memory
    READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // read gauge matrix from device memory
  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);

  // reconstruct gauge matrix
  RECONSTRUCT_GAUGE_MATRIX(2);

  // multiply row 0
  spinorFloat A0_re = 0;
  A0_re += g00_re * a0_re;
  A0_re -= g00_im * a0_im;
  A0_re += g01_re * a1_re;
  A0_re -= g01_im * a1_im;
  A0_re += g02_re * a2_re;
  A0_re -= g02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += g00_re * a0_im;
  A0_im += g00_im * a0_re;
  A0_im += g01_re * a1_im;
  A0_im += g01_im * a1_re;
  A0_im += g02_re * a2_im;
  A0_im += g02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += g00_re * b0_re;
  B0_re -= g00_im * b0_im;
  B0_re += g01_re * b1_re;
  B0_re -= g01_im * b1_im;
  B0_re += g02_re * b2_re;
  B0_re -= g02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += g00_re * b0_im;
  B0_im += g00_im * b0_re;
  B0_im += g01_re * b1_im;
  B0_im += g01_im * b1_re;
  B0_im += g02_re * b2_im;
  B0_im += g02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += g10_re * a0_re;
  A1_re -= g10_im * a0_im;
  A1_re += g11_re * a1_re;
  A1_re -= g11_im * a1_im;
  A1_re += g12_re * a2_re;
  A1_re -= g12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += g10_re * a0_im;
  A1_im += g10_im * a0_re;
  A1_im += g11_re * a1_im;
  A1_im += g11_im * a1_re;
  A1_im += g12_re * a2_im;
  A1_im += g12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += g10_re * b0_re;
  B1_re -= g10_im * b0_im;
  B1_re += g11_re * b1_re;
  B1_re -= g11_im * b1_im;
  B1_re += g12_re * b2_re;
  B1_re -= g12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += g10_re * b0_im;
  B1_im += g10_im * b0_re;
  B1_im += g11_re * b1_im;
  B1_im += g11_im * b1_re;
  B1_im += g12_re * b2_im;
  B1_im += g12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += g20_re * a0_re;
  A2_re -= g20_im * a0_im;
  A2_re += g21_re * a1_re;
  A2_re -= g21_im * a1_im;
  A2_re += g22_re * a2_re;
  A2_re -= g22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += g20_re * a0_im;
  A2_im += g20_im * a0_re;
  A2_im += g21_re * a1_im;
  A2_im += g21_im * a1_re;
  A2_im += g22_re * a2_im;
  A2_im += g22_im * a2_re;
  spinorFloat B2_re = 0;
  B2_re += g20_re * b0_re;
  B2_re -= g20_im * b0_im;
  B2_re += g21_re * b1_re;
  B2_re -= g21_im * b1_im;
  B2_re += g22_re * b2_re;
  B2_re -= g22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += g20_re * b0_im;
  B2_im += g20_im * b0_re;
  B2_im += g21_re * b1_im;
  B2_im += g21_im * b1_re;
  B2_im += g22_re * b2_im;
  B2_im += g22_im * b2_re;

  // reconstruct and accumulate with the P1- sign structure
  o00_re += A0_re;
  o00_im += A0_im;
  o10_re += B0_re;
  o10_im += B0_im;
  o20_re += B0_re;
  o20_im += B0_im;
  o30_re -= A0_re;
  o30_im -= A0_im;

  o01_re += A1_re;
  o01_im += A1_im;
  o11_re += B1_re;
  o11_im += B1_im;
  o21_re += B1_re;
  o21_im += B1_im;
  o31_re -= A1_re;
  o31_im -= A1_im;

  o02_re += A2_re;
  o02_im += A2_im;
  o12_re += B2_re;
  o12_im += B2_im;
  o22_re += B2_re;
  o22_im += B2_im;
  o32_re -= A2_re;
  o32_im -= A2_im;

}

// backward hop in Y: neighbor at x2-1, multiplied by the conjugated link gT
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
     (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
#endif
{
  // Projector P1+
  //  1  0  0  1
  //  0  1 -1  0
  //  0 -1  1  0
  //  1  0  0  1

#ifdef MULTI_GPU
  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
    face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
#endif

#ifdef MULTI_GPU
  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ?
    sp_idx : Vh+face_idx);
#else
  const int ga_idx = sp_idx;
#endif

  // half-spinor temporaries produced by the spin projection
  spinorFloat a0_re, a0_im;
  spinorFloat a1_re, a1_im;
  spinorFloat a2_re, a2_im;
  spinorFloat b0_re, b0_im;
  spinorFloat b1_re, b1_im;
  spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // block-edge threads must load the y-1 neighbor from device memory;
    // interior threads re-read it from shared memory
    if (threadIdx.y == 0 && blockDim.y < X2) {
      // read spinor from device memory
      READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

      // project spinor into half spinors
      a0_re = +i00_re+i30_re;
      a0_im = +i00_im+i30_im;
      a1_re = +i01_re+i31_re;
      a1_im = +i01_im+i31_im;
      a2_re = +i02_re+i32_re;
      a2_im = +i02_im+i32_im;
      b0_re = +i10_re-i20_re;
      b0_im = +i10_im-i20_im;
      b1_re = +i11_re-i21_re;
      b1_im = +i11_im-i21_im;
      b2_re = +i12_re-i22_re;
      b2_im = +i12_im-i22_im;
    } else {
      // load spinor from shared memory
      int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
      int ty = (threadIdx.y > 0) ?
        threadIdx.y - 1 : blockDim.y - 1;
      READ_SPINOR_SHARED(tx, ty, threadIdx.z);

      // project spinor into half spinors
      a0_re = +i00_re+i30_re;
      a0_im = +i00_im+i30_im;
      a1_re = +i01_re+i31_re;
      a1_im = +i01_im+i31_im;
      a2_re = +i02_re+i32_re;
      a2_im = +i02_im+i32_im;
      b0_re = +i10_re-i20_re;
      b0_im = +i10_im-i20_im;
      b1_re = +i11_re-i21_re;
      b1_im = +i11_im-i21_im;
      b2_re = +i12_re-i22_re;
      b2_im = +i12_im-i22_im;
    }

#ifdef MULTI_GPU
  } else {

    const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

    // read half spinor from device memory (backward-going half: no hop offset)
    READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // read gauge matrix from device memory
  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);

  // reconstruct gauge matrix
  RECONSTRUCT_GAUGE_MATRIX(3);

  // multiply row 0 (conjugated link gT for the backward hop)
  spinorFloat A0_re = 0;
  A0_re += gT00_re * a0_re;
  A0_re -= gT00_im * a0_im;
  A0_re += gT01_re * a1_re;
  A0_re -= gT01_im * a1_im;
  A0_re += gT02_re * a2_re;
  A0_re -= gT02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += gT00_re * a0_im;
  A0_im += gT00_im * a0_re;
  A0_im += gT01_re * a1_im;
  A0_im += gT01_im * a1_re;
  A0_im += gT02_re * a2_im;
  A0_im += gT02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += gT00_re * b0_re;
  B0_re -= gT00_im * b0_im;
  B0_re += gT01_re * b1_re;
  B0_re -= gT01_im * b1_im;
  B0_re += gT02_re * b2_re;
  B0_re -= gT02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += gT00_re * b0_im;
  B0_im += gT00_im * b0_re;
  B0_im += gT01_re * b1_im;
  B0_im += gT01_im * b1_re;
  B0_im += gT02_re * b2_im;
  B0_im += gT02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += gT10_re * a0_re;
  A1_re -= gT10_im * a0_im;
  A1_re += gT11_re * a1_re;
  A1_re -= gT11_im * a1_im;
  A1_re += gT12_re * a2_re;
  A1_re -= gT12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += gT10_re * a0_im;
  A1_im += gT10_im * a0_re;
  A1_im += gT11_re * a1_im;
  A1_im += gT11_im * a1_re;
  A1_im += gT12_re * a2_im;
  A1_im += gT12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += gT10_re * b0_re;
  B1_re -= gT10_im * b0_im;
  B1_re += gT11_re * b1_re;
  B1_re -= gT11_im * b1_im;
  B1_re += gT12_re * b2_re;
  B1_re -= gT12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += gT10_re * b0_im;
  B1_im += gT10_im * b0_re;
  B1_im += gT11_re * b1_im;
  B1_im += gT11_im * b1_re;
  B1_im += gT12_re * b2_im;
  B1_im += gT12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += gT20_re * a0_re;
  A2_re -= gT20_im * a0_im;
  A2_re += gT21_re * a1_re;
  A2_re -= gT21_im * a1_im;
  A2_re += gT22_re * a2_re;
  A2_re -= gT22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += gT20_re * a0_im;
  A2_im += gT20_im * a0_re;
  A2_im += gT21_re * a1_im;
  A2_im += gT21_im * a1_re;
  A2_im += gT22_re * a2_im;
  A2_im += gT22_im * a2_re;
  spinorFloat B2_re = 0;
  B2_re += gT20_re * b0_re;
  B2_re -= gT20_im * b0_im;
  B2_re += gT21_re * b1_re;
  B2_re -= gT21_im * b1_im;
  B2_re += gT22_re * b2_re;
  B2_re -= gT22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += gT20_re * b0_im;
  B2_im += gT20_im * b0_re;
  B2_im += gT21_re * b1_im;
  B2_im += gT21_im * b1_re;
  B2_im += gT22_re * b2_im;
  B2_im += gT22_im * b2_re;

  // reconstruct and accumulate with the P1+ sign structure
  o00_re += A0_re;
  o00_im +=
    A0_im;
  o10_re += B0_re;
  o10_im += B0_im;
  o20_re -= B0_re;
  o20_im -= B0_im;
  o30_re += A0_re;
  o30_im += A0_im;

  o01_re += A1_re;
  o01_im += A1_im;
  o11_re += B1_re;
  o11_im += B1_im;
  o21_re -= B1_re;
  o21_im -= B1_im;
  o31_re += A1_re;
  o31_im += A1_im;

  o02_re += A2_re;
  o02_im += A2_im;
  o12_re += B2_re;
  o12_im += B2_im;
  o22_re -= B2_re;
  o22_im -= B2_im;
  o32_re += A2_re;
  o32_im += A2_im;

}

// forward hop in Z (section continues beyond this fragment)
#ifdef MULTI_GPU
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
     (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
#endif
{
  // Projector P2-
  //  1  0 -i  0
  //  0  1  0  i
  //  i  0  1  0
  //  0 -i  0  1

#ifdef MULTI_GPU
  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
    face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
  const int sp_idx = (x3==X3m1 ?
    X-X3X2X1mX2X1 : X+X2X1) >> 1;
#endif

  const int ga_idx = sid;

  // half-spinor temporaries produced by the spin projection
  spinorFloat a0_re, a0_im;
  spinorFloat a1_re, a1_im;
  spinorFloat a2_re, a2_im;
  spinorFloat b0_re, b0_im;
  spinorFloat b1_re, b1_im;
  spinorFloat b2_re, b2_im;

#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // block-edge threads load the z+1 neighbor from device memory;
    // interior threads re-read it from shared memory
    if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
      // read spinor from device memory
      READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);

      // project spinor into half spinors
      a0_re = +i00_re+i20_im;
      a0_im = +i00_im-i20_re;
      a1_re = +i01_re+i21_im;
      a1_im = +i01_im-i21_re;
      a2_re = +i02_re+i22_im;
      a2_im = +i02_im-i22_re;
      b0_re = +i10_re-i30_im;
      b0_im = +i10_im+i30_re;
      b1_re = +i11_re-i31_im;
      b1_im = +i11_im+i31_re;
      b2_re = +i12_re-i32_im;
      b2_im = +i12_im+i32_re;
    } else {
      // load spinor from shared memory
      int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
      int tz = (threadIdx.z < blockDim.z - 1) ?
        threadIdx.z + 1 : 0;
      READ_SPINOR_SHARED(tx, threadIdx.y, tz);

      // project spinor into half spinors
      a0_re = +i00_re+i20_im;
      a0_im = +i00_im-i20_re;
      a1_re = +i01_re+i21_im;
      a1_im = +i01_im-i21_re;
      a2_re = +i02_re+i22_im;
      a2_im = +i02_im-i22_re;
      b0_re = +i10_re-i30_im;
      b0_im = +i10_im+i30_re;
      b1_re = +i11_re-i31_im;
      b1_im = +i11_im+i31_re;
      b2_re = +i12_re-i32_im;
      b2_im = +i12_im+i32_re;
    }

#ifdef MULTI_GPU
  } else {

    const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

    // read half spinor from device memory
    READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // read gauge matrix from device memory
  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);

  // reconstruct gauge matrix
  RECONSTRUCT_GAUGE_MATRIX(4);

  // multiply row 0
  spinorFloat A0_re = 0;
  A0_re += g00_re * a0_re;
  A0_re -= g00_im * a0_im;
  A0_re += g01_re * a1_re;
  A0_re -= g01_im * a1_im;
  A0_re += g02_re * a2_re;
  A0_re -= g02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += g00_re * a0_im;
  A0_im += g00_im * a0_re;
  A0_im += g01_re * a1_im;
  A0_im += g01_im * a1_re;
  A0_im += g02_re * a2_im;
  A0_im += g02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += g00_re * b0_re;
  B0_re -= g00_im * b0_im;
  B0_re += g01_re * b1_re;
  B0_re -= g01_im * b1_im;
  B0_re += g02_re * b2_re;
  B0_re -= g02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += g00_re * b0_im;
  B0_im += g00_im * b0_re;
  B0_im += g01_re * b1_im;
  B0_im += g01_im * b1_re;
  B0_im += g02_re * b2_im;
  B0_im += g02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += g10_re * a0_re;
  A1_re -= g10_im * a0_im;
  A1_re += g11_re * a1_re;
  A1_re -= g11_im * a1_im;
  A1_re += g12_re * a2_re;
  A1_re -= g12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += g10_re * a0_im;
  A1_im += g10_im * a0_re;
  A1_im += g11_re * a1_im;
  A1_im += g11_im * a1_re;
  A1_im += g12_re * a2_im;
  A1_im += g12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += g10_re * b0_re;
  B1_re -= g10_im * b0_im;
  B1_re += g11_re * b1_re;
  B1_re -= g11_im * b1_im;
  B1_re += g12_re * b2_re;
  B1_re -= g12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += g10_re * b0_im;
  B1_im += g10_im * b0_re;
  B1_im += g11_re * b1_im;
  B1_im += g11_im * b1_re;
  B1_im += g12_re * b2_im;
  B1_im += g12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += g20_re * a0_re;
  A2_re -= g20_im * a0_im;
  A2_re += g21_re * a1_re;
  A2_re -= g21_im * a1_im;
  A2_re += g22_re * a2_re;
  A2_re -= g22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += g20_re * a0_im;
  A2_im += g20_im * a0_re;
  A2_im += g21_re * a1_im;
  A2_im += g21_im * a1_re;
  A2_im += g22_re * a2_im;
  A2_im += g22_im * a2_re;
  spinorFloat B2_re = 0;
  B2_re += g20_re * b0_re;
  B2_re -= g20_im * b0_im;
  B2_re += g21_re * b1_re;
  B2_re -= g21_im * b1_im;
  B2_re += g22_re * b2_re;
  B2_re -= g22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += g20_re * b0_im;
  B2_im += g20_im * b0_re;
  B2_im += g21_re * b1_im;
  B2_im += g21_im * b1_re;
  B2_im += g22_re * b2_im;
  B2_im += g22_im * b2_re;

  // reconstruct and accumulate (P2- sign structure; continues past this fragment)
  o00_re += A0_re;
  o00_im += A0_im;
  o10_re += B0_re;
  o10_im += B0_im;
01267 o20_re -= A0_im; 01268 o20_im += A0_re; 01269 o30_re += B0_im; 01270 o30_im -= B0_re; 01271 01272 o01_re += A1_re; 01273 o01_im += A1_im; 01274 o11_re += B1_re; 01275 o11_im += B1_im; 01276 o21_re -= A1_im; 01277 o21_im += A1_re; 01278 o31_re += B1_im; 01279 o31_im -= B1_re; 01280 01281 o02_re += A2_re; 01282 o02_im += A2_im; 01283 o12_re += B2_re; 01284 o12_im += B2_im; 01285 o22_re -= A2_im; 01286 o22_im += A2_re; 01287 o32_re += B2_im; 01288 o32_im -= B2_re; 01289 01290 } 01291 01292 #ifdef MULTI_GPU 01293 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) || 01294 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) ) 01295 #endif 01296 { 01297 // Projector P2+ 01298 // 1 0 i 0 01299 // 0 1 0 -i 01300 // -i 0 1 0 01301 // 0 i 0 1 01302 01303 #ifdef MULTI_GPU 01304 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 : 01305 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01306 #else 01307 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1; 01308 #endif 01309 01310 #ifdef MULTI_GPU 01311 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01312 #else 01313 const int ga_idx = sp_idx; 01314 #endif 01315 01316 spinorFloat a0_re, a0_im; 01317 spinorFloat a1_re, a1_im; 01318 spinorFloat a2_re, a2_im; 01319 spinorFloat b0_re, b0_im; 01320 spinorFloat b1_re, b1_im; 01321 spinorFloat b2_re, b2_im; 01322 01323 #ifdef MULTI_GPU 01324 if (kernel_type == INTERIOR_KERNEL) { 01325 #endif 01326 01327 if (threadIdx.z == 0 && blockDim.z < X3) { 01328 // read spinor from device memory 01329 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01330 01331 // project spinor into half spinors 01332 a0_re = +i00_re-i20_im; 01333 a0_im = +i00_im+i20_re; 01334 a1_re = +i01_re-i21_im; 01335 a1_im = +i01_im+i21_re; 01336 a2_re = +i02_re-i22_im; 01337 a2_im = +i02_im+i22_re; 01338 b0_re = +i10_re+i30_im; 01339 b0_im = +i10_im-i30_re; 01340 b1_re = +i11_re+i31_im; 01341 b1_im = +i11_im-i31_re; 01342 b2_re = +i12_re+i32_im; 01343 b2_im = +i12_im-i32_re; 01344 } else { 01345 // load spinor from shared memory 01346 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x; 01347 int tz = (threadIdx.z > 0) ? 
threadIdx.z - 1 : blockDim.z - 1; 01348 READ_SPINOR_SHARED(tx, threadIdx.y, tz); 01349 01350 // project spinor into half spinors 01351 a0_re = +i00_re-i20_im; 01352 a0_im = +i00_im+i20_re; 01353 a1_re = +i01_re-i21_im; 01354 a1_im = +i01_im+i21_re; 01355 a2_re = +i02_re-i22_im; 01356 a2_im = +i02_im+i22_re; 01357 b0_re = +i10_re+i30_im; 01358 b0_im = +i10_im-i30_re; 01359 b1_re = +i11_re+i31_im; 01360 b1_im = +i11_im-i31_re; 01361 b2_re = +i12_re+i32_im; 01362 b2_im = +i12_im-i32_re; 01363 } 01364 01365 #ifdef MULTI_GPU 01366 } else { 01367 01368 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01369 01370 // read half spinor from device memory 01371 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01372 01373 a0_re = i00_re; a0_im = i00_im; 01374 a1_re = i01_re; a1_im = i01_im; 01375 a2_re = i02_re; a2_im = i02_im; 01376 b0_re = i10_re; b0_im = i10_im; 01377 b1_re = i11_re; b1_im = i11_im; 01378 b2_re = i12_re; b2_im = i12_im; 01379 01380 } 01381 #endif // MULTI_GPU 01382 01383 // read gauge matrix from device memory 01384 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01385 01386 // reconstruct gauge matrix 01387 RECONSTRUCT_GAUGE_MATRIX(5); 01388 01389 // multiply row 0 01390 spinorFloat A0_re = 0; 01391 A0_re += gT00_re * a0_re; 01392 A0_re -= gT00_im * a0_im; 01393 A0_re += gT01_re * a1_re; 01394 A0_re -= gT01_im * a1_im; 01395 A0_re += gT02_re * a2_re; 01396 A0_re -= gT02_im * a2_im; 01397 spinorFloat A0_im = 0; 01398 A0_im += gT00_re * a0_im; 01399 A0_im += gT00_im * a0_re; 01400 A0_im += gT01_re * a1_im; 01401 A0_im += gT01_im * a1_re; 01402 A0_im += gT02_re * a2_im; 01403 A0_im += gT02_im * a2_re; 01404 spinorFloat B0_re = 0; 01405 B0_re += gT00_re * b0_re; 01406 B0_re -= gT00_im * b0_im; 01407 B0_re += gT01_re * b1_re; 01408 B0_re -= gT01_im * b1_im; 01409 B0_re += gT02_re * b2_re; 01410 B0_re -= gT02_im * b2_im; 01411 spinorFloat B0_im = 0; 01412 B0_im += gT00_re * b0_im; 01413 B0_im += gT00_im * b0_re; 01414 
B0_im += gT01_re * b1_im; 01415 B0_im += gT01_im * b1_re; 01416 B0_im += gT02_re * b2_im; 01417 B0_im += gT02_im * b2_re; 01418 01419 // multiply row 1 01420 spinorFloat A1_re = 0; 01421 A1_re += gT10_re * a0_re; 01422 A1_re -= gT10_im * a0_im; 01423 A1_re += gT11_re * a1_re; 01424 A1_re -= gT11_im * a1_im; 01425 A1_re += gT12_re * a2_re; 01426 A1_re -= gT12_im * a2_im; 01427 spinorFloat A1_im = 0; 01428 A1_im += gT10_re * a0_im; 01429 A1_im += gT10_im * a0_re; 01430 A1_im += gT11_re * a1_im; 01431 A1_im += gT11_im * a1_re; 01432 A1_im += gT12_re * a2_im; 01433 A1_im += gT12_im * a2_re; 01434 spinorFloat B1_re = 0; 01435 B1_re += gT10_re * b0_re; 01436 B1_re -= gT10_im * b0_im; 01437 B1_re += gT11_re * b1_re; 01438 B1_re -= gT11_im * b1_im; 01439 B1_re += gT12_re * b2_re; 01440 B1_re -= gT12_im * b2_im; 01441 spinorFloat B1_im = 0; 01442 B1_im += gT10_re * b0_im; 01443 B1_im += gT10_im * b0_re; 01444 B1_im += gT11_re * b1_im; 01445 B1_im += gT11_im * b1_re; 01446 B1_im += gT12_re * b2_im; 01447 B1_im += gT12_im * b2_re; 01448 01449 // multiply row 2 01450 spinorFloat A2_re = 0; 01451 A2_re += gT20_re * a0_re; 01452 A2_re -= gT20_im * a0_im; 01453 A2_re += gT21_re * a1_re; 01454 A2_re -= gT21_im * a1_im; 01455 A2_re += gT22_re * a2_re; 01456 A2_re -= gT22_im * a2_im; 01457 spinorFloat A2_im = 0; 01458 A2_im += gT20_re * a0_im; 01459 A2_im += gT20_im * a0_re; 01460 A2_im += gT21_re * a1_im; 01461 A2_im += gT21_im * a1_re; 01462 A2_im += gT22_re * a2_im; 01463 A2_im += gT22_im * a2_re; 01464 spinorFloat B2_re = 0; 01465 B2_re += gT20_re * b0_re; 01466 B2_re -= gT20_im * b0_im; 01467 B2_re += gT21_re * b1_re; 01468 B2_re -= gT21_im * b1_im; 01469 B2_re += gT22_re * b2_re; 01470 B2_re -= gT22_im * b2_im; 01471 spinorFloat B2_im = 0; 01472 B2_im += gT20_re * b0_im; 01473 B2_im += gT20_im * b0_re; 01474 B2_im += gT21_re * b1_im; 01475 B2_im += gT21_im * b1_re; 01476 B2_im += gT22_re * b2_im; 01477 B2_im += gT22_im * b2_re; 01478 01479 o00_re += A0_re; 01480 o00_im += 
A0_im; 01481 o10_re += B0_re; 01482 o10_im += B0_im; 01483 o20_re += A0_im; 01484 o20_im -= A0_re; 01485 o30_re -= B0_im; 01486 o30_im += B0_re; 01487 01488 o01_re += A1_re; 01489 o01_im += A1_im; 01490 o11_re += B1_re; 01491 o11_im += B1_im; 01492 o21_re += A1_im; 01493 o21_im -= A1_re; 01494 o31_re -= B1_im; 01495 o31_im += B1_re; 01496 01497 o02_re += A2_re; 01498 o02_im += A2_im; 01499 o12_re += B2_re; 01500 o12_im += B2_im; 01501 o22_re += A2_im; 01502 o22_im -= A2_re; 01503 o32_re -= B2_im; 01504 o32_im += B2_re; 01505 01506 } 01507 01508 #ifdef MULTI_GPU 01509 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) || 01510 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) ) 01511 #endif 01512 { 01513 // Projector P3- 01514 // 0 0 0 0 01515 // 0 0 0 0 01516 // 0 0 2 0 01517 // 0 0 0 2 01518 01519 #ifdef MULTI_GPU 01520 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 : 01521 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01522 #else 01523 const int sp_idx = (x4==X4m1 ? 
X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1; 01524 #endif 01525 01526 const int ga_idx = sid; 01527 01528 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01529 { 01530 spinorFloat a0_re, a0_im; 01531 spinorFloat a1_re, a1_im; 01532 spinorFloat a2_re, a2_im; 01533 spinorFloat b0_re, b0_im; 01534 spinorFloat b1_re, b1_im; 01535 spinorFloat b2_re, b2_im; 01536 01537 #ifdef MULTI_GPU 01538 if (kernel_type == INTERIOR_KERNEL) { 01539 #endif 01540 01541 // read spinor from device memory 01542 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01543 01544 // project spinor into half spinors 01545 a0_re = +2*i20_re; 01546 a0_im = +2*i20_im; 01547 a1_re = +2*i21_re; 01548 a1_im = +2*i21_im; 01549 a2_re = +2*i22_re; 01550 a2_im = +2*i22_im; 01551 b0_re = +2*i30_re; 01552 b0_im = +2*i30_im; 01553 b1_re = +2*i31_re; 01554 b1_im = +2*i31_im; 01555 b2_re = +2*i32_re; 01556 b2_im = +2*i32_im; 01557 01558 #ifdef MULTI_GPU 01559 } else { 01560 01561 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01562 const int t_proj_scale = TPROJSCALE; 01563 01564 // read half spinor from device memory 01565 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01566 01567 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01568 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01569 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01570 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01571 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01572 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01573 01574 } 01575 #endif // MULTI_GPU 01576 01577 // identity gauge matrix 01578 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01579 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01580 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01581 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01582 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01583 spinorFloat B2_re = 
b2_re; spinorFloat B2_im = b2_im; 01584 01585 o20_re += A0_re; 01586 o20_im += A0_im; 01587 o30_re += B0_re; 01588 o30_im += B0_im; 01589 01590 o21_re += A1_re; 01591 o21_im += A1_im; 01592 o31_re += B1_re; 01593 o31_im += B1_im; 01594 01595 o22_re += A2_re; 01596 o22_im += A2_im; 01597 o32_re += B2_re; 01598 o32_im += B2_im; 01599 01600 } else { 01601 spinorFloat a0_re, a0_im; 01602 spinorFloat a1_re, a1_im; 01603 spinorFloat a2_re, a2_im; 01604 spinorFloat b0_re, b0_im; 01605 spinorFloat b1_re, b1_im; 01606 spinorFloat b2_re, b2_im; 01607 01608 #ifdef MULTI_GPU 01609 if (kernel_type == INTERIOR_KERNEL) { 01610 #endif 01611 01612 // read spinor from device memory 01613 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01614 01615 // project spinor into half spinors 01616 a0_re = +2*i20_re; 01617 a0_im = +2*i20_im; 01618 a1_re = +2*i21_re; 01619 a1_im = +2*i21_im; 01620 a2_re = +2*i22_re; 01621 a2_im = +2*i22_im; 01622 b0_re = +2*i30_re; 01623 b0_im = +2*i30_im; 01624 b1_re = +2*i31_re; 01625 b1_im = +2*i31_im; 01626 b2_re = +2*i32_re; 01627 b2_im = +2*i32_im; 01628 01629 #ifdef MULTI_GPU 01630 } else { 01631 01632 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01633 const int t_proj_scale = TPROJSCALE; 01634 01635 // read half spinor from device memory 01636 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx); 01637 01638 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01639 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01640 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01641 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01642 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01643 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01644 01645 } 01646 #endif // MULTI_GPU 01647 01648 // read gauge matrix from device memory 01649 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01650 01651 // reconstruct gauge matrix 01652 
RECONSTRUCT_GAUGE_MATRIX(6); 01653 01654 // multiply row 0 01655 spinorFloat A0_re = 0; 01656 A0_re += g00_re * a0_re; 01657 A0_re -= g00_im * a0_im; 01658 A0_re += g01_re * a1_re; 01659 A0_re -= g01_im * a1_im; 01660 A0_re += g02_re * a2_re; 01661 A0_re -= g02_im * a2_im; 01662 spinorFloat A0_im = 0; 01663 A0_im += g00_re * a0_im; 01664 A0_im += g00_im * a0_re; 01665 A0_im += g01_re * a1_im; 01666 A0_im += g01_im * a1_re; 01667 A0_im += g02_re * a2_im; 01668 A0_im += g02_im * a2_re; 01669 spinorFloat B0_re = 0; 01670 B0_re += g00_re * b0_re; 01671 B0_re -= g00_im * b0_im; 01672 B0_re += g01_re * b1_re; 01673 B0_re -= g01_im * b1_im; 01674 B0_re += g02_re * b2_re; 01675 B0_re -= g02_im * b2_im; 01676 spinorFloat B0_im = 0; 01677 B0_im += g00_re * b0_im; 01678 B0_im += g00_im * b0_re; 01679 B0_im += g01_re * b1_im; 01680 B0_im += g01_im * b1_re; 01681 B0_im += g02_re * b2_im; 01682 B0_im += g02_im * b2_re; 01683 01684 // multiply row 1 01685 spinorFloat A1_re = 0; 01686 A1_re += g10_re * a0_re; 01687 A1_re -= g10_im * a0_im; 01688 A1_re += g11_re * a1_re; 01689 A1_re -= g11_im * a1_im; 01690 A1_re += g12_re * a2_re; 01691 A1_re -= g12_im * a2_im; 01692 spinorFloat A1_im = 0; 01693 A1_im += g10_re * a0_im; 01694 A1_im += g10_im * a0_re; 01695 A1_im += g11_re * a1_im; 01696 A1_im += g11_im * a1_re; 01697 A1_im += g12_re * a2_im; 01698 A1_im += g12_im * a2_re; 01699 spinorFloat B1_re = 0; 01700 B1_re += g10_re * b0_re; 01701 B1_re -= g10_im * b0_im; 01702 B1_re += g11_re * b1_re; 01703 B1_re -= g11_im * b1_im; 01704 B1_re += g12_re * b2_re; 01705 B1_re -= g12_im * b2_im; 01706 spinorFloat B1_im = 0; 01707 B1_im += g10_re * b0_im; 01708 B1_im += g10_im * b0_re; 01709 B1_im += g11_re * b1_im; 01710 B1_im += g11_im * b1_re; 01711 B1_im += g12_re * b2_im; 01712 B1_im += g12_im * b2_re; 01713 01714 // multiply row 2 01715 spinorFloat A2_re = 0; 01716 A2_re += g20_re * a0_re; 01717 A2_re -= g20_im * a0_im; 01718 A2_re += g21_re * a1_re; 01719 A2_re -= g21_im * a1_im; 01720 
A2_re += g22_re * a2_re; 01721 A2_re -= g22_im * a2_im; 01722 spinorFloat A2_im = 0; 01723 A2_im += g20_re * a0_im; 01724 A2_im += g20_im * a0_re; 01725 A2_im += g21_re * a1_im; 01726 A2_im += g21_im * a1_re; 01727 A2_im += g22_re * a2_im; 01728 A2_im += g22_im * a2_re; 01729 spinorFloat B2_re = 0; 01730 B2_re += g20_re * b0_re; 01731 B2_re -= g20_im * b0_im; 01732 B2_re += g21_re * b1_re; 01733 B2_re -= g21_im * b1_im; 01734 B2_re += g22_re * b2_re; 01735 B2_re -= g22_im * b2_im; 01736 spinorFloat B2_im = 0; 01737 B2_im += g20_re * b0_im; 01738 B2_im += g20_im * b0_re; 01739 B2_im += g21_re * b1_im; 01740 B2_im += g21_im * b1_re; 01741 B2_im += g22_re * b2_im; 01742 B2_im += g22_im * b2_re; 01743 01744 o20_re += A0_re; 01745 o20_im += A0_im; 01746 o30_re += B0_re; 01747 o30_im += B0_im; 01748 01749 o21_re += A1_re; 01750 o21_im += A1_im; 01751 o31_re += B1_re; 01752 o31_im += B1_im; 01753 01754 o22_re += A2_re; 01755 o22_im += A2_im; 01756 o32_re += B2_re; 01757 o32_im += B2_im; 01758 01759 } 01760 } 01761 01762 #ifdef MULTI_GPU 01763 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) || 01764 (kernel_type == EXTERIOR_KERNEL_T && x4==0) ) 01765 #endif 01766 { 01767 // Projector P3+ 01768 // 2 0 0 0 01769 // 0 2 0 0 01770 // 0 0 0 0 01771 // 0 0 0 0 01772 01773 #ifdef MULTI_GPU 01774 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 : 01775 face_idx + param.ghostOffset[static_cast<int>(kernel_type)]; 01776 #else 01777 const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1; 01778 #endif 01779 01780 #ifdef MULTI_GPU 01781 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? 
sp_idx : Vh+face_idx); 01782 #else 01783 const int ga_idx = sp_idx; 01784 #endif 01785 01786 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) 01787 { 01788 spinorFloat a0_re, a0_im; 01789 spinorFloat a1_re, a1_im; 01790 spinorFloat a2_re, a2_im; 01791 spinorFloat b0_re, b0_im; 01792 spinorFloat b1_re, b1_im; 01793 spinorFloat b2_re, b2_im; 01794 01795 #ifdef MULTI_GPU 01796 if (kernel_type == INTERIOR_KERNEL) { 01797 #endif 01798 01799 // read spinor from device memory 01800 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01801 01802 // project spinor into half spinors 01803 a0_re = +2*i00_re; 01804 a0_im = +2*i00_im; 01805 a1_re = +2*i01_re; 01806 a1_im = +2*i01_im; 01807 a2_re = +2*i02_re; 01808 a2_im = +2*i02_im; 01809 b0_re = +2*i10_re; 01810 b0_im = +2*i10_im; 01811 b1_re = +2*i11_re; 01812 b1_im = +2*i11_im; 01813 b2_re = +2*i12_re; 01814 b2_im = +2*i12_im; 01815 01816 #ifdef MULTI_GPU 01817 } else { 01818 01819 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01820 const int t_proj_scale = TPROJSCALE; 01821 01822 // read half spinor from device memory 01823 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01824 01825 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01826 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01827 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01828 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01829 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01830 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01831 01832 } 01833 #endif // MULTI_GPU 01834 01835 // identity gauge matrix 01836 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01837 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01838 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01839 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01840 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01841 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 
01842 01843 o00_re += A0_re; 01844 o00_im += A0_im; 01845 o10_re += B0_re; 01846 o10_im += B0_im; 01847 01848 o01_re += A1_re; 01849 o01_im += A1_im; 01850 o11_re += B1_re; 01851 o11_im += B1_im; 01852 01853 o02_re += A2_re; 01854 o02_im += A2_im; 01855 o12_re += B2_re; 01856 o12_im += B2_im; 01857 01858 } else { 01859 spinorFloat a0_re, a0_im; 01860 spinorFloat a1_re, a1_im; 01861 spinorFloat a2_re, a2_im; 01862 spinorFloat b0_re, b0_im; 01863 spinorFloat b1_re, b1_im; 01864 spinorFloat b2_re, b2_im; 01865 01866 #ifdef MULTI_GPU 01867 if (kernel_type == INTERIOR_KERNEL) { 01868 #endif 01869 01870 // read spinor from device memory 01871 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01872 01873 // project spinor into half spinors 01874 a0_re = +2*i00_re; 01875 a0_im = +2*i00_im; 01876 a1_re = +2*i01_re; 01877 a1_im = +2*i01_im; 01878 a2_re = +2*i02_re; 01879 a2_im = +2*i02_im; 01880 b0_re = +2*i10_re; 01881 b0_im = +2*i10_im; 01882 b1_re = +2*i11_re; 01883 b1_im = +2*i11_im; 01884 b2_re = +2*i12_re; 01885 b2_im = +2*i12_im; 01886 01887 #ifdef MULTI_GPU 01888 } else { 01889 01890 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)]; 01891 const int t_proj_scale = TPROJSCALE; 01892 01893 // read half spinor from device memory 01894 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx); 01895 01896 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im; 01897 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im; 01898 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im; 01899 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im; 01900 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im; 01901 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im; 01902 01903 } 01904 #endif // MULTI_GPU 01905 01906 // read gauge matrix from device memory 01907 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01908 01909 // reconstruct gauge matrix 01910 RECONSTRUCT_GAUGE_MATRIX(7); 01911 01912 // multiply row 0 01913 
spinorFloat A0_re = 0; 01914 A0_re += gT00_re * a0_re; 01915 A0_re -= gT00_im * a0_im; 01916 A0_re += gT01_re * a1_re; 01917 A0_re -= gT01_im * a1_im; 01918 A0_re += gT02_re * a2_re; 01919 A0_re -= gT02_im * a2_im; 01920 spinorFloat A0_im = 0; 01921 A0_im += gT00_re * a0_im; 01922 A0_im += gT00_im * a0_re; 01923 A0_im += gT01_re * a1_im; 01924 A0_im += gT01_im * a1_re; 01925 A0_im += gT02_re * a2_im; 01926 A0_im += gT02_im * a2_re; 01927 spinorFloat B0_re = 0; 01928 B0_re += gT00_re * b0_re; 01929 B0_re -= gT00_im * b0_im; 01930 B0_re += gT01_re * b1_re; 01931 B0_re -= gT01_im * b1_im; 01932 B0_re += gT02_re * b2_re; 01933 B0_re -= gT02_im * b2_im; 01934 spinorFloat B0_im = 0; 01935 B0_im += gT00_re * b0_im; 01936 B0_im += gT00_im * b0_re; 01937 B0_im += gT01_re * b1_im; 01938 B0_im += gT01_im * b1_re; 01939 B0_im += gT02_re * b2_im; 01940 B0_im += gT02_im * b2_re; 01941 01942 // multiply row 1 01943 spinorFloat A1_re = 0; 01944 A1_re += gT10_re * a0_re; 01945 A1_re -= gT10_im * a0_im; 01946 A1_re += gT11_re * a1_re; 01947 A1_re -= gT11_im * a1_im; 01948 A1_re += gT12_re * a2_re; 01949 A1_re -= gT12_im * a2_im; 01950 spinorFloat A1_im = 0; 01951 A1_im += gT10_re * a0_im; 01952 A1_im += gT10_im * a0_re; 01953 A1_im += gT11_re * a1_im; 01954 A1_im += gT11_im * a1_re; 01955 A1_im += gT12_re * a2_im; 01956 A1_im += gT12_im * a2_re; 01957 spinorFloat B1_re = 0; 01958 B1_re += gT10_re * b0_re; 01959 B1_re -= gT10_im * b0_im; 01960 B1_re += gT11_re * b1_re; 01961 B1_re -= gT11_im * b1_im; 01962 B1_re += gT12_re * b2_re; 01963 B1_re -= gT12_im * b2_im; 01964 spinorFloat B1_im = 0; 01965 B1_im += gT10_re * b0_im; 01966 B1_im += gT10_im * b0_re; 01967 B1_im += gT11_re * b1_im; 01968 B1_im += gT11_im * b1_re; 01969 B1_im += gT12_re * b2_im; 01970 B1_im += gT12_im * b2_re; 01971 01972 // multiply row 2 01973 spinorFloat A2_re = 0; 01974 A2_re += gT20_re * a0_re; 01975 A2_re -= gT20_im * a0_im; 01976 A2_re += gT21_re * a1_re; 01977 A2_re -= gT21_im * a1_im; 01978 A2_re += 
gT22_re * a2_re; 01979 A2_re -= gT22_im * a2_im; 01980 spinorFloat A2_im = 0; 01981 A2_im += gT20_re * a0_im; 01982 A2_im += gT20_im * a0_re; 01983 A2_im += gT21_re * a1_im; 01984 A2_im += gT21_im * a1_re; 01985 A2_im += gT22_re * a2_im; 01986 A2_im += gT22_im * a2_re; 01987 spinorFloat B2_re = 0; 01988 B2_re += gT20_re * b0_re; 01989 B2_re -= gT20_im * b0_im; 01990 B2_re += gT21_re * b1_re; 01991 B2_re -= gT21_im * b1_im; 01992 B2_re += gT22_re * b2_re; 01993 B2_re -= gT22_im * b2_im; 01994 spinorFloat B2_im = 0; 01995 B2_im += gT20_re * b0_im; 01996 B2_im += gT20_im * b0_re; 01997 B2_im += gT21_re * b1_im; 01998 B2_im += gT21_im * b1_re; 01999 B2_im += gT22_re * b2_im; 02000 B2_im += gT22_im * b2_re; 02001 02002 o00_re += A0_re; 02003 o00_im += A0_im; 02004 o10_re += B0_re; 02005 o10_im += B0_im; 02006 02007 o01_re += A1_re; 02008 o01_im += A1_im; 02009 o11_re += B1_re; 02010 o11_im += B1_im; 02011 02012 o02_re += A2_re; 02013 o02_im += A2_im; 02014 o12_re += B2_re; 02015 o12_im += B2_im; 02016 02017 } 02018 } 02019 02020 #ifdef MULTI_GPU 02021 02022 int incomplete = 0; // Have all 8 contributions been computed for this site? 
02023 02024 switch(kernel_type) { // intentional fall-through 02025 case INTERIOR_KERNEL: 02026 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1)); 02027 case EXTERIOR_KERNEL_T: 02028 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1)); 02029 case EXTERIOR_KERNEL_Z: 02030 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1)); 02031 case EXTERIOR_KERNEL_Y: 02032 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1)); 02033 } 02034 02035 if (!incomplete) 02036 #endif // MULTI_GPU 02037 { 02038 02039 { 02040 // apply twisted mass rotation 02041 VOLATILE spinorFloat tmp00_re = +o00_re-o20_im*a; 02042 VOLATILE spinorFloat tmp00_im = +o00_im+o20_re*a; 02043 VOLATILE spinorFloat tmp01_re = +o01_re-o21_im*a; 02044 VOLATILE spinorFloat tmp01_im = +o01_im+o21_re*a; 02045 VOLATILE spinorFloat tmp02_re = +o02_re-o22_im*a; 02046 VOLATILE spinorFloat tmp02_im = +o02_im+o22_re*a; 02047 02048 VOLATILE spinorFloat tmp10_re = +o10_re-o30_im*a; 02049 VOLATILE spinorFloat tmp10_im = +o10_im+o30_re*a; 02050 VOLATILE spinorFloat tmp11_re = +o11_re-o31_im*a; 02051 VOLATILE spinorFloat tmp11_im = +o11_im+o31_re*a; 02052 VOLATILE spinorFloat tmp12_re = +o12_re-o32_im*a; 02053 VOLATILE spinorFloat tmp12_im = +o12_im+o32_re*a; 02054 02055 VOLATILE spinorFloat tmp20_re = -o00_im*a+o20_re; 02056 VOLATILE spinorFloat tmp20_im = +o00_re*a+o20_im; 02057 VOLATILE spinorFloat tmp21_re = -o01_im*a+o21_re; 02058 VOLATILE spinorFloat tmp21_im = +o01_re*a+o21_im; 02059 VOLATILE spinorFloat tmp22_re = -o02_im*a+o22_re; 02060 VOLATILE spinorFloat tmp22_im = +o02_re*a+o22_im; 02061 02062 VOLATILE spinorFloat tmp30_re = -o10_im*a+o30_re; 02063 VOLATILE spinorFloat tmp30_im = +o10_re*a+o30_im; 02064 VOLATILE spinorFloat tmp31_re = -o11_im*a+o31_re; 02065 VOLATILE spinorFloat tmp31_im = +o11_re*a+o31_im; 02066 VOLATILE spinorFloat tmp32_re = -o12_im*a+o32_re; 02067 VOLATILE spinorFloat tmp32_im = +o12_re*a+o32_im; 02068 02069 02070 #ifndef 
DSLASH_XPAY 02071 //scale by b = 1/(1 + a*a) 02072 o00_re = b*tmp00_re; 02073 o00_im = b*tmp00_im; 02074 o01_re = b*tmp01_re; 02075 o01_im = b*tmp01_im; 02076 o02_re = b*tmp02_re; 02077 o02_im = b*tmp02_im; 02078 o10_re = b*tmp10_re; 02079 o10_im = b*tmp10_im; 02080 o11_re = b*tmp11_re; 02081 o11_im = b*tmp11_im; 02082 o12_re = b*tmp12_re; 02083 o12_im = b*tmp12_im; 02084 o20_re = b*tmp20_re; 02085 o20_im = b*tmp20_im; 02086 o21_re = b*tmp21_re; 02087 o21_im = b*tmp21_im; 02088 o22_re = b*tmp22_re; 02089 o22_im = b*tmp22_im; 02090 o30_re = b*tmp30_re; 02091 o30_im = b*tmp30_im; 02092 o31_re = b*tmp31_re; 02093 o31_im = b*tmp31_im; 02094 o32_re = b*tmp32_re; 02095 o32_im = b*tmp32_im; 02096 #else 02097 o00_re = tmp00_re; 02098 o00_im = tmp00_im; 02099 o01_re = tmp01_re; 02100 o01_im = tmp01_im; 02101 o02_re = tmp02_re; 02102 o02_im = tmp02_im; 02103 o10_re = tmp10_re; 02104 o10_im = tmp10_im; 02105 o11_re = tmp11_re; 02106 o11_im = tmp11_im; 02107 o12_re = tmp12_re; 02108 o12_im = tmp12_im; 02109 o20_re = tmp20_re; 02110 o20_im = tmp20_im; 02111 o21_re = tmp21_re; 02112 o21_im = tmp21_im; 02113 o22_re = tmp22_re; 02114 o22_im = tmp22_im; 02115 o30_re = tmp30_re; 02116 o30_im = tmp30_im; 02117 o31_re = tmp31_re; 02118 o31_im = tmp31_im; 02119 o32_re = tmp32_re; 02120 o32_im = tmp32_im; 02121 #endif // DSLASH_XPAY 02122 02123 } 02124 #ifdef DSLASH_XPAY 02125 02126 READ_ACCUM(ACCUMTEX, sp_stride) 02127 02128 #ifdef SPINOR_DOUBLE 02129 o00_re = b*o00_re + accum0.x; 02130 o00_im = b*o00_im + accum0.y; 02131 o01_re = b*o01_re + accum1.x; 02132 o01_im = b*o01_im + accum1.y; 02133 o02_re = b*o02_re + accum2.x; 02134 o02_im = b*o02_im + accum2.y; 02135 o10_re = b*o10_re + accum3.x; 02136 o10_im = b*o10_im + accum3.y; 02137 o11_re = b*o11_re + accum4.x; 02138 o11_im = b*o11_im + accum4.y; 02139 o12_re = b*o12_re + accum5.x; 02140 o12_im = b*o12_im + accum5.y; 02141 o20_re = b*o20_re + accum6.x; 02142 o20_im = b*o20_im + accum6.y; 02143 o21_re = b*o21_re + accum7.x; 02144 
o21_im = b*o21_im + accum7.y; 02145 o22_re = b*o22_re + accum8.x; 02146 o22_im = b*o22_im + accum8.y; 02147 o30_re = b*o30_re + accum9.x; 02148 o30_im = b*o30_im + accum9.y; 02149 o31_re = b*o31_re + accum10.x; 02150 o31_im = b*o31_im + accum10.y; 02151 o32_re = b*o32_re + accum11.x; 02152 o32_im = b*o32_im + accum11.y; 02153 #else 02154 o00_re = b*o00_re + accum0.x; 02155 o00_im = b*o00_im + accum0.y; 02156 o01_re = b*o01_re + accum0.z; 02157 o01_im = b*o01_im + accum0.w; 02158 o02_re = b*o02_re + accum1.x; 02159 o02_im = b*o02_im + accum1.y; 02160 o10_re = b*o10_re + accum1.z; 02161 o10_im = b*o10_im + accum1.w; 02162 o11_re = b*o11_re + accum2.x; 02163 o11_im = b*o11_im + accum2.y; 02164 o12_re = b*o12_re + accum2.z; 02165 o12_im = b*o12_im + accum2.w; 02166 o20_re = b*o20_re + accum3.x; 02167 o20_im = b*o20_im + accum3.y; 02168 o21_re = b*o21_re + accum3.z; 02169 o21_im = b*o21_im + accum3.w; 02170 o22_re = b*o22_re + accum4.x; 02171 o22_im = b*o22_im + accum4.y; 02172 o30_re = b*o30_re + accum4.z; 02173 o30_im = b*o30_im + accum4.w; 02174 o31_re = b*o31_re + accum5.x; 02175 o31_im = b*o31_im + accum5.y; 02176 o32_re = b*o32_re + accum5.z; 02177 o32_im = b*o32_im + accum5.w; 02178 #endif // SPINOR_DOUBLE 02179 02180 #endif // DSLASH_XPAY 02181 } 02182 02183 // write spinor field back to device memory 02184 WRITE_SPINOR(sp_stride); 02185 02186 // undefine to prevent warning when precision is changed 02187 #undef spinorFloat 02188 #undef WRITE_SPINOR_SHARED 02189 #undef READ_SPINOR_SHARED 02190 #undef SHARED_STRIDE 02191 02192 #undef A_re 02193 #undef A_im 02194 02195 #undef g00_re 02196 #undef g00_im 02197 #undef g01_re 02198 #undef g01_im 02199 #undef g02_re 02200 #undef g02_im 02201 #undef g10_re 02202 #undef g10_im 02203 #undef g11_re 02204 #undef g11_im 02205 #undef g12_re 02206 #undef g12_im 02207 #undef g20_re 02208 #undef g20_im 02209 #undef g21_re 02210 #undef g21_im 02211 #undef g22_re 02212 #undef g22_im 02213 02214 #undef i00_re 02215 #undef i00_im 
02216 #undef i01_re 02217 #undef i01_im 02218 #undef i02_re 02219 #undef i02_im 02220 #undef i10_re 02221 #undef i10_im 02222 #undef i11_re 02223 #undef i11_im 02224 #undef i12_re 02225 #undef i12_im 02226 #undef i20_re 02227 #undef i20_im 02228 #undef i21_re 02229 #undef i21_im 02230 #undef i22_re 02231 #undef i22_im 02232 #undef i30_re 02233 #undef i30_im 02234 #undef i31_re 02235 #undef i31_im 02236 #undef i32_re 02237 #undef i32_im 02238 02239 02240 #undef o00_re 02241 #undef o00_im 02242 #undef o01_re 02243 #undef o01_im 02244 #undef o02_re 02245 #undef o02_im 02246 #undef o10_re 02247 #undef o10_im 02248 #undef o11_re 02249 #undef o11_im 02250 #undef o12_re 02251 #undef o12_im 02252 #undef o20_re 02253 #undef o20_im 02254 #undef o21_re 02255 #undef o21_im 02256 #undef o22_re 02257 #undef o22_im 02258 #undef o30_re 02259 #undef o30_im 02260 #undef o31_re 02261 #undef o31_im 02262 #undef o32_re 02263 #undef o32_im 02264 02265 #undef VOLATILE