QUDA v0.4.0
A library for QCD on GPUs
|
00001 //J dslash_dagger_dwf_core.h 00002 //J Ver. 09.10.a 00003 00004 // goto HERE to continue checking 00005 00006 00007 //J Q. Where do the diagonal components 00008 //J += (m0-5) psi(x,s) 00009 //J get performed? Not in this hopping 00010 //J file. Here, m0 is the dwf barrier 00011 //J height, related to Andrew P.'s documentation mdwf.pdf 00012 //J by m0= -M5. 00013 //J A. They get carried out using the xpay 00014 //J operations in dslash_dwf_cuda.cu. 00015 //J These are defined in the dslash_dwf_post.h that is 00016 //J included at the end of this file. 00017 // 00018 00019 //J Carry out the 4d operations with this include. 00020 // It does not undefine things. That comes 00021 // at the end of this file, through another include. 00022 //#include "dslash_dagger_core_ante.h" 00023 00024 // *** CUDA DSLASH DAGGER *** 00025 00026 //#define SHARED_FLOATS_PER_THREAD 0 // FIXME 00027 #define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double)) 00028 00029 #define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float)) 00030 00031 // input spinor 00032 #ifdef SPINOR_DOUBLE 00033 #define spinorFloat double 00034 #define i00_re I0.x 00035 #define i00_im I0.y 00036 #define i01_re I1.x 00037 #define i01_im I1.y 00038 #define i02_re I2.x 00039 #define i02_im I2.y 00040 #define i10_re I3.x 00041 #define i10_im I3.y 00042 #define i11_re I4.x 00043 #define i11_im I4.y 00044 #define i12_re I5.x 00045 #define i12_im I5.y 00046 #define i20_re I6.x 00047 #define i20_im I6.y 00048 #define i21_re I7.x 00049 #define i21_im I7.y 00050 #define i22_re I8.x 00051 #define i22_im I8.y 00052 #define i30_re I9.x 00053 #define i30_im I9.y 00054 #define i31_re I10.x 00055 #define i31_im I10.y 00056 #define i32_re I11.x 00057 #define i32_im I11.y 00058 00059 #else 00060 #define spinorFloat float 00061 #define i00_re I0.x 00062 #define i00_im I0.y 00063 #define i01_re I0.z 00064 #define i01_im I0.w 00065 #define i02_re I1.x 00066 #define i02_im I1.y 00067 #define i10_re I1.z 00068 #define i10_im I1.w 00069 #define i11_re I2.x 00070 #define i11_im I2.y 00071 #define i12_re I2.z 00072 #define i12_im I2.w 00073 #define i20_re I3.x 00074 #define i20_im I3.y 00075 #define i21_re I3.z 00076 #define i21_im I3.w 00077 #define i22_re I4.x 00078 #define i22_im I4.y 00079 #define i30_re I4.z 00080 #define i30_im I4.w 00081 #define i31_re I5.x 00082 #define i31_im I5.y 00083 #define i32_re I5.z 00084 #define i32_im I5.w 00085 #endif 00086 00087 // gauge link 00088 #ifdef GAUGE_FLOAT2 00089 #define g00_re G0.x 00090 #define g00_im G0.y 00091 #define g01_re G1.x 00092 #define g01_im G1.y 00093 #define g02_re G2.x 00094 #define g02_im G2.y 00095 #define g10_re G3.x 00096 #define g10_im G3.y 00097 #define g11_re G4.x 00098 #define g11_im G4.y 00099 #define g12_re G5.x 00100 #define g12_im G5.y 00101 #define g20_re G6.x 00102 #define g20_im G6.y 00103 #define g21_re G7.x 00104 #define g21_im G7.y 00105 #define g22_re G8.x 00106 #define g22_im G8.y 00107 // temporaries 00108 #define A_re G9.x 00109 #define A_im G9.y 00110 00111 #else 00112 #define g00_re G0.x 00113 #define g00_im G0.y 00114 #define g01_re G0.z 00115 #define g01_im G0.w 00116 #define g02_re G1.x 00117 #define g02_im G1.y 00118 #define g10_re G1.z 00119 #define g10_im G1.w 00120 #define g11_re G2.x 00121 #define g11_im G2.y 00122 #define g12_re G2.z 00123 #define g12_im G2.w 00124 #define g20_re G3.x 00125 #define g20_im G3.y 00126 #define g21_re G3.z 00127 #define g21_im G3.w 00128 #define g22_re G4.x 00129 #define g22_im G4.y 00130 // temporaries 00131 #define A_re G4.z 00132 #define A_im G4.w 00133 00134 #endif 00135 00136 // conjugated gauge link 00137 #define gT00_re (+g00_re) 00138 #define gT00_im (-g00_im) 00139 #define gT01_re (+g10_re) 00140 #define gT01_im (-g10_im) 00141 #define gT02_re (+g20_re) 00142 #define gT02_im (-g20_im) 00143 #define gT10_re (+g01_re) 00144 #define gT10_im (-g01_im) 00145 #define gT11_re (+g11_re) 00146 #define gT11_im (-g11_im) 00147 #define gT12_re (+g21_re) 00148 #define gT12_im (-g21_im) 00149 #define gT20_re (+g02_re) 00150 #define gT20_im (-g02_im) 00151 #define gT21_re (+g12_re) 00152 #define gT21_im (-g12_im) 00153 #define gT22_re (+g22_re) 00154 #define gT22_im (-g22_im) 00155 00156 00157 // output spinor 00158 volatile spinorFloat o00_re; 00159 volatile spinorFloat o00_im; 00160 volatile spinorFloat o01_re; 00161 volatile spinorFloat o01_im; 00162 volatile spinorFloat o02_re; 00163 volatile spinorFloat o02_im; 00164 volatile spinorFloat o10_re; 00165 volatile spinorFloat o10_im; 00166 volatile spinorFloat o11_re; 00167 volatile spinorFloat o11_im; 00168 volatile spinorFloat o12_re; 00169 volatile spinorFloat o12_im; 00170 volatile spinorFloat o20_re; 00171 volatile spinorFloat o20_im; 00172 volatile spinorFloat o21_re; 00173 volatile spinorFloat o21_im; 00174 volatile spinorFloat o22_re; 00175 volatile spinorFloat o22_im; 00176 volatile spinorFloat o30_re; 00177 volatile spinorFloat o30_im; 00178 volatile spinorFloat o31_re; 00179 volatile spinorFloat o31_im; 00180 volatile spinorFloat o32_re; 00181 volatile spinorFloat o32_im; 00182 00183 00184 00185 #include "read_gauge.h" 00186 //#include "read_clover.h" 00187 #include "io_spinor.h" 00188 00189 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00190 if (sid >= param.threads) return; 00191 int boundaryCrossings = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h); 00192 int boundaryCrossings4d = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h); 00193 int X = 2*sid + (boundaryCrossings + param.parity) % 2; 00194 int xs = X/(X4*X3*X2*X1); 00195 int x4 = (X/(X3*X2*X1)) % X4; 00196 int x3 = (X/(X2*X1)) % X3; 00197 int x2 = (X/X1) % X2; 00198 int x1 = X % X1; 00199 00200 o00_re = o00_im = 0; 00201 o01_re = o01_im = 0; 00202 o02_re = o02_im = 0; 00203 o10_re = o10_im = 0; 00204 o11_re = o11_im = 0; 00205 o12_re = o12_im = 0; 00206 o20_re = o20_im = 0; 00207 o21_re = o21_im = 0; 00208 o22_re = o22_im = 0; 00209 o30_re = o30_im = 0; 00210 o31_re = o31_im = 0; 00211 o32_re = o32_im = 0; 00212 00213 { 00214 // Projector P0+ 00215 // 1 0 0 i 00216 // 0 1 i 0 00217 // 0 -i 1 0 00218 // -i 0 0 1 00219 00220 int sp_idx = ((x1==X1-1) ? X-(X1-1) : X+1) / 2; 00221 int ga_idx = sid % Vh; 00222 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00223 // read gauge matrix from device memory 00224 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00225 00226 // read spinor from device memory 00227 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00228 00229 // reconstruct gauge matrix 00230 RECONSTRUCT_GAUGE_MATRIX(0); 00231 00232 // project spinor into half spinors 00233 spinorFloat a0_re = +i00_re-i30_im; 00234 spinorFloat a0_im = +i00_im+i30_re; 00235 spinorFloat a1_re = +i01_re-i31_im; 00236 spinorFloat a1_im = +i01_im+i31_re; 00237 spinorFloat a2_re = +i02_re-i32_im; 00238 spinorFloat a2_im = +i02_im+i32_re; 00239 00240 spinorFloat b0_re = +i10_re-i20_im; 00241 spinorFloat b0_im = +i10_im+i20_re; 00242 spinorFloat b1_re = +i11_re-i21_im; 00243 spinorFloat b1_im = +i11_im+i21_re; 00244 spinorFloat b2_re = +i12_re-i22_im; 00245 spinorFloat b2_im = +i12_im+i22_re; 00246 00247 // multiply row 0 00248 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00249 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00250 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00251 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00252 00253 // multiply row 1 00254 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00255 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00256 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00257 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00258 00259 // multiply row 2 00260 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00261 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00262 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00263 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00264 00265 o00_re += A0_re; 00266 o00_im += A0_im; 00267 o10_re += B0_re; 00268 o10_im += B0_im; 00269 o20_re += B0_im; 00270 o20_im -= B0_re; 00271 o30_re += A0_im; 00272 o30_im -= A0_re; 00273 00274 o01_re += A1_re; 00275 o01_im += A1_im; 00276 o11_re += B1_re; 00277 o11_im += B1_im; 00278 o21_re += B1_im; 00279 o21_im -= B1_re; 00280 o31_re += A1_im; 00281 o31_im -= A1_re; 00282 00283 o02_re += A2_re; 00284 o02_im += A2_im; 00285 o12_re += B2_re; 00286 o12_im += B2_im; 00287 o22_re += B2_im; 00288 o22_im -= B2_re; 00289 o32_re += A2_im; 00290 o32_im -= A2_re; 00291 00292 } else { 00293 00294 // read gauge matrix from device memory 00295 READ_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, ga_stride); 00296 00297 // read spinor from device memory 00298 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00299 00300 // reconstruct gauge matrix 00301 RECONSTRUCT_GAUGE_MATRIX(0); 00302 00303 // project spinor into half spinors 00304 spinorFloat a0_re = +i00_re-i30_im; 00305 spinorFloat a0_im = +i00_im+i30_re; 00306 spinorFloat a1_re = +i01_re-i31_im; 00307 spinorFloat a1_im = +i01_im+i31_re; 00308 spinorFloat a2_re = +i02_re-i32_im; 00309 spinorFloat a2_im = +i02_im+i32_re; 00310 00311 spinorFloat b0_re = +i10_re-i20_im; 00312 spinorFloat b0_im = +i10_im+i20_re; 00313 spinorFloat b1_re = +i11_re-i21_im; 00314 spinorFloat b1_im = +i11_im+i21_re; 00315 spinorFloat b2_re = +i12_re-i22_im; 00316 spinorFloat b2_im = +i12_im+i22_re; 00317 00318 // multiply row 0 00319 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00320 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00321 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00322 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00323 00324 // multiply row 1 00325 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00326 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00327 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00328 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00329 00330 // multiply row 2 00331 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00332 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00333 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00334 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00335 00336 o00_re += A0_re; 00337 o00_im += A0_im; 00338 o10_re += B0_re; 00339 o10_im += B0_im; 00340 o20_re += B0_im; 00341 o20_im -= B0_re; 00342 o30_re += A0_im; 00343 o30_im -= A0_re; 00344 00345 o01_re += A1_re; 00346 o01_im += A1_im; 00347 o11_re += B1_re; 00348 o11_im += B1_im; 00349 o21_re += B1_im; 00350 o21_im -= B1_re; 00351 o31_re += A1_im; 00352 o31_im -= A1_re; 00353 00354 o02_re += A2_re; 00355 o02_im += A2_im; 00356 o12_re += B2_re; 00357 o12_im += B2_im; 00358 o22_re += B2_im; 00359 o22_im -= B2_re; 00360 o32_re += A2_im; 00361 o32_im -= A2_re; 00362 } 00363 } 00364 00365 { 00366 // Projector P0- 00367 // 1 0 0 -i 00368 // 0 1 -i 0 00369 // 0 i 1 0 00370 // i 0 0 1 00371 00372 int sp_idx = ((x1==0) ? X+(X1-1) : X-1) / 2; 00373 int ga_idx = sp_idx % Vh; 00374 00375 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00376 // read gauge matrix from device memory 00377 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00378 00379 // read spinor from device memory 00380 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00381 00382 // reconstruct gauge matrix 00383 RECONSTRUCT_GAUGE_MATRIX(1); 00384 00385 // project spinor into half spinors 00386 spinorFloat a0_re = +i00_re+i30_im; 00387 spinorFloat a0_im = +i00_im-i30_re; 00388 spinorFloat a1_re = +i01_re+i31_im; 00389 spinorFloat a1_im = +i01_im-i31_re; 00390 spinorFloat a2_re = +i02_re+i32_im; 00391 spinorFloat a2_im = +i02_im-i32_re; 00392 00393 spinorFloat b0_re = +i10_re+i20_im; 00394 spinorFloat b0_im = +i10_im-i20_re; 00395 spinorFloat b1_re = +i11_re+i21_im; 00396 spinorFloat b1_im = +i11_im-i21_re; 00397 spinorFloat b2_re = +i12_re+i22_im; 00398 spinorFloat b2_im = +i12_im-i22_re; 00399 00400 // multiply row 0 00401 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00402 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00403 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00404 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00405 00406 // multiply row 1 00407 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00408 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00409 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00410 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00411 00412 // multiply row 2 00413 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00414 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00415 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00416 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00417 00418 o00_re += A0_re; 00419 o00_im += A0_im; 00420 o10_re += B0_re; 00421 o10_im += B0_im; 00422 o20_re -= B0_im; 00423 o20_im += B0_re; 00424 o30_re -= A0_im; 00425 o30_im += A0_re; 00426 00427 o01_re += A1_re; 00428 o01_im += A1_im; 00429 o11_re += B1_re; 00430 o11_im += B1_im; 00431 o21_re -= B1_im; 00432 o21_im += B1_re; 00433 o31_re -= A1_im; 00434 o31_im += A1_re; 00435 00436 o02_re += A2_re; 00437 o02_im += A2_im; 00438 o12_re += B2_re; 00439 o12_im += B2_im; 00440 o22_re -= B2_im; 00441 o22_im += B2_re; 00442 o32_re -= A2_im; 00443 o32_im += A2_re; 00444 } else { 00445 // read gauge matrix from device memory 00446 READ_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, ga_stride); 00447 00448 // read spinor from device memory 00449 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00450 00451 // reconstruct gauge matrix 00452 RECONSTRUCT_GAUGE_MATRIX(1); 00453 00454 // project spinor into half spinors 00455 spinorFloat a0_re = +i00_re+i30_im; 00456 spinorFloat a0_im = +i00_im-i30_re; 00457 spinorFloat a1_re = +i01_re+i31_im; 00458 spinorFloat a1_im = +i01_im-i31_re; 00459 spinorFloat a2_re = +i02_re+i32_im; 00460 spinorFloat a2_im = +i02_im-i32_re; 00461 00462 spinorFloat b0_re = +i10_re+i20_im; 00463 spinorFloat b0_im = +i10_im-i20_re; 00464 spinorFloat b1_re = +i11_re+i21_im; 00465 spinorFloat b1_im = +i11_im-i21_re; 00466 spinorFloat b2_re = +i12_re+i22_im; 00467 spinorFloat b2_im = +i12_im-i22_re; 00468 00469 // multiply row 0 00470 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00471 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00472 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00473 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00474 00475 // multiply row 1 00476 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00477 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00478 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00479 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00480 00481 // multiply row 2 00482 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00483 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00484 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00485 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00486 00487 o00_re += A0_re; 00488 o00_im += A0_im; 00489 o10_re += B0_re; 00490 o10_im += B0_im; 00491 o20_re -= B0_im; 00492 o20_im += B0_re; 00493 o30_re -= A0_im; 00494 o30_im += A0_re; 00495 00496 o01_re += A1_re; 00497 o01_im += A1_im; 00498 o11_re += B1_re; 00499 o11_im += B1_im; 00500 o21_re -= B1_im; 00501 o21_im += B1_re; 00502 o31_re -= A1_im; 00503 o31_im += A1_re; 00504 00505 o02_re += A2_re; 00506 o02_im += A2_im; 00507 o12_re += B2_re; 00508 o12_im += B2_im; 00509 o22_re -= B2_im; 00510 o22_im += B2_re; 00511 o32_re -= A2_im; 00512 o32_im += A2_re; 00513 } 00514 } 00515 00516 { 00517 // Projector P1+ 00518 // 1 0 0 1 00519 // 0 1 -1 0 00520 // 0 -1 1 0 00521 // 1 0 0 1 00522 00523 int sp_idx = ((x2==X2-1) ? X-(X2-1)*X1 : X+X1) / 2; 00524 int ga_idx = sid % Vh; 00525 00526 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00527 00528 // read gauge matrix from device memory 00529 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00530 00531 // read spinor from device memory 00532 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00533 00534 // reconstruct gauge matrix 00535 RECONSTRUCT_GAUGE_MATRIX(2); 00536 00537 // project spinor into half spinors 00538 spinorFloat a0_re = +i00_re+i30_re; 00539 spinorFloat a0_im = +i00_im+i30_im; 00540 spinorFloat a1_re = +i01_re+i31_re; 00541 spinorFloat a1_im = +i01_im+i31_im; 00542 spinorFloat a2_re = +i02_re+i32_re; 00543 spinorFloat a2_im = +i02_im+i32_im; 00544 00545 spinorFloat b0_re = +i10_re-i20_re; 00546 spinorFloat b0_im = +i10_im-i20_im; 00547 spinorFloat b1_re = +i11_re-i21_re; 00548 spinorFloat b1_im = +i11_im-i21_im; 00549 spinorFloat b2_re = +i12_re-i22_re; 00550 spinorFloat b2_im = +i12_im-i22_im; 00551 00552 // multiply row 0 00553 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00554 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00555 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00556 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00557 00558 // multiply row 1 00559 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00560 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00561 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00562 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00563 00564 // multiply row 2 00565 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00566 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00567 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00568 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00569 00570 o00_re += A0_re; 00571 o00_im += A0_im; 00572 o10_re += B0_re; 00573 o10_im += B0_im; 00574 o20_re -= B0_re; 00575 o20_im -= B0_im; 00576 o30_re += A0_re; 00577 o30_im += A0_im; 00578 00579 o01_re += A1_re; 00580 o01_im += A1_im; 00581 o11_re += B1_re; 00582 o11_im += B1_im; 00583 o21_re -= B1_re; 00584 o21_im -= B1_im; 00585 o31_re += A1_re; 00586 o31_im += A1_im; 00587 00588 o02_re += A2_re; 00589 o02_im += A2_im; 00590 o12_re += B2_re; 00591 o12_im += B2_im; 00592 o22_re -= B2_re; 00593 o22_im -= B2_im; 00594 o32_re += A2_re; 00595 o32_im += A2_im; 00596 00597 } else { 00598 00599 // read gauge matrix from device memory 00600 READ_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, ga_stride); 00601 00602 // read spinor from device memory 00603 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00604 00605 // reconstruct gauge matrix 00606 RECONSTRUCT_GAUGE_MATRIX(2); 00607 00608 // project spinor into half spinors 00609 spinorFloat a0_re = +i00_re+i30_re; 00610 spinorFloat a0_im = +i00_im+i30_im; 00611 spinorFloat a1_re = +i01_re+i31_re; 00612 spinorFloat a1_im = +i01_im+i31_im; 00613 spinorFloat a2_re = +i02_re+i32_re; 00614 spinorFloat a2_im = +i02_im+i32_im; 00615 00616 spinorFloat b0_re = +i10_re-i20_re; 00617 spinorFloat b0_im = +i10_im-i20_im; 00618 spinorFloat b1_re = +i11_re-i21_re; 00619 spinorFloat b1_im = +i11_im-i21_im; 00620 spinorFloat b2_re = +i12_re-i22_re; 00621 spinorFloat b2_im = +i12_im-i22_im; 00622 00623 // multiply row 0 00624 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00625 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00626 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00627 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00628 00629 // multiply row 1 00630 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00631 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00632 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00633 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00634 00635 // multiply row 2 00636 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00637 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00638 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00639 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00640 00641 o00_re += A0_re; 00642 o00_im += A0_im; 00643 o10_re += B0_re; 00644 o10_im += B0_im; 00645 o20_re -= B0_re; 00646 o20_im -= B0_im; 00647 o30_re += A0_re; 00648 o30_im += A0_im; 00649 00650 o01_re += A1_re; 00651 o01_im += A1_im; 00652 o11_re += B1_re; 00653 o11_im += B1_im; 00654 o21_re -= B1_re; 00655 o21_im -= B1_im; 00656 o31_re += A1_re; 00657 o31_im += A1_im; 00658 00659 o02_re += A2_re; 00660 o02_im += A2_im; 00661 o12_re += B2_re; 00662 o12_im += B2_im; 00663 o22_re -= B2_re; 00664 o22_im -= B2_im; 00665 o32_re += A2_re; 00666 o32_im += A2_im; 00667 00668 } 00669 } 00670 00671 { 00672 // Projector P1- 00673 // 1 0 0 -1 00674 // 0 1 1 0 00675 // 0 1 1 0 00676 // -1 0 0 1 00677 00678 int sp_idx = ((x2==0) ? X+(X2-1)*X1 : X-X1) / 2; 00679 int ga_idx = sp_idx % Vh; 00680 00681 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00682 00683 // read gauge matrix from device memory 00684 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 00685 00686 // read spinor from device memory 00687 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00688 00689 // reconstruct gauge matrix 00690 RECONSTRUCT_GAUGE_MATRIX(3); 00691 00692 // project spinor into half spinors 00693 spinorFloat a0_re = +i00_re-i30_re; 00694 spinorFloat a0_im = +i00_im-i30_im; 00695 spinorFloat a1_re = +i01_re-i31_re; 00696 spinorFloat a1_im = +i01_im-i31_im; 00697 spinorFloat a2_re = +i02_re-i32_re; 00698 spinorFloat a2_im = +i02_im-i32_im; 00699 00700 spinorFloat b0_re = +i10_re+i20_re; 00701 spinorFloat b0_im = +i10_im+i20_im; 00702 spinorFloat b1_re = +i11_re+i21_re; 00703 spinorFloat b1_im = +i11_im+i21_im; 00704 spinorFloat b2_re = +i12_re+i22_re; 00705 spinorFloat b2_im = +i12_im+i22_im; 00706 00707 // multiply row 0 00708 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00709 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00710 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00711 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00712 00713 // multiply row 1 00714 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00715 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00716 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00717 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00718 00719 // multiply row 2 00720 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00721 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00722 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00723 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00724 00725 o00_re += A0_re; 00726 o00_im += A0_im; 00727 o10_re += B0_re; 00728 o10_im += B0_im; 00729 o20_re += B0_re; 00730 o20_im += B0_im; 00731 o30_re -= A0_re; 00732 o30_im -= A0_im; 00733 00734 o01_re += A1_re; 00735 o01_im += A1_im; 00736 o11_re += B1_re; 00737 o11_im += B1_im; 00738 o21_re += B1_re; 00739 o21_im += B1_im; 00740 o31_re -= A1_re; 00741 o31_im -= A1_im; 00742 00743 o02_re += A2_re; 00744 o02_im += A2_im; 00745 o12_re += B2_re; 00746 o12_im += B2_im; 00747 o22_re += B2_re; 00748 o22_im += B2_im; 00749 o32_re -= A2_re; 00750 o32_im -= A2_im; 00751 } else { 00752 // read gauge matrix from device memory 00753 READ_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, ga_stride); 00754 00755 // read spinor from device memory 00756 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00757 00758 // reconstruct gauge matrix 00759 RECONSTRUCT_GAUGE_MATRIX(3); 00760 00761 // project spinor into half spinors 00762 spinorFloat a0_re = +i00_re-i30_re; 00763 spinorFloat a0_im = +i00_im-i30_im; 00764 spinorFloat a1_re = +i01_re-i31_re; 00765 spinorFloat a1_im = +i01_im-i31_im; 00766 spinorFloat a2_re = +i02_re-i32_re; 00767 spinorFloat a2_im = +i02_im-i32_im; 00768 00769 spinorFloat b0_re = +i10_re+i20_re; 00770 spinorFloat b0_im = +i10_im+i20_im; 00771 spinorFloat b1_re = +i11_re+i21_re; 00772 spinorFloat b1_im = +i11_im+i21_im; 00773 spinorFloat b2_re = +i12_re+i22_re; 00774 spinorFloat b2_im = +i12_im+i22_im; 00775 00776 // multiply row 0 00777 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00778 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00779 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00780 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00781 00782 // multiply row 1 00783 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00784 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00785 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00786 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00787 00788 // multiply row 2 00789 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00790 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00791 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00792 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00793 00794 o00_re += A0_re; 00795 o00_im += A0_im; 00796 o10_re += B0_re; 00797 o10_im += B0_im; 00798 o20_re += B0_re; 00799 o20_im += B0_im; 00800 o30_re -= A0_re; 00801 o30_im -= A0_im; 00802 00803 o01_re += A1_re; 00804 o01_im += A1_im; 00805 o11_re += B1_re; 00806 o11_im += B1_im; 00807 o21_re += B1_re; 00808 o21_im += B1_im; 00809 o31_re -= A1_re; 00810 o31_im -= A1_im; 00811 00812 o02_re += A2_re; 00813 o02_im += A2_im; 00814 o12_re += B2_re; 00815 o12_im += B2_im; 00816 o22_re += B2_re; 00817 o22_im += B2_im; 00818 o32_re -= A2_re; 00819 o32_im -= A2_im; 00820 } 00821 } 00822 00823 { 00824 // Projector P2+ 00825 // 1 0 i 0 00826 // 0 1 0 -i 00827 // -i 0 1 0 00828 // 0 i 0 1 00829 00830 int sp_idx = ((x3==X3-1) ? X-(X3-1)*X2*X1 : X+X2*X1) / 2; 00831 int ga_idx = sid % Vh; 00832 00833 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00834 00835 // read gauge matrix from device memory 00836 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 00837 00838 // read spinor from device memory 00839 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00840 00841 // reconstruct gauge matrix 00842 RECONSTRUCT_GAUGE_MATRIX(4); 00843 00844 // project spinor into half spinors 00845 spinorFloat a0_re = +i00_re-i20_im; 00846 spinorFloat a0_im = +i00_im+i20_re; 00847 spinorFloat a1_re = +i01_re-i21_im; 00848 spinorFloat a1_im = +i01_im+i21_re; 00849 spinorFloat a2_re = +i02_re-i22_im; 00850 spinorFloat a2_im = +i02_im+i22_re; 00851 00852 spinorFloat b0_re = +i10_re+i30_im; 00853 spinorFloat b0_im = +i10_im-i30_re; 00854 spinorFloat b1_re = +i11_re+i31_im; 00855 spinorFloat b1_im = +i11_im-i31_re; 00856 spinorFloat b2_re = +i12_re+i32_im; 00857 spinorFloat b2_im = +i12_im-i32_re; 00858 00859 // multiply row 0 00860 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00861 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00862 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00863 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00864 00865 // multiply row 1 00866 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00867 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00868 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00869 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00870 00871 // multiply row 2 00872 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00873 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00874 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00875 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00876 00877 o00_re += A0_re; 00878 o00_im += A0_im; 00879 o10_re += B0_re; 00880 o10_im += B0_im; 00881 o20_re += A0_im; 00882 o20_im -= A0_re; 00883 o30_re -= B0_im; 00884 o30_im += B0_re; 00885 00886 o01_re += A1_re; 00887 o01_im += A1_im; 00888 o11_re += B1_re; 00889 o11_im += B1_im; 00890 o21_re += A1_im; 00891 o21_im -= A1_re; 00892 o31_re -= B1_im; 00893 o31_im += B1_re; 00894 00895 o02_re += A2_re; 00896 o02_im += A2_im; 00897 o12_re += B2_re; 00898 o12_im += B2_im; 00899 o22_re += A2_im; 00900 o22_im -= A2_re; 00901 o32_re -= B2_im; 00902 o32_im += B2_re; 00903 00904 } else { 00905 00906 // read gauge matrix from device memory 00907 READ_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, ga_stride); 00908 00909 // read spinor from device memory 00910 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00911 00912 // reconstruct gauge matrix 00913 RECONSTRUCT_GAUGE_MATRIX(4); 00914 00915 // project spinor into half spinors 00916 spinorFloat a0_re = +i00_re-i20_im; 00917 spinorFloat a0_im = +i00_im+i20_re; 00918 spinorFloat a1_re = +i01_re-i21_im; 00919 spinorFloat a1_im = +i01_im+i21_re; 00920 spinorFloat a2_re = +i02_re-i22_im; 00921 spinorFloat a2_im = +i02_im+i22_re; 00922 00923 spinorFloat b0_re = +i10_re+i30_im; 00924 spinorFloat b0_im = +i10_im-i30_re; 00925 spinorFloat b1_re = +i11_re+i31_im; 00926 spinorFloat b1_im = +i11_im-i31_re; 00927 spinorFloat b2_re = +i12_re+i32_im; 00928 spinorFloat b2_im = +i12_im-i32_re; 00929 00930 // multiply row 0 00931 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00932 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00933 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00934 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00935 00936 // multiply row 1 00937 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00938 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00939 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00940 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00941 00942 // multiply row 2 00943 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00944 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00945 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00946 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00947 00948 o00_re += A0_re; 00949 o00_im += A0_im; 00950 o10_re += B0_re; 00951 o10_im += B0_im; 00952 o20_re += A0_im; 00953 o20_im -= A0_re; 00954 o30_re -= B0_im; 00955 o30_im += B0_re; 00956 00957 o01_re += A1_re; 00958 o01_im += A1_im; 00959 o11_re += B1_re; 00960 o11_im += B1_im; 00961 o21_re += A1_im; 00962 o21_im -= A1_re; 00963 o31_re -= B1_im; 00964 o31_im += B1_re; 00965 00966 o02_re += A2_re; 00967 o02_im += A2_im; 00968 o12_re += B2_re; 00969 o12_im += B2_im; 00970 o22_re += A2_im; 00971 o22_im -= A2_re; 00972 o32_re -= B2_im; 00973 o32_im += B2_re; 00974 } 00975 } 00976 00977 { 00978 // Projector P2- 00979 // 1 0 -i 0 00980 // 0 1 0 i 00981 // i 0 1 0 00982 // 0 -i 0 1 00983 00984 int sp_idx = ((x3==0) ? X+(X3-1)*X2*X1 : X-X2*X1) / 2; 00985 int ga_idx = sp_idx % Vh; 00986 00987 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00988 00989 // read gauge matrix from device memory 00990 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 00991 00992 // read spinor from device memory 00993 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00994 00995 // reconstruct gauge matrix 00996 RECONSTRUCT_GAUGE_MATRIX(5); 00997 00998 // project spinor into half spinors 00999 spinorFloat a0_re = +i00_re+i20_im; 01000 spinorFloat a0_im = +i00_im-i20_re; 01001 spinorFloat a1_re = +i01_re+i21_im; 01002 spinorFloat a1_im = +i01_im-i21_re; 01003 spinorFloat a2_re = +i02_re+i22_im; 01004 spinorFloat a2_im = +i02_im-i22_re; 01005 01006 spinorFloat b0_re = +i10_re-i30_im; 01007 spinorFloat b0_im = +i10_im+i30_re; 01008 spinorFloat b1_re = +i11_re-i31_im; 01009 spinorFloat b1_im = +i11_im+i31_re; 01010 spinorFloat b2_re = +i12_re-i32_im; 01011 spinorFloat b2_im = +i12_im+i32_re; 01012 01013 // multiply row 0 01014 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01015 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01016 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01017 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01018 01019 // multiply row 1 01020 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01021 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01022 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01023 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01024 01025 // multiply row 2 01026 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01027 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01028 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01029 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01030 01031 o00_re += A0_re; 01032 o00_im += A0_im; 01033 o10_re += B0_re; 01034 o10_im += B0_im; 01035 o20_re -= A0_im; 01036 o20_im += A0_re; 01037 o30_re += B0_im; 01038 o30_im -= B0_re; 01039 01040 o01_re += A1_re; 01041 o01_im += A1_im; 01042 o11_re += B1_re; 01043 o11_im += B1_im; 01044 o21_re -= A1_im; 01045 o21_im += A1_re; 01046 o31_re += B1_im; 01047 o31_im -= B1_re; 01048 01049 o02_re += A2_re; 01050 o02_im += A2_im; 01051 o12_re += B2_re; 01052 o12_im += B2_im; 01053 o22_re -= A2_im; 01054 o22_im += A2_re; 01055 o32_re += B2_im; 01056 o32_im -= B2_re; 01057 01058 } else { 01059 01060 // read gauge matrix from device memory 01061 READ_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, ga_stride); 01062 01063 // read spinor from device memory 01064 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01065 01066 // reconstruct gauge matrix 01067 RECONSTRUCT_GAUGE_MATRIX(5); 01068 01069 // project spinor into half spinors 01070 spinorFloat a0_re = +i00_re+i20_im; 01071 spinorFloat a0_im = +i00_im-i20_re; 01072 spinorFloat a1_re = +i01_re+i21_im; 01073 spinorFloat a1_im = +i01_im-i21_re; 01074 spinorFloat a2_re = +i02_re+i22_im; 01075 spinorFloat a2_im = +i02_im-i22_re; 01076 01077 spinorFloat b0_re = +i10_re-i30_im; 01078 spinorFloat b0_im = +i10_im+i30_re; 01079 spinorFloat b1_re = +i11_re-i31_im; 01080 spinorFloat b1_im = +i11_im+i31_re; 01081 spinorFloat b2_re = +i12_re-i32_im; 01082 spinorFloat b2_im = +i12_im+i32_re; 01083 01084 // multiply row 0 01085 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01086 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01087 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01088 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01089 01090 // multiply row 1 01091 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01092 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01093 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01094 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01095 01096 // multiply row 2 01097 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01098 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01099 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01100 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01101 01102 o00_re += A0_re; 01103 o00_im += A0_im; 01104 o10_re += B0_re; 01105 o10_im += B0_im; 01106 o20_re -= A0_im; 01107 o20_im += A0_re; 01108 o30_re += B0_im; 01109 o30_im -= B0_re; 01110 01111 o01_re += A1_re; 01112 o01_im += A1_im; 01113 o11_re += B1_re; 01114 o11_im += B1_im; 01115 o21_re -= A1_im; 01116 o21_im += A1_re; 01117 o31_re += B1_im; 01118 o31_im -= B1_re; 01119 01120 o02_re += A2_re; 01121 o02_im += A2_im; 01122 o12_re += B2_re; 01123 o12_im += B2_im; 01124 o22_re -= A2_im; 01125 o22_im += A2_re; 01126 o32_re += B2_im; 01127 o32_im -= B2_re; 01128 01129 } 01130 } 01131 01132 { 01133 // Projector P3+ 01134 // 2 0 0 0 01135 // 0 2 0 0 01136 // 0 0 0 0 01137 // 0 0 0 0 01138 01139 int sp_idx = ((x4==X4-1) ? X-(X4-1)*X3*X2*X1 : X+X3*X2*X1) / 2; 01140 int ga_idx = sid % Vh; 01141 01142 if (gauge_fixed && ga_idx < (X4-1)*X1h*X2*X3) { 01143 // read spinor from device memory 01144 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01145 01146 // project spinor into half spinors 01147 spinorFloat a0_re = +2*i00_re; 01148 spinorFloat a0_im = +2*i00_im; 01149 spinorFloat a1_re = +2*i01_re; 01150 spinorFloat a1_im = +2*i01_im; 01151 spinorFloat a2_re = +2*i02_re; 01152 spinorFloat a2_im = +2*i02_im; 01153 01154 spinorFloat b0_re = +2*i10_re; 01155 spinorFloat b0_im = +2*i10_im; 01156 spinorFloat b1_re = +2*i11_re; 01157 spinorFloat b1_im = +2*i11_im; 01158 spinorFloat b2_re = +2*i12_re; 01159 spinorFloat b2_im = +2*i12_im; 01160 01161 // identity gauge matrix 01162 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01163 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01164 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01165 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01166 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01167 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01168 01169 o00_re += A0_re; 01170 o00_im += A0_im; 01171 o10_re += B0_re; 01172 o10_im += B0_im; 01173 01174 o01_re += A1_re; 01175 o01_im += A1_im; 01176 o11_re += B1_re; 01177 o11_im += B1_im; 01178 01179 o02_re += A2_re; 01180 o02_im += A2_im; 01181 o12_re += B2_re; 01182 o12_im += B2_im; 01183 01184 } else { 01185 01186 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 01187 01188 // read gauge matrix from device memory 01189 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01190 01191 // read spinor from device memory 01192 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01193 01194 // reconstruct gauge matrix 01195 RECONSTRUCT_GAUGE_MATRIX(6); 01196 01197 // project spinor into half spinors 01198 spinorFloat a0_re = +2*i00_re; 01199 spinorFloat a0_im = +2*i00_im; 01200 spinorFloat a1_re = +2*i01_re; 01201 spinorFloat a1_im = +2*i01_im; 01202 spinorFloat a2_re = +2*i02_re; 01203 spinorFloat a2_im = +2*i02_im; 01204 01205 spinorFloat b0_re = +2*i10_re; 01206 spinorFloat b0_im = +2*i10_im; 01207 spinorFloat b1_re = +2*i11_re; 01208 spinorFloat b1_im = +2*i11_im; 01209 spinorFloat b2_re = +2*i12_re; 01210 spinorFloat b2_im = +2*i12_im; 01211 01212 // multiply row 0 01213 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 01214 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 01215 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 01216 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 01217 01218 // multiply row 1 01219 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 01220 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 01221 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 01222 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 01223 01224 // multiply row 2 01225 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 01226 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 01227 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 01228 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 01229 01230 o00_re += A0_re; 01231 o00_im += A0_im; 01232 o10_re += B0_re; 01233 o10_im += B0_im; 01234 01235 o01_re += A1_re; 01236 o01_im += A1_im; 01237 o11_re += B1_re; 01238 o11_im += B1_im; 01239 01240 o02_re += A2_re; 01241 o02_im += A2_im; 01242 o12_re += B2_re; 01243 o12_im += B2_im; 01244 } else { 01245 01246 // read gauge matrix from device memory 01247 READ_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, ga_stride); 01248 01249 // read spinor from device memory 01250 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01251 01252 // reconstruct gauge matrix 01253 RECONSTRUCT_GAUGE_MATRIX(6); 01254 01255 // project spinor into half spinors 01256 spinorFloat a0_re = +2*i00_re; 01257 spinorFloat a0_im = +2*i00_im; 01258 spinorFloat a1_re = +2*i01_re; 01259 spinorFloat a1_im = +2*i01_im; 01260 spinorFloat a2_re = +2*i02_re; 01261 spinorFloat a2_im = +2*i02_im; 01262 01263 spinorFloat b0_re = +2*i10_re; 01264 spinorFloat b0_im = +2*i10_im; 01265 spinorFloat b1_re = +2*i11_re; 01266 spinorFloat b1_im = +2*i11_im; 01267 spinorFloat b2_re = +2*i12_re; 01268 spinorFloat b2_im = +2*i12_im; 01269 01270 // multiply row 0 01271 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 01272 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 01273 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 01274 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 01275 01276 // multiply row 1 01277 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 01278 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 01279 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 01280 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 01281 01282 // multiply row 2 01283 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 01284 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 01285 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 01286 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 01287 01288 o00_re += A0_re; 01289 o00_im += A0_im; 01290 o10_re += B0_re; 01291 o10_im += B0_im; 01292 01293 o01_re += A1_re; 01294 o01_im += A1_im; 01295 o11_re += B1_re; 01296 o11_im += B1_im; 01297 01298 o02_re += A2_re; 01299 o02_im += A2_im; 01300 o12_re += B2_re; 01301 o12_im += B2_im; 01302 01303 } 01304 } 01305 } 01306 01307 { 01308 // Projector P3- 01309 // 0 0 0 0 01310 // 0 0 0 0 01311 // 0 0 2 0 01312 // 0 0 0 2 01313 01314 int sp_idx = ((x4==0) ? X+(X4-1)*X3*X2*X1 : X-X3*X2*X1) / 2; 01315 int ga_idx = sp_idx % Vh; 01316 01317 if (gauge_fixed && ga_idx < (X4-1)*X1h*X2*X3) { 01318 // read spinor from device memory 01319 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01320 01321 // project spinor into half spinors 01322 spinorFloat a0_re = +2*i20_re; 01323 spinorFloat a0_im = +2*i20_im; 01324 spinorFloat a1_re = +2*i21_re; 01325 spinorFloat a1_im = +2*i21_im; 01326 spinorFloat a2_re = +2*i22_re; 01327 spinorFloat a2_im = +2*i22_im; 01328 01329 spinorFloat b0_re = +2*i30_re; 01330 spinorFloat b0_im = +2*i30_im; 01331 spinorFloat b1_re = +2*i31_re; 01332 spinorFloat b1_im = +2*i31_im; 01333 spinorFloat b2_re = +2*i32_re; 01334 spinorFloat b2_im = +2*i32_im; 01335 01336 // identity gauge matrix 01337 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01338 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01339 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01340 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01341 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01342 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01343 01344 o20_re += A0_re; 01345 o20_im += A0_im; 01346 o30_re += B0_re; 01347 o30_im += B0_im; 01348 01349 o21_re += A1_re; 01350 o21_im += A1_im; 01351 o31_re += B1_re; 01352 o31_im += B1_im; 01353 01354 o22_re += A2_re; 01355 o22_im += A2_im; 01356 o32_re += B2_re; 01357 o32_im += B2_im; 01358 01359 } else { 01360 01361 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 01362 01363 // read gauge matrix from device memory 01364 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01365 01366 // read spinor from device memory 01367 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01368 01369 // reconstruct gauge matrix 01370 RECONSTRUCT_GAUGE_MATRIX(7); 01371 01372 // project spinor into half spinors 01373 spinorFloat a0_re = +2*i20_re; 01374 spinorFloat a0_im = +2*i20_im; 01375 spinorFloat a1_re = +2*i21_re; 01376 spinorFloat a1_im = +2*i21_im; 01377 spinorFloat a2_re = +2*i22_re; 01378 spinorFloat a2_im = +2*i22_im; 01379 01380 spinorFloat b0_re = +2*i30_re; 01381 spinorFloat b0_im = +2*i30_im; 01382 spinorFloat b1_re = +2*i31_re; 01383 spinorFloat b1_im = +2*i31_im; 01384 spinorFloat b2_re = +2*i32_re; 01385 spinorFloat b2_im = +2*i32_im; 01386 01387 // multiply row 0 01388 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01389 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01390 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01391 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01392 01393 // multiply row 1 01394 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01395 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01396 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01397 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01398 01399 // multiply row 2 01400 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01401 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01402 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01403 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01404 01405 o20_re += A0_re; 01406 o20_im += A0_im; 01407 o30_re += B0_re; 01408 o30_im += B0_im; 01409 01410 o21_re += A1_re; 01411 o21_im += A1_im; 01412 o31_re += B1_re; 01413 o31_im += B1_im; 01414 01415 o22_re += A2_re; 01416 o22_im += A2_im; 01417 o32_re += B2_re; 01418 o32_im += B2_im; 01419 01420 } else { 01421 01422 // read gauge matrix from device memory 01423 READ_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, ga_stride); 01424 01425 // read spinor from device memory 01426 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01427 01428 // reconstruct gauge matrix 01429 RECONSTRUCT_GAUGE_MATRIX(7); 01430 01431 // project spinor into half spinors 01432 spinorFloat a0_re = +2*i20_re; 01433 spinorFloat a0_im = +2*i20_im; 01434 spinorFloat a1_re = +2*i21_re; 01435 spinorFloat a1_im = +2*i21_im; 01436 spinorFloat a2_re = +2*i22_re; 01437 spinorFloat a2_im = +2*i22_im; 01438 01439 spinorFloat b0_re = +2*i30_re; 01440 spinorFloat b0_im = +2*i30_im; 01441 spinorFloat b1_re = +2*i31_re; 01442 spinorFloat b1_im = +2*i31_im; 01443 spinorFloat b2_re = +2*i32_re; 01444 spinorFloat b2_im = +2*i32_im; 01445 01446 // multiply row 0 01447 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01448 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01449 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01450 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01451 01452 // multiply row 1 01453 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01454 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01455 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01456 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01457 01458 // multiply row 2 01459 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01460 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01461 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01462 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01463 01464 o20_re += A0_re; 01465 o20_im += A0_im; 01466 o30_re += B0_re; 01467 o30_im += B0_im; 01468 01469 o21_re += A1_re; 01470 o21_im += A1_im; 01471 o31_re += B1_re; 01472 o31_im += B1_im; 01473 01474 o22_re += A2_re; 01475 o22_im += A2_im; 01476 o32_re += B2_re; 01477 o32_im += B2_im; 01478 01479 } 01480 } 01481 } 01482 01483 01484 01485 01486 //J ---------------------------------- 01487 //J --- DWF code for 5th dimension --- 01488 //J ---------------------------------- 01489 // 01490 //J Begin scope. 01491 { 01492 //J TODO Insert/check handler for s-direction here. 01493 01494 //J Decided to not change to chiral basis. Then: 01495 // 2 P_+ = 2 P_R = 1 1 01496 // 1 1 01497 // --- Begin right-handed spinor projection. --- 01498 { 01499 //J We are right-handed, so for the dslash_dagger we hop backwards. If we are at 01500 //J boundary in s-direction, special 01501 //J things will need to be done. xs is defined in dslash_dagger_core_ante.h. 01502 //J See near Line 328. N is the 4d volume; cf. quda.h. 01503 //J Cf. hand-written notes 8/6/09 for check of logic. 01504 //J The logic sets xs to the s-coordinate of the output 01505 //J spinor, which is accumulated by this thread. 01506 //J I.e., it uses the thread index to determine xs. 01507 int sp_idx = ((xs==0) ? X+(Ls-1)*2*Vh : X-2*Vh) / 2; 01508 // --- Read spinor from device memory. --- 01509 //J Q. How does it know which direction to hop in? 01510 //J A. It uses sp_idx as the origin and picks up 0*Vh_5d ... 5*Vh_5d 01511 //J offsets in the READ_SPINOR_UP that is below. 01512 //J This has to do with the "concurrency" optimization. 01513 //J Q. Where does Vh_5d get set and does it know about the dwf 01514 //J modification? Does it care? 01515 //J 01516 // 01517 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01518 01519 if (xs != 0) { 01520 //J OK, now the input spinor should be at: 01521 //J 0 < s <= Ls-1 01522 // 01523 //J Project spinor into half spinors, i.e., this is the term 01524 //J " + 2 P_R psi(s-1) " 01525 01526 //J ------------------------------------ 01527 //J --- Dirac index 0, Colors 0,1,2. --- 01528 //J ------------------------------------ 01529 //J dagger takes P_R instead of P_L 01530 o00_re += i00_re+i20_re; //ok 01531 o00_im += i00_im+i20_im; //ok 01532 o01_re += i01_re+i21_re; //ok 01533 o01_im += i01_im+i21_im; //ok 01534 o02_re += i02_re+i22_re; //ok 01535 o02_im += i02_im+i22_im; //ok 01536 01537 //J ------------------------------------- 01538 //J --- Dirac index 1, Colors 0,1,2. --- 01539 //J ------------------------------------- 01540 o10_re += i10_re+i30_re; //ok 01541 o10_im += i10_im+i30_im; //ok 01542 o11_re += i11_re+i31_re; //ok 01543 o11_im += i11_im+i31_im; //ok 01544 o12_re += i12_re+i32_re; //ok 01545 o12_im += i12_im+i32_im; //ok 01546 01547 //J ------------------------------------ 01548 //J --- Dirac index 2, Colors 0,1,2. --- 01549 //J ------------------------------------ 01550 o20_re += i00_re+i20_re; //ok 01551 o20_im += i00_im+i20_im; //ok 01552 o21_re += i01_re+i21_re; //ok 01553 o21_im += i01_im+i21_im; //ok 01554 o22_re += i02_re+i22_re; //ok 01555 o22_im += i02_im+i22_im; //ok 01556 01557 //J ------------------------------------- 01558 //J --- Dirac index 3, Colors 0,1,2. --- 01559 //J ------------------------------------- 01560 // color 0 (second index) 01561 o30_re += i10_re+i30_re; //ok 01562 o30_im += i10_im+i30_im; //ok 01563 // color 1 (second index) 01564 o31_re += i11_re+i31_re; //ok 01565 o31_im += i11_im+i31_im; //ok 01566 // color 2 (second index) 01567 o32_re += i12_re+i32_re; //ok 01568 o32_im += i12_im+i32_im; //ok 01569 01570 } // End (x,0) < (x,s) <= (x,Ls-1). 01571 else { 01572 //J LH boundary s=0, backwards hop to Ls-1. 01573 //J Term to add: -mferm*P_R*psi(x,Ls-1) 01574 //J With any luck, sp_idx is linear equiv. to "(x,Ls-1)" 01575 //J Above, we set: 01576 //J sp_idx= (X+(Ls-1)*X4*X3*X2*X1)/2 (*). 01577 //J efs: do some case examples where xs=0 comes out of 01578 //J dslash_ante_core.h procedure, and check that sp_idx is 01579 //J really coming out correct (and in permissable range) 01580 //J in the operation (*). 01581 //J We need mferm to get passed. A modification 01582 //J was made to DD_PARAM2 in the C preprocessing file 01583 //J dslash_dwf_def.h, adding 01584 //J an extra argument to the kernel declarations. 01585 // 01586 //J --- Dirac index 0, Colors 0,1,2. --- 01587 // color 0 (second index) 01588 o00_re += -mferm*(i00_re+i20_re); //ok 01589 o00_im += -mferm*(i00_im+i20_im); //ok 01590 // color 1 01591 o01_re += -mferm*(i01_re+i21_re); //ok 01592 o01_im += -mferm*(i01_im+i21_im); //ok 01593 // color 2 01594 o02_re += -mferm*(i02_re+i22_re); //ok 01595 o02_im += -mferm*(i02_im+i22_im); //ok 01596 01597 //J --- Dirac index 1, Colors 0,1,2. --- 01598 // color 0 01599 o10_re += -mferm*(i10_re+i30_re); //ok 01600 o10_im += -mferm*(i10_im+i30_im); //ok 01601 // color 1 01602 o11_re += -mferm*(i11_re+i31_re); //ok 01603 o11_im += -mferm*(i11_im+i31_im); //ok 01604 // color 2 01605 o12_re += -mferm*(i12_re+i32_re); //ok 01606 o12_im += -mferm*(i12_im+i32_im); //ok 01607 01608 //J --- Dirac index 2, Colors 0,1,2. --- 01609 // color 0 (second index) 01610 o20_re += -mferm*(i00_re+i20_re); //ok 01611 o20_im += -mferm*(i00_im+i20_im); //ok 01612 // color 1 01613 o21_re += -mferm*(i01_re+i21_re); //ok 01614 o21_im += -mferm*(i01_im+i21_im); //ok 01615 // color 2 01616 o22_re += -mferm*(i02_re+i22_re); //ok 01617 o22_im += -mferm*(i02_im+i22_im); //ok 01618 01619 //J --- Dirac index 3, Colors 0,1,2. --- 01620 // color 0 01621 o30_re += -mferm*(i10_re+i30_re); //ok 01622 o30_im += -mferm*(i10_im+i30_im); //ok 01623 // color 1 01624 o31_re += -mferm*(i11_re+i31_re); //ok 01625 o31_im += -mferm*(i11_im+i31_im); //ok 01626 // color 2 01627 o32_re += -mferm*(i12_re+i32_re); //ok 01628 o32_im += -mferm*(i12_im+i32_im); //ok 01629 01630 } // End (x,s)=(x,0) 01631 } 01632 // --- End of right-handed spinor projection. --- 01633 01634 // In the GPU Dirac matrix basis: 01635 // 2 P_- = 2 P_L = 1 -1 01636 // -1 1 01637 //J Begin scope for 2 P_L projection of forward-hopped spinor. 01638 { 01639 //J For P_L spinor, dslash_dagger, we hop forwards. 01640 01641 //J This bit mimics what is done for x4==X4-1 in dslash_core_ante.h. 01642 //J 01643 //J Checked logic w/ case examples. 01644 //J Cf. hand-written notes 8/6/09 for check of logic. 01645 int sp_idx = ((xs==(Ls-1)) ? X-(Ls-1)*2*Vh : X+2*Vh) / 2; 01646 01647 //J Read spinor from device memory. 01648 // 01649 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01650 01651 // 01652 // 01653 if ( xs < (Ls-1) ) { 01654 //J Case of not at RH boundary. Then we just do += P_L psi(s+1). 01655 01656 //J ------------------------------------ 01657 //J --- Dirac index 0, Colors 0,1,2. --- 01658 //J ------------------------------------ 01659 // color 0 (second index) 01660 o00_re += i00_re-i20_re; //ok 01661 o00_im += i00_im-i20_im; //ok 01662 // color 1 (second index) 01663 o01_re += i01_re-i21_re; //ok 01664 o01_im += i01_im-i21_im; //ok 01665 // color 2 (second index) 01666 o02_re += i02_re-i22_re; //ok 01667 o02_im += i02_im-i22_im; //ok 01668 01669 //J ------------------------------------- 01670 //J --- Dirac index 1, Colors 0,1,2. --- 01671 //J ------------------------------------- 01672 // color 0 (second index) 01673 o10_re += i10_re-i30_re; //ok 01674 o10_im += i10_im-i30_im; //ok 01675 // color 1 (second index) 01676 o11_re += i11_re-i31_re; //ok 01677 o11_im += i11_im-i31_im; //ok 01678 // color 2 (second index) 01679 o12_re += i12_re-i32_re; //ok 01680 o12_im += i12_im-i32_im; //ok 01681 01682 //J ------------------------------------ 01683 //J --- Dirac index 2, Colors 0,1,2. --- 01684 //J ------------------------------------ 01685 // color 0 (second index) 01686 o20_re += -i00_re+i20_re; //ok 01687 o20_im += -i00_im+i20_im; //ok 01688 // color 1 (second index) 01689 o21_re += -i01_re+i21_re; //ok 01690 o21_im += -i01_im+i21_im; //ok 01691 // color 2 (second index) 01692 o22_re += -i02_re+i22_re; //ok 01693 o22_im += -i02_im+i22_im; //ok 01694 01695 //J ------------------------------------- 01696 //J --- Dirac index 3, Colors 0,1,2. --- 01697 //J ------------------------------------- 01698 // color 0 (second index) 01699 o30_re += -i10_re+i30_re; //ok 01700 o30_im += -i10_im+i30_im; //ok 01701 // color 1 (second index) 01702 o31_re += -i11_re+i31_re; //ok 01703 o31_im += -i11_im+i31_im; //ok 01704 // color 2 (second index) 01705 o32_re += -i12_re+i32_re; //ok 01706 o32_im += -i12_im+i32_im; //ok 01707 01708 } // End (x,0) <= (x,s) < (x,Ls-1). 01709 else { 01710 //J RH boundary s=Ls-1, forwards hop to s=0. 01711 //J Term to add: -mferm*P_L*psi(x,0) 01712 01713 //J --- Dirac index 0, Colors 0,1,2. --- 01714 // color 0 (second index) 01715 o00_re += -mferm*(i00_re-i20_re); //ok 01716 o00_im += -mferm*(i00_im-i20_im); //ok 01717 // color 1 01718 o01_re += -mferm*(i01_re-i21_re); //ok 01719 o01_im += -mferm*(i01_im-i21_im); //ok 01720 // color 2 01721 o02_re += -mferm*(i02_re-i22_re); //ok 01722 o02_im += -mferm*(i02_im-i22_im); //ok 01723 01724 //J --- Dirac index 1, Colors 0,1,2. --- 01725 // color 0 01726 o10_re += -mferm*(i10_re-i30_re); //ok 01727 o10_im += -mferm*(i10_im-i30_im); //ok 01728 // color 1 01729 o11_re += -mferm*(i11_re-i31_re); //ok 01730 o11_im += -mferm*(i11_im-i31_im); //ok 01731 // color 2 01732 o12_re += -mferm*(i12_re-i32_re); //ok 01733 o12_im += -mferm*(i12_im-i32_im); //ok 01734 01735 //J --- Dirac index 2, Colors 0,1,2. --- 01736 // color 0 (second index) 01737 o20_re += -mferm*(-i00_re+i20_re); //ok 01738 o20_im += -mferm*(-i00_im+i20_im); //ok 01739 // color 1 01740 o21_re += -mferm*(-i01_re+i21_re); //ok 01741 o21_im += -mferm*(-i01_im+i21_im); //ok 01742 // color 2 01743 o22_re += -mferm*(-i02_re+i22_re); //ok 01744 o22_im += -mferm*(-i02_im+i22_im); //ok 01745 01746 //J --- Dirac index 3, Colors 0,1,2. --- 01747 // color 0 01748 o30_re += -mferm*(-i10_re+i30_re); //ok 01749 o30_im += -mferm*(-i10_im+i30_im); //ok 01750 // color 1 01751 o31_re += -mferm*(-i11_re+i31_re); //ok 01752 o31_im += -mferm*(-i11_im+i31_im); //ok 01753 // color 2 01754 o32_re += -mferm*(-i12_re+i32_re); //ok 01755 o32_im += -mferm*(-i12_im+i32_im); //ok 01756 // 01757 } // End (x,s)=(x,Ls-1) 01758 } 01759 // ----- end dwf s-direction ---- 01760 01761 } // end s-direction block 01762 01763 01764 // Perform the DSLASH_XPAY operations. 01765 // Undefine all the macros. TODO Make sure that this 01766 // is working right for the diagonal terms of DWF. 01767 //#include "dslash_dagger_core_post.h" 01768 01769 01770 #ifdef DSLASH_XPAY 01771 READ_ACCUM(ACCUMTEX, sp_stride) 01772 #ifdef SPINOR_DOUBLE 01773 o00_re = a*o00_re + accum0.x; 01774 o00_im = a*o00_im + accum0.y; 01775 o01_re = a*o01_re + accum1.x; 01776 o01_im = a*o01_im + accum1.y; 01777 o02_re = a*o02_re + accum2.x; 01778 o02_im = a*o02_im + accum2.y; 01779 o10_re = a*o10_re + accum3.x; 01780 o10_im = a*o10_im + accum3.y; 01781 o11_re = a*o11_re + accum4.x; 01782 o11_im = a*o11_im + accum4.y; 01783 o12_re = a*o12_re + accum5.x; 01784 o12_im = a*o12_im + accum5.y; 01785 o20_re = a*o20_re + accum6.x; 01786 o20_im = a*o20_im + accum6.y; 01787 o21_re = a*o21_re + accum7.x; 01788 o21_im = a*o21_im + accum7.y; 01789 o22_re = a*o22_re + accum8.x; 01790 o22_im = a*o22_im + accum8.y; 01791 o30_re = a*o30_re + accum9.x; 01792 o30_im = a*o30_im + accum9.y; 01793 o31_re = a*o31_re + accum10.x; 01794 o31_im = a*o31_im + accum10.y; 01795 o32_re = a*o32_re + accum11.x; 01796 o32_im = a*o32_im + accum11.y; 01797 #else 01798 o00_re = a*o00_re + accum0.x; 01799 o00_im = a*o00_im + accum0.y; 01800 o01_re = a*o01_re + accum0.z; 01801 o01_im = a*o01_im + accum0.w; 01802 o02_re = a*o02_re + accum1.x; 01803 o02_im = a*o02_im + accum1.y; 01804 o10_re = a*o10_re + accum1.z; 01805 o10_im = a*o10_im + accum1.w; 01806 o11_re = a*o11_re + accum2.x; 01807 o11_im = a*o11_im + accum2.y; 01808 o12_re = a*o12_re + accum2.z; 01809 o12_im = a*o12_im + accum2.w; 01810 o20_re = a*o20_re + accum3.x; 01811 o20_im = a*o20_im + accum3.y; 01812 o21_re = a*o21_re + accum3.z; 01813 o21_im = a*o21_im + accum3.w; 01814 o22_re = a*o22_re + accum4.x; 01815 o22_im = a*o22_im + accum4.y; 01816 o30_re = a*o30_re + accum4.z; 01817 o30_im = a*o30_im + accum4.w; 01818 o31_re = a*o31_re + accum5.x; 01819 o31_im = a*o31_im + accum5.y; 01820 o32_re = a*o32_re + accum5.z; 01821 o32_im = a*o32_im + accum5.w; 01822 #endif // DD_SPREC 01823 #endif // DSLASH_XPAY 01824 01825 01826 // write spinor field back to device memory 01827 WRITE_SPINOR(sp_stride); 01828 01829 // undefine to prevent warning when precision is changed 01830 #undef spinorFloat 01831 #undef A_re 01832 #undef A_im 01833 01834 #undef g00_re 01835 #undef g00_im 01836 #undef g01_re 01837 #undef g01_im 01838 #undef g02_re 01839 #undef g02_im 01840 #undef g10_re 01841 #undef g10_im 01842 #undef g11_re 01843 #undef g11_im 01844 #undef g12_re 01845 #undef g12_im 01846 #undef g20_re 01847 #undef g20_im 01848 #undef g21_re 01849 #undef g21_im 01850 #undef g22_re 01851 #undef g22_im 01852 01853 #undef i00_re 01854 #undef i00_im 01855 #undef i01_re 01856 #undef i01_im 01857 #undef i02_re 01858 #undef i02_im 01859 #undef i10_re 01860 #undef i10_im 01861 #undef i11_re 01862 #undef i11_im 01863 #undef i12_re 01864 #undef i12_im 01865 #undef i20_re 01866 #undef i20_im 01867 #undef i21_re 01868 #undef i21_im 01869 #undef i22_re 01870 #undef i22_im 01871 #undef i30_re 01872 #undef i30_im 01873 #undef i31_re 01874 #undef i31_im 01875 #undef i32_re 01876 #undef i32_im 01877