QUDA v0.4.0
A library for QCD on GPUs
|
00001 //J dslash_dwf_core.h 00002 //J Ver. 09.08.a 00003 // 00004 // 5D PreConditioning (5DPC) code. I.e., the hopping in s direction is 00005 // included in "dslash". 4DPC would require the inverse of the s direction 00006 // hopping to be coded into XpayD, in place of kappa. 00007 // 00008 // 00009 00010 //J Carry out the 4d operations with this include. 00011 // It does not undefine things. That comes 00012 // at the end of this file, through another include. 00013 //#include "dslash_core_ante.h" 00014 00015 //#define SHARED_FLOATS_PER_THREAD 0 // FIXME 00016 #define SHARED_BYTES_DOUBLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(double)) 00017 00018 #define SHARED_BYTES_SINGLE (BLOCK_DIM*SHARED_FLOATS_PER_THREAD*sizeof(float)) 00019 00020 //J I0,...,I11 correspond to the input spinor. 00021 //J The texture fetch operations that get it from device memory are 00022 //J located in io_spinor.h. 00023 // 00024 #ifdef SPINOR_DOUBLE 00025 #define spinorFloat double 00026 //J --- Dirac index 0, Colors 0,1,2. --- 00027 #define i00_re I0.x 00028 #define i00_im I0.y 00029 #define i01_re I1.x 00030 #define i01_im I1.y 00031 #define i02_re I2.x 00032 #define i02_im I2.y 00033 //J --- Dirac index 1, Colors 0,1,2. --- 00034 #define i10_re I3.x 00035 #define i10_im I3.y 00036 #define i11_re I4.x 00037 #define i11_im I4.y 00038 #define i12_re I5.x 00039 #define i12_im I5.y 00040 //J --- Dirac index 2, Colors 0,1,2. --- 00041 #define i20_re I6.x 00042 #define i20_im I6.y 00043 #define i21_re I7.x 00044 #define i21_im I7.y 00045 #define i22_re I8.x 00046 #define i22_im I8.y 00047 //J --- Dirac index 3, Colors 0,1,2. --- 00048 #define i30_re I9.x 00049 #define i30_im I9.y 00050 #define i31_re I10.x 00051 #define i31_im I10.y 00052 #define i32_re I11.x 00053 #define i32_im I11.y 00054 00055 #else 00056 #define spinorFloat float 00057 #define i00_re I0.x 00058 #define i00_im I0.y 00059 #define i01_re I0.z 00060 #define i01_im I0.w 00061 #define i02_re I1.x 00062 #define i02_im I1.y 00063 #define i10_re I1.z 00064 #define i10_im I1.w 00065 #define i11_re I2.x 00066 #define i11_im I2.y 00067 #define i12_re I2.z 00068 #define i12_im I2.w 00069 #define i20_re I3.x 00070 #define i20_im I3.y 00071 #define i21_re I3.z 00072 #define i21_im I3.w 00073 #define i22_re I4.x 00074 #define i22_im I4.y 00075 #define i30_re I4.z 00076 #define i30_im I4.w 00077 #define i31_re I5.x 00078 #define i31_im I5.y 00079 #define i32_re I5.z 00080 #define i32_im I5.w 00081 #endif 00082 00083 // gauge link 00084 #ifdef GAUGE_FLOAT2 00085 #define g00_re G0.x 00086 #define g00_im G0.y 00087 #define g01_re G1.x 00088 #define g01_im G1.y 00089 #define g02_re G2.x 00090 #define g02_im G2.y 00091 #define g10_re G3.x 00092 #define g10_im G3.y 00093 #define g11_re G4.x 00094 #define g11_im G4.y 00095 #define g12_re G5.x 00096 #define g12_im G5.y 00097 #define g20_re G6.x 00098 #define g20_im G6.y 00099 #define g21_re G7.x 00100 #define g21_im G7.y 00101 #define g22_re G8.x 00102 #define g22_im G8.y 00103 // temporaries 00104 #define A_re G9.x 00105 #define A_im G9.y 00106 00107 #else 00108 // Color row 0, color cols 0,1,2. 00109 #define g00_re G0.x 00110 #define g00_im G0.y 00111 #define g01_re G0.z 00112 #define g01_im G0.w 00113 #define g02_re G1.x 00114 #define g02_im G1.y 00115 // Color row 1, color cols 0,1,2. 00116 #define g10_re G1.z 00117 #define g10_im G1.w 00118 #define g11_re G2.x 00119 #define g11_im G2.y 00120 #define g12_re G2.z 00121 #define g12_im G2.w 00122 // Colors row 2, color cols 0,1,2. 00123 // These are reconstructed from rows 0 and 1. 00124 #define g20_re G3.x 00125 #define g20_im G3.y 00126 #define g21_re G3.z 00127 #define g21_im G3.w 00128 #define g22_re G4.x 00129 #define g22_im G4.y 00130 // 00131 // temporaries 00132 #define A_re G4.z 00133 #define A_im G4.w 00134 00135 #endif 00136 00137 // conjugated gauge link 00138 #define gT00_re (+g00_re) 00139 #define gT00_im (-g00_im) 00140 #define gT01_re (+g10_re) 00141 #define gT01_im (-g10_im) 00142 #define gT02_re (+g20_re) 00143 #define gT02_im (-g20_im) 00144 #define gT10_re (+g01_re) 00145 #define gT10_im (-g01_im) 00146 #define gT11_re (+g11_re) 00147 #define gT11_im (-g11_im) 00148 #define gT12_re (+g21_re) 00149 #define gT12_im (-g21_im) 00150 #define gT20_re (+g02_re) 00151 #define gT20_im (-g02_im) 00152 #define gT21_re (+g12_re) 00153 #define gT21_im (-g12_im) 00154 #define gT22_re (+g22_re) 00155 #define gT22_im (-g22_im) 00156 00157 00158 // output spinor 00159 //J According to what I understand from Ron, these sit in device memory 00160 //J and get loaded into registers when they are used, then immediately 00161 //J sent back to device mem. Isn't that very inefficient? Do we leave 00162 //J it to the GPU to avoid IO conflicts when this is done? 00163 volatile spinorFloat o00_re; 00164 volatile spinorFloat o00_im; 00165 volatile spinorFloat o01_re; 00166 volatile spinorFloat o01_im; 00167 volatile spinorFloat o02_re; 00168 volatile spinorFloat o02_im; 00169 volatile spinorFloat o10_re; 00170 volatile spinorFloat o10_im; 00171 volatile spinorFloat o11_re; 00172 volatile spinorFloat o11_im; 00173 volatile spinorFloat o12_re; 00174 volatile spinorFloat o12_im; 00175 volatile spinorFloat o20_re; 00176 volatile spinorFloat o20_im; 00177 volatile spinorFloat o21_re; 00178 volatile spinorFloat o21_im; 00179 volatile spinorFloat o22_re; 00180 volatile spinorFloat o22_im; 00181 volatile spinorFloat o30_re; 00182 volatile spinorFloat o30_im; 00183 volatile spinorFloat o31_re; 00184 volatile spinorFloat o31_im; 00185 volatile spinorFloat o32_re; 00186 volatile spinorFloat o32_im; 00187 00188 00189 //J Go up for these. At this point hoping that no changes 00190 //J will be needed for the gauge/clover code. But need to check. 00191 #include "read_gauge.h" 00192 //J Change to a dwf version which sets N and Vh to dwf values. 00193 #include "io_spinor.h" 00194 00195 //J --- INDEX LOGIC --- 00196 //J BLOCK_DIM is defined in dslash_quda.h. There it is 00197 //J set to 64. Why? GRID_DIM 00198 //J is also defined there. Where is that used? 00199 //J Introduce global thread index sid. Each thread is 00200 //J associated with a pair of sites. They are paired 00201 //J in the x1 direction. They are ordered red/black or 00202 //J black/red, depending on the number of boundary crossings. 00203 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00204 if (sid >= param.threads) return; 00205 //J Boundary crossings has to do with the red/black checkerboard. 00206 //J We keep track of how many of those boundaries we cross in going 00207 //J from the origin to the location associated with THIS thread. 00208 //J Add a dimension relative to original code, so we get an extra term here. 00209 //J Note that X1,..,X4 and X1h are defined in quda.h. 00210 00211 // Will work with 5d checkerboard (for spinors) b/c this is 5DPC code. 00212 // Because of this, some gymnastics are required when accessing 00213 // the gauge links. I.e., they should be thought of as living 00214 // on the xs=0 slice, when determining their parity. 00215 00216 int boundaryCrossings = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h); 00217 // Gauge fields have 4d parity. E.g., if xs is odd, we need to grab 00218 // opposite parity link if we're supposed to use U_\mu(x). 00219 int boundaryCrossings4d = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h); 00220 // We will use the difference between boundaryCrossings and boundaryCrossings4d to determine 00221 // which gaugetex is used in the operations below. 00222 00223 //J Define the linear index X to checkerboard sites on the lattice. 00224 int X = 2*sid + (boundaryCrossings + param.parity) % 2; 00225 //J Coordinates of the checkerboard (sublattice) site. 00226 //J This is for the output spinor. 00227 int xs = X/(X4*X3*X2*X1); 00228 int x4 = (X/(X3*X2*X1)) % X4; 00229 int x3 = (X/(X2*X1)) % X3; 00230 int x2 = (X/X1) % X2; 00231 int x1 = X % X1; 00232 00233 // Initialize output spinor to zero. 00234 o00_re = o00_im = 0; 00235 o01_re = o01_im = 0; 00236 o02_re = o02_im = 0; 00237 o10_re = o10_im = 0; 00238 o11_re = o11_im = 0; 00239 o12_re = o12_im = 0; 00240 o20_re = o20_im = 0; 00241 o21_re = o21_im = 0; 00242 o22_re = o22_im = 0; 00243 o30_re = o30_im = 0; 00244 o31_re = o31_im = 0; 00245 o32_re = o32_im = 0; 00246 00247 //ok 00248 // Hopping operations in x1 direction. 00249 { 00250 // Projector P0- 00251 // 1 0 0 -i 00252 // 0 1 -i 0 00253 // 0 i 1 0 00254 // i 0 0 1 00255 00256 //J Hop forward? Yes, and this is for the input spinor. 00257 int sp_idx = ((x1==X1-1) ? X-(X1-1) : X+1) / 2; 00258 //J Link not hopped. But need to project to 4d lattice. 00259 int ga_idx = (sid % Vh); 00260 00261 // read gauge matrix from device memory 00262 // GAUGE0TEX is defined in dslash_dwf_def.h. 00263 // 00264 //J *** Note: The two options of this if statement are 00265 //J not put into if {} else {} prior to all the computation 00266 //J code that follows because READ_GAUGE_MATRIX declares 00267 //J G0, G1 and they would only have scope inside the {} 00268 //J if we did that. Then they would be invisible to the 00269 //J calculation code that follows. That is why I did this 00270 //J ugly code repetition. *** 00271 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00272 // gauge field same parity. 00273 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); 00274 // read spinor from device memory 00275 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00276 00277 // reconstruct gauge matrix 00278 RECONSTRUCT_GAUGE_MATRIX(0); 00279 00280 // project spinor into half spinors 00281 //J Here we are using P0- on the input spinor. 00282 //J spinor row 0, colors 0,1,2 00283 spinorFloat a0_re = +i00_re+i30_im; 00284 spinorFloat a0_im = +i00_im-i30_re; 00285 spinorFloat a1_re = +i01_re+i31_im; 00286 spinorFloat a1_im = +i01_im-i31_re; 00287 spinorFloat a2_re = +i02_re+i32_im; 00288 spinorFloat a2_im = +i02_im-i32_re; 00289 00290 //J spinor row 1, colors 0,1,2. 00291 spinorFloat b0_re = +i10_re+i20_im; 00292 spinorFloat b0_im = +i10_im-i20_re; 00293 spinorFloat b1_re = +i11_re+i21_im; 00294 spinorFloat b1_im = +i11_im-i21_re; 00295 spinorFloat b2_re = +i12_re+i22_im; 00296 spinorFloat b2_im = +i12_im-i22_re; 00297 00298 //J multiply by links, to get color row 0 00299 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00300 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00301 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00302 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00303 00304 // multiply row 1 00305 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00306 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00307 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00308 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00309 00310 // multiply row 2 00311 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00312 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00313 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00314 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00315 00316 //J Add results to output spinor. 00317 //J color 0 00318 o00_re += A0_re; 00319 o00_im += A0_im; 00320 o10_re += B0_re; 00321 o10_im += B0_im; 00322 //J Q. Why are we assigning spinor row 2 with what 00323 //J looks like spinor row 1 results? A. B/c in P0- row 2 00324 //J differs from row 1 by a factor of i. 00325 o20_re -= B0_im; 00326 o20_im += B0_re; 00327 //J Similarly here. 00328 o30_re -= A0_im; 00329 o30_im += A0_re; 00330 00331 o01_re += A1_re; 00332 o01_im += A1_im; 00333 o11_re += B1_re; 00334 o11_im += B1_im; 00335 o21_re -= B1_im; 00336 o21_im += B1_re; 00337 o31_re -= A1_im; 00338 o31_im += A1_re; 00339 00340 o02_re += A2_re; 00341 o02_im += A2_im; 00342 o12_re += B2_re; 00343 o12_im += B2_im; 00344 o22_re -= B2_im; 00345 o22_im += B2_re; 00346 o32_re -= A2_im; 00347 o32_im += A2_re; 00348 00349 } else { 00350 // gauge field opposite parity. 00351 READ_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, ga_stride); 00352 // read spinor from device memory 00353 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00354 00355 // reconstruct gauge matrix 00356 RECONSTRUCT_GAUGE_MATRIX(0); 00357 00358 // project spinor into half spinors 00359 //J Here we are using P0- on the input spinor. 00360 //J spinor row 0, colors 0,1,2 00361 spinorFloat a0_re = +i00_re+i30_im; 00362 spinorFloat a0_im = +i00_im-i30_re; 00363 spinorFloat a1_re = +i01_re+i31_im; 00364 spinorFloat a1_im = +i01_im-i31_re; 00365 spinorFloat a2_re = +i02_re+i32_im; 00366 spinorFloat a2_im = +i02_im-i32_re; 00367 00368 //J spinor row 1, colors 0,1,2. 00369 spinorFloat b0_re = +i10_re+i20_im; 00370 spinorFloat b0_im = +i10_im-i20_re; 00371 spinorFloat b1_re = +i11_re+i21_im; 00372 spinorFloat b1_im = +i11_im-i21_re; 00373 spinorFloat b2_re = +i12_re+i22_im; 00374 spinorFloat b2_im = +i12_im-i22_re; 00375 00376 //J multiply by links, to get color row 0 00377 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00378 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00379 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00380 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00381 00382 // multiply row 1 00383 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00384 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00385 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00386 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00387 00388 // multiply row 2 00389 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00390 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00391 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00392 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00393 00394 //J Add results to output spinor. 00395 //J color 0 00396 o00_re += A0_re; 00397 o00_im += A0_im; 00398 o10_re += B0_re; 00399 o10_im += B0_im; 00400 //J Q. Why are we assigning spinor row 2 with what 00401 //J looks like spinor row 1 results? A. B/c in P0- row 2 00402 //J differs from row 1 by a factor of i. 00403 o20_re -= B0_im; 00404 o20_im += B0_re; 00405 //J Similarly here. 00406 o30_re -= A0_im; 00407 o30_im += A0_re; 00408 00409 o01_re += A1_re; 00410 o01_im += A1_im; 00411 o11_re += B1_re; 00412 o11_im += B1_im; 00413 o21_re -= B1_im; 00414 o21_im += B1_re; 00415 o31_re -= A1_im; 00416 o31_im += A1_re; 00417 00418 o02_re += A2_re; 00419 o02_im += A2_im; 00420 o12_re += B2_re; 00421 o12_im += B2_im; 00422 o22_re -= B2_im; 00423 o22_im += B2_re; 00424 o32_re -= A2_im; 00425 o32_im += A2_re; 00426 00427 } 00428 } 00429 00430 { 00431 // Projector P0+ 00432 // 1 0 0 i 00433 // 0 1 i 0 00434 // 0 -i 1 0 00435 // -i 0 0 1 00436 00437 //J (1+gamma1) Udag(x-1hat) psi(x-1hat), 00438 //J but relabeling 1-->0. 00439 //J Thus, hop backwards. sp_idx is used in io_spinor.h to 00440 //J read in the input spinor. 00441 int sp_idx = ((x1==0) ? X+(X1-1) : X-1) / 2; 00442 //J Link also hops. 00443 // ** HERE : need gymnastics because xs=0 for gauge parity. ** 00444 int ga_idx = sp_idx % Vh; 00445 00446 // read gauge matrix from device memory 00447 // NB: Here GAUGE1TEX is used, which is different from in P0- above! 00448 // I.e., parity is handled by having 2 bindings. 00449 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00450 // gauge field opposite parity. 00451 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); 00452 // read spinor from device memory 00453 //J The relevant code from io_spinor.h. 00454 //J Which one to use is determined near Line 105 00455 //J of dslash_dwf_def.h. 00456 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00457 00458 // reconstruct gauge matrix 00459 RECONSTRUCT_GAUGE_MATRIX(1); 00460 00461 // project spinor into half spinors 00462 spinorFloat a0_re = +i00_re-i30_im; 00463 spinorFloat a0_im = +i00_im+i30_re; 00464 spinorFloat a1_re = +i01_re-i31_im; 00465 spinorFloat a1_im = +i01_im+i31_re; 00466 spinorFloat a2_re = +i02_re-i32_im; 00467 spinorFloat a2_im = +i02_im+i32_re; 00468 00469 spinorFloat b0_re = +i10_re-i20_im; 00470 spinorFloat b0_im = +i10_im+i20_re; 00471 spinorFloat b1_re = +i11_re-i21_im; 00472 spinorFloat b1_im = +i11_im+i21_re; 00473 spinorFloat b2_re = +i12_re-i22_im; 00474 spinorFloat b2_im = +i12_im+i22_re; 00475 00476 // multiply row 0 00477 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00478 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00479 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00480 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00481 00482 // multiply row 1 00483 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00484 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00485 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00486 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00487 00488 // multiply row 2 00489 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00490 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00491 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00492 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00493 00494 o00_re += A0_re; 00495 o00_im += A0_im; 00496 o10_re += B0_re; 00497 o10_im += B0_im; 00498 o20_re += B0_im; 00499 o20_im -= B0_re; 00500 o30_re += A0_im; 00501 o30_im -= A0_re; 00502 00503 o01_re += A1_re; 00504 o01_im += A1_im; 00505 o11_re += B1_re; 00506 o11_im += B1_im; 00507 o21_re += B1_im; 00508 o21_im -= B1_re; 00509 o31_re += A1_im; 00510 o31_im -= A1_re; 00511 00512 o02_re += A2_re; 00513 o02_im += A2_im; 00514 o12_re += B2_re; 00515 o12_im += B2_im; 00516 o22_re += B2_im; 00517 o22_im -= B2_re; 00518 o32_re += A2_im; 00519 o32_im -= A2_re; 00520 } else { 00521 // gauge field same parity. 00522 READ_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, ga_stride); 00523 // read spinor from device memory 00524 //J The relevant code from io_spinor.h. 00525 //J Which one to use is determined near Line 105 00526 //J of dslash_dwf_def.h. 00527 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00528 00529 // reconstruct gauge matrix 00530 RECONSTRUCT_GAUGE_MATRIX(1); 00531 00532 // project spinor into half spinors 00533 spinorFloat a0_re = +i00_re-i30_im; 00534 spinorFloat a0_im = +i00_im+i30_re; 00535 spinorFloat a1_re = +i01_re-i31_im; 00536 spinorFloat a1_im = +i01_im+i31_re; 00537 spinorFloat a2_re = +i02_re-i32_im; 00538 spinorFloat a2_im = +i02_im+i32_re; 00539 00540 spinorFloat b0_re = +i10_re-i20_im; 00541 spinorFloat b0_im = +i10_im+i20_re; 00542 spinorFloat b1_re = +i11_re-i21_im; 00543 spinorFloat b1_im = +i11_im+i21_re; 00544 spinorFloat b2_re = +i12_re-i22_im; 00545 spinorFloat b2_im = +i12_im+i22_re; 00546 00547 // multiply row 0 00548 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00549 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00550 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00551 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00552 00553 // multiply row 1 00554 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00555 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00556 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00557 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00558 00559 // multiply row 2 00560 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00561 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00562 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00563 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00564 00565 o00_re += A0_re; 00566 o00_im += A0_im; 00567 o10_re += B0_re; 00568 o10_im += B0_im; 00569 o20_re += B0_im; 00570 o20_im -= B0_re; 00571 o30_re += A0_im; 00572 o30_im -= A0_re; 00573 00574 o01_re += A1_re; 00575 o01_im += A1_im; 00576 o11_re += B1_re; 00577 o11_im += B1_im; 00578 o21_re += B1_im; 00579 o21_im -= B1_re; 00580 o31_re += A1_im; 00581 o31_im -= A1_re; 00582 00583 o02_re += A2_re; 00584 o02_im += A2_im; 00585 o12_re += B2_re; 00586 o12_im += B2_im; 00587 o22_re += B2_im; 00588 o22_im -= B2_re; 00589 o32_re += A2_im; 00590 o32_im -= A2_re; 00591 } 00592 00593 00594 } 00595 00596 { 00597 // Projector P1- 00598 // 1 0 0 -1 00599 // 0 1 1 0 00600 // 0 1 1 0 00601 // -1 0 0 1 00602 00603 int sp_idx = ((x2==X2-1) ? X-(X2-1)*X1 : X+X1) / 2; 00604 int ga_idx = sid % Vh; 00605 00606 // read gauge matrix from device memory 00607 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00608 // gauge field same parity. 00609 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); 00610 // read spinor from device memory 00611 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00612 00613 // reconstruct gauge matrix 00614 RECONSTRUCT_GAUGE_MATRIX(2); 00615 00616 // project spinor into half spinors 00617 spinorFloat a0_re = +i00_re-i30_re; 00618 spinorFloat a0_im = +i00_im-i30_im; 00619 spinorFloat a1_re = +i01_re-i31_re; 00620 spinorFloat a1_im = +i01_im-i31_im; 00621 spinorFloat a2_re = +i02_re-i32_re; 00622 spinorFloat a2_im = +i02_im-i32_im; 00623 00624 spinorFloat b0_re = +i10_re+i20_re; 00625 spinorFloat b0_im = +i10_im+i20_im; 00626 spinorFloat b1_re = +i11_re+i21_re; 00627 spinorFloat b1_im = +i11_im+i21_im; 00628 spinorFloat b2_re = +i12_re+i22_re; 00629 spinorFloat b2_im = +i12_im+i22_im; 00630 00631 // multiply row 0 00632 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00633 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00634 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00635 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00636 00637 // multiply row 1 00638 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00639 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00640 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00641 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00642 00643 // multiply row 2 00644 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00645 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00646 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00647 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00648 00649 o00_re += A0_re; 00650 o00_im += A0_im; 00651 o10_re += B0_re; 00652 o10_im += B0_im; 00653 o20_re += B0_re; 00654 o20_im += B0_im; 00655 o30_re -= A0_re; 00656 o30_im -= A0_im; 00657 00658 o01_re += A1_re; 00659 o01_im += A1_im; 00660 o11_re += B1_re; 00661 o11_im += B1_im; 00662 o21_re += B1_re; 00663 o21_im += B1_im; 00664 o31_re -= A1_re; 00665 o31_im -= A1_im; 00666 00667 o02_re += A2_re; 00668 o02_im += A2_im; 00669 o12_re += B2_re; 00670 o12_im += B2_im; 00671 o22_re += B2_re; 00672 o22_im += B2_im; 00673 o32_re -= A2_re; 00674 o32_im -= A2_im; 00675 } else { 00676 // gauge field opposite parity. 00677 READ_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, ga_stride); 00678 // read spinor from device memory 00679 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00680 00681 // reconstruct gauge matrix 00682 RECONSTRUCT_GAUGE_MATRIX(2); 00683 00684 // project spinor into half spinors 00685 spinorFloat a0_re = +i00_re-i30_re; 00686 spinorFloat a0_im = +i00_im-i30_im; 00687 spinorFloat a1_re = +i01_re-i31_re; 00688 spinorFloat a1_im = +i01_im-i31_im; 00689 spinorFloat a2_re = +i02_re-i32_re; 00690 spinorFloat a2_im = +i02_im-i32_im; 00691 00692 spinorFloat b0_re = +i10_re+i20_re; 00693 spinorFloat b0_im = +i10_im+i20_im; 00694 spinorFloat b1_re = +i11_re+i21_re; 00695 spinorFloat b1_im = +i11_im+i21_im; 00696 spinorFloat b2_re = +i12_re+i22_re; 00697 spinorFloat b2_im = +i12_im+i22_im; 00698 00699 // multiply row 0 00700 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00701 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00702 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00703 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00704 00705 // multiply row 1 00706 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00707 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00708 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00709 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00710 00711 // multiply row 2 00712 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00713 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00714 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00715 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00716 00717 o00_re += A0_re; 00718 o00_im += A0_im; 00719 o10_re += B0_re; 00720 o10_im += B0_im; 00721 o20_re += B0_re; 00722 o20_im += B0_im; 00723 o30_re -= A0_re; 00724 o30_im -= A0_im; 00725 00726 o01_re += A1_re; 00727 o01_im += A1_im; 00728 o11_re += B1_re; 00729 o11_im += B1_im; 00730 o21_re += B1_re; 00731 o21_im += B1_im; 00732 o31_re -= A1_re; 00733 o31_im -= A1_im; 00734 00735 o02_re += A2_re; 00736 o02_im += A2_im; 00737 o12_re += B2_re; 00738 o12_im += B2_im; 00739 o22_re += B2_re; 00740 o22_im += B2_im; 00741 o32_re -= A2_re; 00742 o32_im -= A2_im; 00743 } 00744 //READ_GAUGE_MATRIX(GAUGE0TEX, 2); 00745 00746 00747 } 00748 00749 { 00750 // Projector P1+ 00751 // 1 0 0 1 00752 // 0 1 -1 0 00753 // 0 -1 1 0 00754 // 1 0 0 1 00755 00756 int sp_idx = ((x2==0) ? X+(X2-1)*X1 : X-X1) / 2; 00757 int ga_idx = sp_idx % Vh; 00758 00759 // read gauge matrix from device memory 00760 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00761 // gauge field opposite parity. 00762 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); 00763 // read spinor from device memory 00764 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00765 00766 // reconstruct gauge matrix 00767 RECONSTRUCT_GAUGE_MATRIX(3); 00768 00769 // project spinor into half spinors 00770 spinorFloat a0_re = +i00_re+i30_re; 00771 spinorFloat a0_im = +i00_im+i30_im; 00772 spinorFloat a1_re = +i01_re+i31_re; 00773 spinorFloat a1_im = +i01_im+i31_im; 00774 spinorFloat a2_re = +i02_re+i32_re; 00775 spinorFloat a2_im = +i02_im+i32_im; 00776 00777 spinorFloat b0_re = +i10_re-i20_re; 00778 spinorFloat b0_im = +i10_im-i20_im; 00779 spinorFloat b1_re = +i11_re-i21_re; 00780 spinorFloat b1_im = +i11_im-i21_im; 00781 spinorFloat b2_re = +i12_re-i22_re; 00782 spinorFloat b2_im = +i12_im-i22_im; 00783 00784 // multiply row 0 00785 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00786 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00787 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00788 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00789 00790 // multiply row 1 00791 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00792 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00793 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00794 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00795 00796 // multiply row 2 00797 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00798 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00799 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00800 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00801 00802 o00_re += A0_re; 00803 o00_im += A0_im; 00804 o10_re += B0_re; 00805 o10_im += B0_im; 00806 o20_re -= B0_re; 00807 o20_im -= B0_im; 00808 o30_re += A0_re; 00809 o30_im += A0_im; 00810 00811 o01_re += A1_re; 00812 o01_im += A1_im; 00813 o11_re += B1_re; 00814 o11_im += B1_im; 00815 o21_re -= B1_re; 00816 o21_im -= B1_im; 00817 o31_re += A1_re; 00818 o31_im += A1_im; 00819 00820 o02_re += A2_re; 00821 o02_im += A2_im; 00822 o12_re += B2_re; 00823 o12_im += B2_im; 00824 o22_re -= B2_re; 00825 o22_im -= B2_im; 00826 o32_re += A2_re; 00827 o32_im += A2_im; 00828 00829 } else { 00830 // gauge field same parity. 00831 READ_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, ga_stride); 00832 // read spinor from device memory 00833 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00834 00835 // reconstruct gauge matrix 00836 RECONSTRUCT_GAUGE_MATRIX(3); 00837 00838 // project spinor into half spinors 00839 spinorFloat a0_re = +i00_re+i30_re; 00840 spinorFloat a0_im = +i00_im+i30_im; 00841 spinorFloat a1_re = +i01_re+i31_re; 00842 spinorFloat a1_im = +i01_im+i31_im; 00843 spinorFloat a2_re = +i02_re+i32_re; 00844 spinorFloat a2_im = +i02_im+i32_im; 00845 00846 spinorFloat b0_re = +i10_re-i20_re; 00847 spinorFloat b0_im = +i10_im-i20_im; 00848 spinorFloat b1_re = +i11_re-i21_re; 00849 spinorFloat b1_im = +i11_im-i21_im; 00850 spinorFloat b2_re = +i12_re-i22_re; 00851 spinorFloat b2_im = +i12_im-i22_im; 00852 00853 // multiply row 0 00854 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 00855 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 00856 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 00857 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 00858 00859 // multiply row 1 00860 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 00861 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 00862 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 00863 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 00864 00865 // multiply row 2 00866 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 00867 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 00868 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 00869 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 00870 00871 o00_re += A0_re; 00872 o00_im += A0_im; 00873 o10_re += B0_re; 00874 o10_im += B0_im; 00875 o20_re -= B0_re; 00876 o20_im -= B0_im; 00877 o30_re += A0_re; 00878 o30_im += A0_im; 00879 00880 o01_re += A1_re; 00881 o01_im += A1_im; 00882 o11_re += B1_re; 00883 o11_im += B1_im; 00884 o21_re -= B1_re; 00885 o21_im -= B1_im; 00886 o31_re += A1_re; 00887 o31_im += A1_im; 00888 00889 o02_re += A2_re; 00890 o02_im += A2_im; 00891 o12_re += B2_re; 00892 o12_im += B2_im; 00893 o22_re -= B2_re; 00894 o22_im -= B2_im; 00895 o32_re += A2_re; 00896 o32_im += A2_im; 00897 00898 } 00899 // READ_GAUGE_MATRIX(GAUGE1TEX, 3); 00900 00901 } 00902 00903 { 00904 // Projector P2- 00905 // 1 0 -i 0 00906 // 0 1 0 i 00907 // i 0 1 0 00908 // 0 -i 0 1 00909 00910 int sp_idx = ((x3==X3-1) ? X-(X3-1)*X2*X1 : X+X2*X1) / 2; 00911 int ga_idx = sid % Vh; 00912 00913 // read gauge matrix from device memory 00914 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 00915 // gauge field same parity. 00916 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); 00917 // read spinor from device memory 00918 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00919 00920 // reconstruct gauge matrix 00921 RECONSTRUCT_GAUGE_MATRIX(4); 00922 00923 // project spinor into half spinors 00924 spinorFloat a0_re = +i00_re+i20_im; 00925 spinorFloat a0_im = +i00_im-i20_re; 00926 spinorFloat a1_re = +i01_re+i21_im; 00927 spinorFloat a1_im = +i01_im-i21_re; 00928 spinorFloat a2_re = +i02_re+i22_im; 00929 spinorFloat a2_im = +i02_im-i22_re; 00930 00931 spinorFloat b0_re = +i10_re-i30_im; 00932 spinorFloat b0_im = +i10_im+i30_re; 00933 spinorFloat b1_re = +i11_re-i31_im; 00934 spinorFloat b1_im = +i11_im+i31_re; 00935 spinorFloat b2_re = +i12_re-i32_im; 00936 spinorFloat b2_im = +i12_im+i32_re; 00937 00938 // multiply row 0 00939 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 00940 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 00941 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 00942 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 00943 00944 // multiply row 1 00945 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 00946 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 00947 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 00948 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 00949 00950 // multiply row 2 00951 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 00952 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 00953 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 00954 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 00955 00956 o00_re += A0_re; 00957 o00_im += A0_im; 00958 o10_re += B0_re; 00959 o10_im += B0_im; 00960 o20_re -= A0_im; 00961 o20_im += A0_re; 00962 o30_re += B0_im; 00963 o30_im -= B0_re; 00964 00965 o01_re += A1_re; 00966 o01_im += A1_im; 00967 o11_re += B1_re; 00968 o11_im += B1_im; 00969 o21_re -= A1_im; 00970 o21_im += A1_re; 00971 o31_re += B1_im; 00972 o31_im -= B1_re; 00973 00974 o02_re += A2_re; 00975 o02_im += A2_im; 00976 o12_re += B2_re; 00977 o12_im += B2_im; 00978 o22_re -= A2_im; 00979 o22_im += A2_re; 00980 o32_re += B2_im; 00981 o32_im -= B2_re; 00982 } else { 00983 // gauge field opp parity. 00984 READ_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, ga_stride); 00985 // read spinor from device memory 00986 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 00987 00988 // reconstruct gauge matrix 00989 RECONSTRUCT_GAUGE_MATRIX(4); 00990 00991 // project spinor into half spinors 00992 spinorFloat a0_re = +i00_re+i20_im; 00993 spinorFloat a0_im = +i00_im-i20_re; 00994 spinorFloat a1_re = +i01_re+i21_im; 00995 spinorFloat a1_im = +i01_im-i21_re; 00996 spinorFloat a2_re = +i02_re+i22_im; 00997 spinorFloat a2_im = +i02_im-i22_re; 00998 00999 spinorFloat b0_re = +i10_re-i30_im; 01000 spinorFloat b0_im = +i10_im+i30_re; 01001 spinorFloat b1_re = +i11_re-i31_im; 01002 spinorFloat b1_im = +i11_im+i31_re; 01003 spinorFloat b2_re = +i12_re-i32_im; 01004 spinorFloat b2_im = +i12_im+i32_re; 01005 01006 // multiply row 0 01007 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 01008 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 01009 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 01010 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 01011 01012 // multiply row 1 01013 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 01014 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 01015 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 01016 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 01017 01018 // multiply row 2 01019 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 01020 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 01021 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 01022 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 01023 01024 o00_re += A0_re; 01025 o00_im += A0_im; 01026 o10_re += B0_re; 01027 o10_im += B0_im; 01028 o20_re -= A0_im; 01029 o20_im += A0_re; 01030 o30_re += B0_im; 01031 o30_im -= B0_re; 01032 01033 o01_re += A1_re; 01034 o01_im += A1_im; 01035 o11_re += B1_re; 01036 o11_im += B1_im; 01037 o21_re -= A1_im; 01038 o21_im += A1_re; 01039 o31_re += B1_im; 01040 o31_im -= B1_re; 01041 01042 o02_re += A2_re; 01043 o02_im += A2_im; 01044 o12_re += B2_re; 01045 o12_im += B2_im; 01046 o22_re -= A2_im; 01047 o22_im += A2_re; 01048 o32_re += B2_im; 01049 o32_im -= B2_re; 01050 } 01051 // READ_GAUGE_MATRIX(GAUGE0TEX, 4); 01052 01053 01054 } 01055 01056 { 01057 // Projector P2+ 01058 // 1 0 i 0 01059 // 0 1 0 -i 01060 // -i 0 1 0 01061 // 0 i 0 1 01062 01063 int sp_idx = ((x3==0) ? X+(X3-1)*X2*X1 : X-X2*X1) / 2; 01064 int ga_idx = sp_idx % Vh; 01065 01066 // read gauge matrix from device memory 01067 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 01068 // gauge field opposite parity. 01069 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); 01070 // read spinor from device memory 01071 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01072 01073 // reconstruct gauge matrix 01074 RECONSTRUCT_GAUGE_MATRIX(5); 01075 01076 // project spinor into half spinors 01077 spinorFloat a0_re = +i00_re-i20_im; 01078 spinorFloat a0_im = +i00_im+i20_re; 01079 spinorFloat a1_re = +i01_re-i21_im; 01080 spinorFloat a1_im = +i01_im+i21_re; 01081 spinorFloat a2_re = +i02_re-i22_im; 01082 spinorFloat a2_im = +i02_im+i22_re; 01083 01084 spinorFloat b0_re = +i10_re+i30_im; 01085 spinorFloat b0_im = +i10_im-i30_re; 01086 spinorFloat b1_re = +i11_re+i31_im; 01087 spinorFloat b1_im = +i11_im-i31_re; 01088 spinorFloat b2_re = +i12_re+i32_im; 01089 spinorFloat b2_im = +i12_im-i32_re; 01090 01091 // multiply row 0 01092 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01093 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01094 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01095 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01096 01097 // multiply row 1 01098 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01099 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01100 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01101 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01102 01103 // multiply row 2 01104 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01105 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01106 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01107 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01108 01109 o00_re += A0_re; 01110 o00_im += A0_im; 01111 o10_re += B0_re; 01112 o10_im += B0_im; 01113 o20_re += A0_im; 01114 o20_im -= A0_re; 01115 o30_re -= B0_im; 01116 o30_im += B0_re; 01117 01118 o01_re += A1_re; 01119 o01_im += A1_im; 01120 o11_re += B1_re; 01121 o11_im += B1_im; 01122 o21_re += A1_im; 01123 o21_im -= A1_re; 01124 o31_re -= B1_im; 01125 o31_im += B1_re; 01126 01127 o02_re += A2_re; 01128 o02_im += A2_im; 01129 o12_re += B2_re; 01130 o12_im += B2_im; 01131 o22_re += A2_im; 01132 o22_im -= A2_re; 01133 o32_re -= B2_im; 01134 o32_im += B2_re; 01135 01136 } else { 01137 // gauge field same parity. 01138 READ_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, ga_stride); 01139 // read spinor from device memory 01140 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01141 01142 // reconstruct gauge matrix 01143 RECONSTRUCT_GAUGE_MATRIX(5); 01144 01145 // project spinor into half spinors 01146 spinorFloat a0_re = +i00_re-i20_im; 01147 spinorFloat a0_im = +i00_im+i20_re; 01148 spinorFloat a1_re = +i01_re-i21_im; 01149 spinorFloat a1_im = +i01_im+i21_re; 01150 spinorFloat a2_re = +i02_re-i22_im; 01151 spinorFloat a2_im = +i02_im+i22_re; 01152 01153 spinorFloat b0_re = +i10_re+i30_im; 01154 spinorFloat b0_im = +i10_im-i30_re; 01155 spinorFloat b1_re = +i11_re+i31_im; 01156 spinorFloat b1_im = +i11_im-i31_re; 01157 spinorFloat b2_re = +i12_re+i32_im; 01158 spinorFloat b2_im = +i12_im-i32_re; 01159 01160 // multiply row 0 01161 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01162 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01163 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01164 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01165 01166 // multiply row 1 01167 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01168 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01169 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01170 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01171 01172 // multiply row 2 01173 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01174 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01175 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01176 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01177 01178 o00_re += A0_re; 01179 o00_im += A0_im; 01180 o10_re += B0_re; 01181 o10_im += B0_im; 01182 o20_re += A0_im; 01183 o20_im -= A0_re; 01184 o30_re -= B0_im; 01185 o30_im += B0_re; 01186 01187 o01_re += A1_re; 01188 o01_im += A1_im; 01189 o11_re += B1_re; 01190 o11_im += B1_im; 01191 o21_re += A1_im; 01192 o21_im -= A1_re; 01193 o31_re -= B1_im; 01194 o31_im += B1_re; 01195 01196 o02_re += A2_re; 01197 o02_im += A2_im; 01198 o12_re += B2_re; 01199 o12_im += B2_im; 01200 o22_re += A2_im; 01201 o22_im -= A2_re; 01202 o32_re -= B2_im; 01203 o32_im += B2_re; 01204 01205 } 01206 // READ_GAUGE_MATRIX(GAUGE1TEX, 5); 01207 01208 } 01209 01210 { 01211 // Projector P3- 01212 // 0 0 0 0 01213 // 0 0 0 0 01214 // 0 0 2 0 01215 // 0 0 0 2 01216 01217 int sp_idx = ((x4==X4-1) ? X-(X4-1)*X3*X2*X1 : X+X3*X2*X1) / 2; 01218 int ga_idx = sid % Vh; 01219 01220 if (gauge_fixed && ga_idx < (X4-1)*X1h*X2*X3) { 01221 // read spinor from device memory 01222 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01223 01224 // project spinor into half spinors 01225 spinorFloat a0_re = +2*i20_re; 01226 spinorFloat a0_im = +2*i20_im; 01227 spinorFloat a1_re = +2*i21_re; 01228 spinorFloat a1_im = +2*i21_im; 01229 spinorFloat a2_re = +2*i22_re; 01230 spinorFloat a2_im = +2*i22_im; 01231 01232 spinorFloat b0_re = +2*i30_re; 01233 spinorFloat b0_im = +2*i30_im; 01234 spinorFloat b1_re = +2*i31_re; 01235 spinorFloat b1_im = +2*i31_im; 01236 spinorFloat b2_re = +2*i32_re; 01237 spinorFloat b2_im = +2*i32_im; 01238 01239 // identity gauge matrix 01240 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01241 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01242 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01243 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01244 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01245 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01246 01247 o20_re += A0_re; 01248 o20_im += A0_im; 01249 o30_re += B0_re; 01250 o30_im += B0_im; 01251 01252 o21_re += A1_re; 01253 o21_im += A1_im; 01254 o31_re += B1_re; 01255 o31_im += B1_im; 01256 01257 o22_re += A2_re; 01258 o22_im += A2_im; 01259 o32_re += B2_re; 01260 o32_im += B2_im; 01261 01262 } 01263 else { 01264 // read gauge matrix from device memory 01265 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 01266 // gauge field same parity. 01267 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); 01268 // read spinor from device memory 01269 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01270 01271 // reconstruct gauge matrix 01272 RECONSTRUCT_GAUGE_MATRIX(6); 01273 01274 // project spinor into half spinors 01275 spinorFloat a0_re = +2*i20_re; 01276 spinorFloat a0_im = +2*i20_im; 01277 spinorFloat a1_re = +2*i21_re; 01278 spinorFloat a1_im = +2*i21_im; 01279 spinorFloat a2_re = +2*i22_re; 01280 spinorFloat a2_im = +2*i22_im; 01281 01282 spinorFloat b0_re = +2*i30_re; 01283 spinorFloat b0_im = +2*i30_im; 01284 spinorFloat b1_re = +2*i31_re; 01285 spinorFloat b1_im = +2*i31_im; 01286 spinorFloat b2_re = +2*i32_re; 01287 spinorFloat b2_im = +2*i32_im; 01288 01289 // multiply row 0 01290 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 01291 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 01292 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 01293 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 01294 01295 // multiply row 1 01296 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 01297 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 01298 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 01299 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 01300 01301 // multiply row 2 01302 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 01303 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 01304 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 01305 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 01306 01307 o20_re += A0_re; 01308 o20_im += A0_im; 01309 o30_re += B0_re; 01310 o30_im += B0_im; 01311 01312 o21_re += A1_re; 01313 o21_im += A1_im; 01314 o31_re += B1_re; 01315 o31_im += B1_im; 01316 01317 o22_re += A2_re; 01318 o22_im += A2_im; 01319 o32_re += B2_re; 01320 o32_im += B2_im; 01321 01322 } else { 01323 // gauge field opp parity. 01324 READ_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, ga_stride); 01325 // read spinor from device memory 01326 READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx); 01327 01328 // reconstruct gauge matrix 01329 RECONSTRUCT_GAUGE_MATRIX(6); 01330 01331 // project spinor into half spinors 01332 spinorFloat a0_re = +2*i20_re; 01333 spinorFloat a0_im = +2*i20_im; 01334 spinorFloat a1_re = +2*i21_re; 01335 spinorFloat a1_im = +2*i21_im; 01336 spinorFloat a2_re = +2*i22_re; 01337 spinorFloat a2_im = +2*i22_im; 01338 01339 spinorFloat b0_re = +2*i30_re; 01340 spinorFloat b0_im = +2*i30_im; 01341 spinorFloat b1_re = +2*i31_re; 01342 spinorFloat b1_im = +2*i31_im; 01343 spinorFloat b2_re = +2*i32_re; 01344 spinorFloat b2_im = +2*i32_im; 01345 01346 // multiply row 0 01347 spinorFloat A0_re = + (g00_re * a0_re - g00_im * a0_im) + (g01_re * a1_re - g01_im * a1_im) + (g02_re * a2_re - g02_im * a2_im); 01348 spinorFloat A0_im = + (g00_re * a0_im + g00_im * a0_re) + (g01_re * a1_im + g01_im * a1_re) + (g02_re * a2_im + g02_im * a2_re); 01349 spinorFloat B0_re = + (g00_re * b0_re - g00_im * b0_im) + (g01_re * b1_re - g01_im * b1_im) + (g02_re * b2_re - g02_im * b2_im); 01350 spinorFloat B0_im = + (g00_re * b0_im + g00_im * b0_re) + (g01_re * b1_im + g01_im * b1_re) + (g02_re * b2_im + g02_im * b2_re); 01351 01352 // multiply row 1 01353 spinorFloat A1_re = + (g10_re * a0_re - g10_im * a0_im) + (g11_re * a1_re - g11_im * a1_im) + (g12_re * a2_re - g12_im * a2_im); 01354 spinorFloat A1_im = + (g10_re * a0_im + g10_im * a0_re) + (g11_re * a1_im + g11_im * a1_re) + (g12_re * a2_im + g12_im * a2_re); 01355 spinorFloat B1_re = + (g10_re * b0_re - g10_im * b0_im) + (g11_re * b1_re - g11_im * b1_im) + (g12_re * b2_re - g12_im * b2_im); 01356 spinorFloat B1_im = + (g10_re * b0_im + g10_im * b0_re) + (g11_re * b1_im + g11_im * b1_re) + (g12_re * b2_im + g12_im * b2_re); 01357 01358 // multiply row 2 01359 spinorFloat A2_re = + (g20_re * a0_re - g20_im * a0_im) + (g21_re * a1_re - g21_im * a1_im) + (g22_re * a2_re - g22_im * a2_im); 01360 spinorFloat A2_im = + (g20_re * a0_im + g20_im * a0_re) + (g21_re * a1_im + g21_im * a1_re) + (g22_re * a2_im + g22_im * a2_re); 01361 spinorFloat B2_re = + (g20_re * b0_re - g20_im * b0_im) + (g21_re * b1_re - g21_im * b1_im) + (g22_re * b2_re - g22_im * b2_im); 01362 spinorFloat B2_im = + (g20_re * b0_im + g20_im * b0_re) + (g21_re * b1_im + g21_im * b1_re) + (g22_re * b2_im + g22_im * b2_re); 01363 01364 o20_re += A0_re; 01365 o20_im += A0_im; 01366 o30_re += B0_re; 01367 o30_im += B0_im; 01368 01369 o21_re += A1_re; 01370 o21_im += A1_im; 01371 o31_re += B1_re; 01372 o31_im += B1_im; 01373 01374 o22_re += A2_re; 01375 o22_im += A2_im; 01376 o32_re += B2_re; 01377 o32_im += B2_im; 01378 01379 } 01380 //READ_GAUGE_MATRIX(GAUGE0TEX, 6); 01381 01382 } 01383 } 01384 01385 { 01386 // Projector P3+ 01387 // 2 0 0 0 01388 // 0 2 0 0 01389 // 0 0 0 0 01390 // 0 0 0 0 01391 01392 int sp_idx = ((x4==0) ? X+(X4-1)*X3*X2*X1 : X-X3*X2*X1) / 2; 01393 int ga_idx = sp_idx % Vh; 01394 01395 if (gauge_fixed && ga_idx < (X4-1)*X1h*X2*X3) { 01396 // read spinor from device memory 01397 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01398 01399 // project spinor into half spinors 01400 spinorFloat a0_re = +2*i00_re; 01401 spinorFloat a0_im = +2*i00_im; 01402 spinorFloat a1_re = +2*i01_re; 01403 spinorFloat a1_im = +2*i01_im; 01404 spinorFloat a2_re = +2*i02_re; 01405 spinorFloat a2_im = +2*i02_im; 01406 01407 spinorFloat b0_re = +2*i10_re; 01408 spinorFloat b0_im = +2*i10_im; 01409 spinorFloat b1_re = +2*i11_re; 01410 spinorFloat b1_im = +2*i11_im; 01411 spinorFloat b2_re = +2*i12_re; 01412 spinorFloat b2_im = +2*i12_im; 01413 01414 // identity gauge matrix 01415 spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im; 01416 spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im; 01417 spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im; 01418 spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im; 01419 spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im; 01420 spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im; 01421 01422 o00_re += A0_re; 01423 o00_im += A0_im; 01424 o10_re += B0_re; 01425 o10_im += B0_im; 01426 01427 o01_re += A1_re; 01428 o01_im += A1_im; 01429 o11_re += B1_re; 01430 o11_im += B1_im; 01431 01432 o02_re += A2_re; 01433 o02_im += A2_im; 01434 o12_re += B2_re; 01435 o12_im += B2_im; 01436 01437 } 01438 else { 01439 // read gauge matrix from device memory 01440 if ( !( (boundaryCrossings-boundaryCrossings4d) % 2) ) { 01441 // gauge field opposite parity. 01442 READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); 01443 // read spinor from device memory 01444 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01445 01446 // reconstruct gauge matrix 01447 RECONSTRUCT_GAUGE_MATRIX(7); 01448 01449 // project spinor into half spinors 01450 spinorFloat a0_re = +2*i00_re; 01451 spinorFloat a0_im = +2*i00_im; 01452 spinorFloat a1_re = +2*i01_re; 01453 spinorFloat a1_im = +2*i01_im; 01454 spinorFloat a2_re = +2*i02_re; 01455 spinorFloat a2_im = +2*i02_im; 01456 01457 spinorFloat b0_re = +2*i10_re; 01458 spinorFloat b0_im = +2*i10_im; 01459 spinorFloat b1_re = +2*i11_re; 01460 spinorFloat b1_im = +2*i11_im; 01461 spinorFloat b2_re = +2*i12_re; 01462 spinorFloat b2_im = +2*i12_im; 01463 01464 // multiply row 0 01465 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01466 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01467 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01468 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01469 01470 // multiply row 1 01471 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01472 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01473 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01474 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01475 01476 // multiply row 2 01477 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01478 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01479 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01480 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01481 01482 o00_re += A0_re; 01483 o00_im += A0_im; 01484 o10_re += B0_re; 01485 o10_im += B0_im; 01486 01487 o01_re += A1_re; 01488 o01_im += A1_im; 01489 o11_re += B1_re; 01490 o11_im += B1_im; 01491 01492 o02_re += A2_re; 01493 o02_im += A2_im; 01494 o12_re += B2_re; 01495 o12_im += B2_im; 01496 } else { 01497 // gauge field same parity. 01498 READ_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, ga_stride); 01499 // read spinor from device memory 01500 READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx); 01501 01502 // reconstruct gauge matrix 01503 RECONSTRUCT_GAUGE_MATRIX(7); 01504 01505 // project spinor into half spinors 01506 spinorFloat a0_re = +2*i00_re; 01507 spinorFloat a0_im = +2*i00_im; 01508 spinorFloat a1_re = +2*i01_re; 01509 spinorFloat a1_im = +2*i01_im; 01510 spinorFloat a2_re = +2*i02_re; 01511 spinorFloat a2_im = +2*i02_im; 01512 01513 spinorFloat b0_re = +2*i10_re; 01514 spinorFloat b0_im = +2*i10_im; 01515 spinorFloat b1_re = +2*i11_re; 01516 spinorFloat b1_im = +2*i11_im; 01517 spinorFloat b2_re = +2*i12_re; 01518 spinorFloat b2_im = +2*i12_im; 01519 01520 // multiply row 0 01521 spinorFloat A0_re = + (gT00_re * a0_re - gT00_im * a0_im) + (gT01_re * a1_re - gT01_im * a1_im) + (gT02_re * a2_re - gT02_im * a2_im); 01522 spinorFloat A0_im = + (gT00_re * a0_im + gT00_im * a0_re) + (gT01_re * a1_im + gT01_im * a1_re) + (gT02_re * a2_im + gT02_im * a2_re); 01523 spinorFloat B0_re = + (gT00_re * b0_re - gT00_im * b0_im) + (gT01_re * b1_re - gT01_im * b1_im) + (gT02_re * b2_re - gT02_im * b2_im); 01524 spinorFloat B0_im = + (gT00_re * b0_im + gT00_im * b0_re) + (gT01_re * b1_im + gT01_im * b1_re) + (gT02_re * b2_im + gT02_im * b2_re); 01525 01526 // multiply row 1 01527 spinorFloat A1_re = + (gT10_re * a0_re - gT10_im * a0_im) + (gT11_re * a1_re - gT11_im * a1_im) + (gT12_re * a2_re - gT12_im * a2_im); 01528 spinorFloat A1_im = + (gT10_re * a0_im + gT10_im * a0_re) + (gT11_re * a1_im + gT11_im * a1_re) + (gT12_re * a2_im + gT12_im * a2_re); 01529 spinorFloat B1_re = + (gT10_re * b0_re - gT10_im * b0_im) + (gT11_re * b1_re - gT11_im * b1_im) + (gT12_re * b2_re - gT12_im * b2_im); 01530 spinorFloat B1_im = + (gT10_re * b0_im + gT10_im * b0_re) + (gT11_re * b1_im + gT11_im * b1_re) + (gT12_re * b2_im + gT12_im * b2_re); 01531 01532 // multiply row 2 01533 spinorFloat A2_re = + (gT20_re * a0_re - gT20_im * a0_im) + (gT21_re * a1_re - gT21_im * a1_im) + (gT22_re * a2_re - gT22_im * a2_im); 01534 spinorFloat A2_im = + (gT20_re * a0_im + gT20_im * a0_re) + (gT21_re * a1_im + gT21_im * a1_re) + (gT22_re * a2_im + gT22_im * a2_re); 01535 spinorFloat B2_re = + (gT20_re * b0_re - gT20_im * b0_im) + (gT21_re * b1_re - gT21_im * b1_im) + (gT22_re * b2_re - gT22_im * b2_im); 01536 spinorFloat B2_im = + (gT20_re * b0_im + gT20_im * b0_re) + (gT21_re * b1_im + gT21_im * b1_re) + (gT22_re * b2_im + gT22_im * b2_re); 01537 01538 o00_re += A0_re; 01539 o00_im += A0_im; 01540 o10_re += B0_re; 01541 o10_im += B0_im; 01542 01543 o01_re += A1_re; 01544 o01_im += A1_im; 01545 o11_re += B1_re; 01546 o11_im += B1_im; 01547 01548 o02_re += A2_re; 01549 o02_im += A2_im; 01550 o12_re += B2_re; 01551 o12_im += B2_im; 01552 } 01553 // READ_GAUGE_MATRIX(GAUGE1TEX, 7); 01554 01555 01556 } 01557 } 01558 01559 01560 01561 01562 //J ---------------------------------- 01563 //J --- DWF code for 5th dimension --- 01564 //J ---------------------------------- 01565 // 01566 //J Begin scope. 01567 { 01568 //J TODO Insert/check handler for s-direction here. 01569 01570 //J Decided to not change to chiral basis. Then: 01571 // 2 P_- = 2 P_L = 1 -1 01572 // -1 1 01573 //J Begin scope for 2 P_L projection of back-hopped spinor. 01574 { 01575 //J We are left-handed, so hop backwards. If we are at 01576 //J boundary in s-direction, special 01577 //J things will need to be done. xs is defined in dslash_core_ante.h. 01578 //J See near Line 328. N is the 4d volume; cf. quda.h. 01579 //J Cf. hand-written notes 8/6/09 for check of logic. 01580 //J The logic sets xs to the s-coordinate of the output 01581 //J spinor, which is accumulated by this thread. 01582 //J I.e., it uses the thread index to determine xs. 01583 int sp_idx = ((xs==0) ? X+(Ls-1)*2*Vh : X-2*Vh) / 2; 01584 // --- Read spinor from device memory. --- 01585 // 01586 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01587 01588 if (xs != 0) { 01589 //J OK, now the input spinor should be at: 01590 //J 0 < s <= Ls-1 01591 // 01592 //J Project spinor into half spinors, i.e., this is the term 01593 //J " + P_L psi(s-1) " 01594 01595 //J ------------------------------------ 01596 //J --- Dirac index 0, Colors 0,1,2. --- 01597 //J ------------------------------------ 01598 //ok 01599 o00_re += i00_re-i20_re; 01600 o00_im += i00_im-i20_im; 01601 o01_re += i01_re-i21_re; 01602 o01_im += i01_im-i21_im; 01603 o02_re += i02_re-i22_re; 01604 o02_im += i02_im-i22_im; 01605 01606 //J ------------------------------------- 01607 //J --- Dirac index 1, Colors 0,1,2. --- 01608 //J ------------------------------------- 01609 //ok 01610 o10_re += i10_re-i30_re; 01611 o10_im += i10_im-i30_im; 01612 o11_re += i11_re-i31_re; 01613 o11_im += i11_im-i31_im; 01614 o12_re += i12_re-i32_re; 01615 o12_im += i12_im-i32_im; 01616 01617 //J ------------------------------------ 01618 //J --- Dirac index 2, Colors 0,1,2. --- 01619 //J ------------------------------------ 01620 //ok 01621 o20_re += -i00_re+i20_re; 01622 o20_im += -i00_im+i20_im; 01623 o21_re += -i01_re+i21_re; 01624 o21_im += -i01_im+i21_im; 01625 o22_re += -i02_re+i22_re; 01626 o22_im += -i02_im+i22_im; 01627 01628 //J ------------------------------------- 01629 //J --- Dirac index 3, Colors 0,1,2. --- 01630 //J ------------------------------------- 01631 //ok 01632 o30_re += -i10_re+i30_re; 01633 o30_im += -i10_im+i30_im; 01634 o31_re += -i11_re+i31_re; 01635 o31_im += -i11_im+i31_im; 01636 o32_re += -i12_re+i32_re; 01637 o32_im += -i12_im+i32_im; 01638 01639 } // End (x,0) < (x,s) <= (x,Ls-1). 01640 else { 01641 //J LH boundary s=0, backwards hop to Ls-1. 01642 //J Term to add: -mferm*P_L*psi(x,Ls-1) 01643 //J With any luck, sp_idx is linear equiv. to "(x,Ls-1)" 01644 //J Above, we set: 01645 //J sp_idx= (X+(Ls-1)*X4*X3*X2*X1)/2 (*). 01646 //J efs: do some case examples where xs=0 comes out of 01647 //J dslash_ante_core.h procedure, and check that sp_idx is 01648 //J really coming out correct (and in permissable range) 01649 //J in the operation (*). 01650 //J We need mferm to get passed. A modification 01651 //J was made to DD_PARAM2 in the C preprocessing file 01652 //J dslash_dwf_def.h, adding 01653 //J an extra argument to the kernel declarations. 01654 // 01655 //J --- Dirac index 0, Colors 0,1,2. --- 01656 // color 0 (second index) 01657 o00_re += -mferm*(i00_re-i20_re); 01658 o00_im += -mferm*(i00_im-i20_im); 01659 // color 1 01660 o01_re += -mferm*(i01_re-i21_re); 01661 o01_im += -mferm*(i01_im-i21_im); 01662 // color 2 01663 o02_re += -mferm*(i02_re-i22_re); 01664 o02_im += -mferm*(i02_im-i22_im); 01665 01666 //J --- Dirac index 1, Colors 0,1,2. --- 01667 // color 0 01668 o10_re += -mferm*(i10_re-i30_re); 01669 o10_im += -mferm*(i10_im-i30_im); 01670 // color 1 01671 o11_re += -mferm*(i11_re-i31_re); 01672 o11_im += -mferm*(i11_im-i31_im); 01673 // color 2 01674 o12_re += -mferm*(i12_re-i32_re); 01675 o12_im += -mferm*(i12_im-i32_im); 01676 01677 //J --- Dirac index 2, Colors 0,1,2. --- 01678 // color 0 (second index) 01679 o20_re += -mferm*(-i00_re+i20_re); 01680 o20_im += -mferm*(-i00_im+i20_im); 01681 // color 1 01682 o21_re += -mferm*(-i01_re+i21_re); 01683 o21_im += -mferm*(-i01_im+i21_im); 01684 // color 2 01685 o22_re += -mferm*(-i02_re+i22_re); 01686 o22_im += -mferm*(-i02_im+i22_im); 01687 01688 //J --- Dirac index 3, Colors 0,1,2. --- 01689 // color 0 01690 o30_re += -mferm*(-i10_re+i30_re); 01691 o30_im += -mferm*(-i10_im+i30_im); 01692 // color 1 01693 o31_re += -mferm*(-i11_re+i31_re); 01694 o31_im += -mferm*(-i11_im+i31_im); 01695 // color 2 01696 o32_re += -mferm*(-i12_re+i32_re); 01697 o32_im += -mferm*(-i12_im+i32_im); 01698 01699 } // End (x,s)=(x,0) 01700 } 01701 // --- End of left-handed spinor projection. --- 01702 01703 // 2 P_+ = 2 P_R = 1 1 01704 // 1 1 01705 // --- Begin right-handed spinor projection. --- 01706 { 01707 //J For P_R spinor, we hop forwards. Thus: 01708 01709 //J This bit mimics what is done for x4==X4-1 in dslash_core_ante.h. 01710 //J 01711 //J TODO Check logic w/ case examples. 01712 //J Cf. hand-written notes 8/6/09 for check of logic. 01713 int sp_idx = ((xs==(Ls-1)) ? X-(Ls-1)*2*Vh : X+2*Vh) / 2; 01714 01715 //J Read spinor from device memory. 01716 // 01717 READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx); 01718 01719 // HERE 01720 // 01721 if ( xs < (Ls-1) ) { 01722 //J Case of not at RH boundary. Then we just do += P_R psi(s+1). 01723 01724 //J ------------------------------------ 01725 //J --- Dirac index 0, Colors 0,1,2. --- 01726 //J ------------------------------------ 01727 o00_re += i00_re+i20_re; 01728 o00_im += i00_im+i20_im; 01729 o01_re += i01_re+i21_re; 01730 o01_im += i01_im+i21_im; 01731 o02_re += i02_re+i22_re; 01732 o02_im += i02_im+i22_im; 01733 01734 //J ------------------------------------- 01735 //J --- Dirac index 1, Colors 0,1,2. --- 01736 //J ------------------------------------- 01737 o10_re += i10_re+i30_re; 01738 o10_im += i10_im+i30_im; 01739 o11_re += i11_re+i31_re; 01740 o11_im += i11_im+i31_im; 01741 o12_re += i12_re+i32_re; 01742 o12_im += i12_im+i32_im; 01743 01744 //J ------------------------------------ 01745 //J --- Dirac index 2, Colors 0,1,2. --- 01746 //J ------------------------------------ 01747 o20_re += i00_re+i20_re; 01748 o20_im += i00_im+i20_im; 01749 o21_re += i01_re+i21_re; 01750 o21_im += i01_im+i21_im; 01751 o22_re += i02_re+i22_re; 01752 o22_im += i02_im+i22_im; 01753 01754 //J ------------------------------------- 01755 //J --- Dirac index 3, Colors 0,1,2. --- 01756 //J ------------------------------------- 01757 o30_re += i10_re+i30_re; 01758 o30_im += i10_im+i30_im; 01759 o31_re += i11_re+i31_re; 01760 o31_im += i11_im+i31_im; 01761 o32_re += i12_re+i32_re; 01762 o32_im += i12_im+i32_im; 01763 01764 } // End (x,0) <= (x,s) < (x,Ls-1). 01765 else { 01766 //J RH boundary s=Ls-1, forwards hop to s=0. 01767 //J Term to add: -mferm*P_R*psi(x,0) 01768 01769 //J --- Dirac index 0, Colors 0,1,2. --- 01770 // color 0 (second index) 01771 o00_re += -mferm*(i00_re+i20_re); 01772 o00_im += -mferm*(i00_im+i20_im); 01773 // color 1 01774 o01_re += -mferm*(i01_re+i21_re); 01775 o01_im += -mferm*(i01_im+i21_im); 01776 // color 2 01777 o02_re += -mferm*(i02_re+i22_re); 01778 o02_im += -mferm*(i02_im+i22_im); 01779 01780 //J --- Dirac index 1, Colors 0,1,2. --- 01781 // color 0 01782 o10_re += -mferm*(i10_re+i30_re); 01783 o10_im += -mferm*(i10_im+i30_im); 01784 // color 1 01785 o11_re += -mferm*(i11_re+i31_re); 01786 o11_im += -mferm*(i11_im+i31_im); 01787 // color 2 01788 o12_re += -mferm*(i12_re+i32_re); 01789 o12_im += -mferm*(i12_im+i32_im); 01790 01791 //J --- Dirac index 2, Colors 0,1,2. --- 01792 // color 0 (second index) 01793 o20_re += -mferm*(i00_re+i20_re); 01794 o20_im += -mferm*(i00_im+i20_im); 01795 // color 1 01796 o21_re += -mferm*(i01_re+i21_re); 01797 o21_im += -mferm*(i01_im+i21_im); 01798 // color 2 01799 o22_re += -mferm*(i02_re+i22_re); 01800 o22_im += -mferm*(i02_im+i22_im); 01801 01802 //J --- Dirac index 3, Colors 0,1,2. --- 01803 // color 0 01804 o30_re += -mferm*(i10_re+i30_re); 01805 o30_im += -mferm*(i10_im+i30_im); 01806 // color 1 01807 o31_re += -mferm*(i11_re+i31_re); 01808 o31_im += -mferm*(i11_im+i31_im); 01809 // color 2 01810 o32_re += -mferm*(i12_re+i32_re); 01811 o32_im += -mferm*(i12_im+i32_im); 01812 // 01813 } // End (x,s)=(x,Ls-1) 01814 } 01815 // ----- end dwf s-direction ---- 01816 01817 } // end s-direction block 01818 01819 01820 // Perform the DSLASH_XPAY operations. 01821 // Undefine all the macros. TODO Make sure that this 01822 // is working right for the diagonal terms of DWF. 01823 //#include "dslash_core_post.h" 01824 01825 //J The "a" variable is an argument passed within xpay functions. Look at 01826 //J the macro DD_PARAM2 in dslash_dwf_def.h. 01827 //J READ_ACCUM is defined in io_spinor.h. In truth, it points 01828 //J to READ_ACCUM_SINGLE() in the case of single precision, etc. 01829 //J It performs tex1dfetch() calls on ACCUMTEX. ACCUMTEX stands for 01830 //J accumTexSingle, which gets bound in dslash_quda.cu prior to calling 01831 //J the kernel that is defined below. That texture is a cache of the "x" 01832 //J argument to dslashXpayS_dwf_Cuda. 01833 01834 // out_spinor = a*out_spinor + x 01835 // This is useful if we have, say 01836 // -kappa^2 * D_{eo} psi_o + psi_e 01837 // such as occurs in Lines near 666 of dslash_quda.cu. 01838 01839 #ifdef DSLASH_XPAY 01840 READ_ACCUM(ACCUMTEX, sp_stride) 01841 #ifdef SPINOR_DOUBLE 01842 //J This all looks like diagonal terms, 01843 //J out = a*out+in. 01844 //J Q. Why use "accum" rather than I0...I11? Look at io_spinor.h. 01845 // Also note that in dslash_quda.cu, a bunch of binding is performed 01846 // that may be related to what sits in "accum". 01847 o00_re = a*o00_re + accum0.x; 01848 o00_im = a*o00_im + accum0.y; 01849 o01_re = a*o01_re + accum1.x; 01850 o01_im = a*o01_im + accum1.y; 01851 o02_re = a*o02_re + accum2.x; 01852 o02_im = a*o02_im + accum2.y; 01853 o10_re = a*o10_re + accum3.x; 01854 o10_im = a*o10_im + accum3.y; 01855 o11_re = a*o11_re + accum4.x; 01856 o11_im = a*o11_im + accum4.y; 01857 o12_re = a*o12_re + accum5.x; 01858 o12_im = a*o12_im + accum5.y; 01859 o20_re = a*o20_re + accum6.x; 01860 o20_im = a*o20_im + accum6.y; 01861 o21_re = a*o21_re + accum7.x; 01862 o21_im = a*o21_im + accum7.y; 01863 o22_re = a*o22_re + accum8.x; 01864 o22_im = a*o22_im + accum8.y; 01865 o30_re = a*o30_re + accum9.x; 01866 o30_im = a*o30_im + accum9.y; 01867 o31_re = a*o31_re + accum10.x; 01868 o31_im = a*o31_im + accum10.y; 01869 o32_re = a*o32_re + accum11.x; 01870 o32_im = a*o32_im + accum11.y; 01871 #else 01872 o00_re = a*o00_re + accum0.x; 01873 o00_im = a*o00_im + accum0.y; 01874 o01_re = a*o01_re + accum0.z; 01875 o01_im = a*o01_im + accum0.w; 01876 o02_re = a*o02_re + accum1.x; 01877 o02_im = a*o02_im + accum1.y; 01878 o10_re = a*o10_re + accum1.z; 01879 o10_im = a*o10_im + accum1.w; 01880 o11_re = a*o11_re + accum2.x; 01881 o11_im = a*o11_im + accum2.y; 01882 o12_re = a*o12_re + accum2.z; 01883 o12_im = a*o12_im + accum2.w; 01884 o20_re = a*o20_re + accum3.x; 01885 o20_im = a*o20_im + accum3.y; 01886 o21_re = a*o21_re + accum3.z; 01887 o21_im = a*o21_im + accum3.w; 01888 o22_re = a*o22_re + accum4.x; 01889 o22_im = a*o22_im + accum4.y; 01890 o30_re = a*o30_re + accum4.z; 01891 o30_im = a*o30_im + accum4.w; 01892 o31_re = a*o31_re + accum5.x; 01893 o31_im = a*o31_im + accum5.y; 01894 o32_re = a*o32_re + accum5.z; 01895 o32_im = a*o32_im + accum5.w; 01896 #endif // DD_SPREC 01897 #endif // DSLASH_XPAY 01898 01899 01900 // write spinor field back to device memory 01901 WRITE_SPINOR(sp_stride); 01902 01903 // undefine to prevent warning when precision is changed 01904 #undef spinorFloat 01905 #undef A_re 01906 #undef A_im 01907 01908 #undef g00_re 01909 #undef g00_im 01910 #undef g01_re 01911 #undef g01_im 01912 #undef g02_re 01913 #undef g02_im 01914 #undef g10_re 01915 #undef g10_im 01916 #undef g11_re 01917 #undef g11_im 01918 #undef g12_re 01919 #undef g12_im 01920 #undef g20_re 01921 #undef g20_im 01922 #undef g21_re 01923 #undef g21_im 01924 #undef g22_re 01925 #undef g22_im 01926 01927 #undef i00_re 01928 #undef i00_im 01929 #undef i01_re 01930 #undef i01_im 01931 #undef i02_re 01932 #undef i02_im 01933 #undef i10_re 01934 #undef i10_im 01935 #undef i11_re 01936 #undef i11_im 01937 #undef i12_re 01938 #undef i12_im 01939 #undef i20_re 01940 #undef i20_im 01941 #undef i21_re 01942 #undef i21_im 01943 #undef i22_re 01944 #undef i22_im 01945 #undef i30_re 01946 #undef i30_im 01947 #undef i31_re 01948 #undef i31_im 01949 #undef i32_re 01950 #undef i32_im 01951 01952 01953