QUDA v0.3.2
A library for QCD on GPUs

quda/lib/dslash_core/tm_dslash_core.h

Go to the documentation of this file.
00001 // *** CUDA DSLASH ***
00002 
00003 #define SHARED_FLOATS_PER_THREAD 8
00004 
00005 // input spinor
00006 #ifdef SPINOR_DOUBLE
00007 #define spinorFloat double
00008 #define i00_re I0.x
00009 #define i00_im I0.y
00010 #define i01_re I1.x
00011 #define i01_im I1.y
00012 #define i02_re I2.x
00013 #define i02_im I2.y
00014 #define i10_re I3.x
00015 #define i10_im I3.y
00016 #define i11_re I4.x
00017 #define i11_im I4.y
00018 #define i12_re I5.x
00019 #define i12_im I5.y
00020 #define i20_re I6.x
00021 #define i20_im I6.y
00022 #define i21_re I7.x
00023 #define i21_im I7.y
00024 #define i22_re I8.x
00025 #define i22_im I8.y
00026 #define i30_re I9.x
00027 #define i30_im I9.y
00028 #define i31_re I10.x
00029 #define i31_im I10.y
00030 #define i32_re I11.x
00031 #define i32_im I11.y
00032 
00033 #else
00034 #define spinorFloat float
00035 #define i00_re I0.x
00036 #define i00_im I0.y
00037 #define i01_re I0.z
00038 #define i01_im I0.w
00039 #define i02_re I1.x
00040 #define i02_im I1.y
00041 #define i10_re I1.z
00042 #define i10_im I1.w
00043 #define i11_re I2.x
00044 #define i11_im I2.y
00045 #define i12_re I2.z
00046 #define i12_im I2.w
00047 #define i20_re I3.x
00048 #define i20_im I3.y
00049 #define i21_re I3.z
00050 #define i21_im I3.w
00051 #define i22_re I4.x
00052 #define i22_im I4.y
00053 #define i30_re I4.z
00054 #define i30_im I4.w
00055 #define i31_re I5.x
00056 #define i31_im I5.y
00057 #define i32_re I5.z
00058 #define i32_im I5.w
00059 #endif // SPINOR_DOUBLE
00060 
00061 // gauge link
00062 #ifdef GAUGE_FLOAT2
00063 #define g00_re G0.x
00064 #define g00_im G0.y
00065 #define g01_re G1.x
00066 #define g01_im G1.y
00067 #define g02_re G2.x
00068 #define g02_im G2.y
00069 #define g10_re G3.x
00070 #define g10_im G3.y
00071 #define g11_re G4.x
00072 #define g11_im G4.y
00073 #define g12_re G5.x
00074 #define g12_im G5.y
00075 #define g20_re G6.x
00076 #define g20_im G6.y
00077 #define g21_re G7.x
00078 #define g21_im G7.y
00079 #define g22_re G8.x
00080 #define g22_im G8.y
00081 // temporaries
00082 #define A_re G9.x
00083 #define A_im G9.y
00084 
00085 #else
00086 #define g00_re G0.x
00087 #define g00_im G0.y
00088 #define g01_re G0.z
00089 #define g01_im G0.w
00090 #define g02_re G1.x
00091 #define g02_im G1.y
00092 #define g10_re G1.z
00093 #define g10_im G1.w
00094 #define g11_re G2.x
00095 #define g11_im G2.y
00096 #define g12_re G2.z
00097 #define g12_im G2.w
00098 #define g20_re G3.x
00099 #define g20_im G3.y
00100 #define g21_re G3.z
00101 #define g21_im G3.w
00102 #define g22_re G4.x
00103 #define g22_im G4.y
00104 // temporaries
00105 #define A_re G4.z
00106 #define A_im G4.w
00107 
00108 #endif // GAUGE_DOUBLE
00109 
00110 // conjugated gauge link
00111 #define gT00_re (+g00_re)
00112 #define gT00_im (-g00_im)
00113 #define gT01_re (+g10_re)
00114 #define gT01_im (-g10_im)
00115 #define gT02_re (+g20_re)
00116 #define gT02_im (-g20_im)
00117 #define gT10_re (+g01_re)
00118 #define gT10_im (-g01_im)
00119 #define gT11_re (+g11_re)
00120 #define gT11_im (-g11_im)
00121 #define gT12_re (+g21_re)
00122 #define gT12_im (-g21_im)
00123 #define gT20_re (+g02_re)
00124 #define gT20_im (-g02_im)
00125 #define gT21_re (+g12_re)
00126 #define gT21_im (-g12_im)
00127 #define gT22_re (+g22_re)
00128 #define gT22_im (-g22_im)
00129 
00130 // output spinor
00131 #define o00_re s[0*SHARED_STRIDE]
00132 #define o00_im s[1*SHARED_STRIDE]
00133 #define o01_re s[2*SHARED_STRIDE]
00134 #define o01_im s[3*SHARED_STRIDE]
00135 #define o02_re s[4*SHARED_STRIDE]
00136 #define o02_im s[5*SHARED_STRIDE]
00137 #define o10_re s[6*SHARED_STRIDE]
00138 #define o10_im s[7*SHARED_STRIDE]
00139 volatile spinorFloat o11_re;
00140 volatile spinorFloat o11_im;
00141 volatile spinorFloat o12_re;
00142 volatile spinorFloat o12_im;
00143 volatile spinorFloat o20_re;
00144 volatile spinorFloat o20_im;
00145 volatile spinorFloat o21_re;
00146 volatile spinorFloat o21_im;
00147 volatile spinorFloat o22_re;
00148 volatile spinorFloat o22_im;
00149 volatile spinorFloat o30_re;
00150 volatile spinorFloat o30_im;
00151 volatile spinorFloat o31_re;
00152 volatile spinorFloat o31_im;
00153 volatile spinorFloat o32_re;
00154 volatile spinorFloat o32_im;
00155 
00156 
00157 
00158 #include "read_gauge.h"
00159 #include "read_clover.h"
00160 #include "io_spinor.h"
00161 
00162 int sid = blockIdx.x*blockDim.x + threadIdx.x;
00163 int z1 = FAST_INT_DIVIDE(sid, X1h);
00164 int x1h = sid - z1*X1h;
00165 int z2 = FAST_INT_DIVIDE(z1, X2);
00166 int x2 = z1 - z2*X2;
00167 int x4 = FAST_INT_DIVIDE(z2, X3);
00168 int x3 = z2 - x4*X3;
00169 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
00170 int x1 = 2*x1h + x1odd;
00171 int X = 2*sid + x1odd;
00172 
00173 #ifdef SPINOR_DOUBLE
00174 #if (__CUDA_ARCH__ >= 200)
00175 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
00176 #else
00177 #define SHARED_STRIDE  8 // to avoid bank conflicts on G80 and GT200
00178 #endif
00179 extern __shared__ spinorFloat sd_data[];
00180 volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
00181                                   + (threadIdx.x % SHARED_STRIDE);
00182 #else
00183 #if (__CUDA_ARCH__ >= 200)
00184 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
00185 #else
00186 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
00187 #endif
00188 extern __shared__ spinorFloat ss_data[];
00189 volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
00190                                   + (threadIdx.x % SHARED_STRIDE);
00191 #endif
00192 
00193 o00_re = o00_im = 0;
00194 o01_re = o01_im = 0;
00195 o02_re = o02_im = 0;
00196 o10_re = o10_im = 0;
00197 o11_re = o11_im = 0;
00198 o12_re = o12_im = 0;
00199 o20_re = o20_im = 0;
00200 o21_re = o21_im = 0;
00201 o22_re = o22_im = 0;
00202 o30_re = o30_im = 0;
00203 o31_re = o31_im = 0;
00204 o32_re = o32_im = 0;
00205 
00206 {
00207     // Projector P0-
00208     // 1 0 0 -i 
00209     // 0 1 -i 0 
00210     // 0 i 1 0 
00211     // i 0 0 1 
00212     
00213     int sp_idx = ((x1==X1m1) ? X-X1m1 : X+1) >> 1;
00214     int ga_idx = sid;
00215     
00216     // read gauge matrix from device memory
00217     READ_GAUGE_MATRIX(GAUGE0TEX, 0);
00218     
00219     // read spinor from device memory
00220     READ_SPINOR(SPINORTEX);
00221     
00222     // reconstruct gauge matrix
00223     RECONSTRUCT_GAUGE_MATRIX(0);
00224     
00225     // project spinor into half spinors
00226     spinorFloat a0_re = +i00_re+i30_im;
00227     spinorFloat a0_im = +i00_im-i30_re;
00228     spinorFloat a1_re = +i01_re+i31_im;
00229     spinorFloat a1_im = +i01_im-i31_re;
00230     spinorFloat a2_re = +i02_re+i32_im;
00231     spinorFloat a2_im = +i02_im-i32_re;
00232     
00233     spinorFloat b0_re = +i10_re+i20_im;
00234     spinorFloat b0_im = +i10_im-i20_re;
00235     spinorFloat b1_re = +i11_re+i21_im;
00236     spinorFloat b1_im = +i11_im-i21_re;
00237     spinorFloat b2_re = +i12_re+i22_im;
00238     spinorFloat b2_im = +i12_im-i22_re;
00239     
00240     // multiply row 0
00241     spinorFloat A0_re = 0;
00242     A0_re += g00_re * a0_re;
00243     A0_re -= g00_im * a0_im;
00244     A0_re += g01_re * a1_re;
00245     A0_re -= g01_im * a1_im;
00246     A0_re += g02_re * a2_re;
00247     A0_re -= g02_im * a2_im;
00248     spinorFloat A0_im = 0;
00249     A0_im += g00_re * a0_im;
00250     A0_im += g00_im * a0_re;
00251     A0_im += g01_re * a1_im;
00252     A0_im += g01_im * a1_re;
00253     A0_im += g02_re * a2_im;
00254     A0_im += g02_im * a2_re;
00255     spinorFloat B0_re = 0;
00256     B0_re += g00_re * b0_re;
00257     B0_re -= g00_im * b0_im;
00258     B0_re += g01_re * b1_re;
00259     B0_re -= g01_im * b1_im;
00260     B0_re += g02_re * b2_re;
00261     B0_re -= g02_im * b2_im;
00262     spinorFloat B0_im = 0;
00263     B0_im += g00_re * b0_im;
00264     B0_im += g00_im * b0_re;
00265     B0_im += g01_re * b1_im;
00266     B0_im += g01_im * b1_re;
00267     B0_im += g02_re * b2_im;
00268     B0_im += g02_im * b2_re;
00269     
00270     // multiply row 1
00271     spinorFloat A1_re = 0;
00272     A1_re += g10_re * a0_re;
00273     A1_re -= g10_im * a0_im;
00274     A1_re += g11_re * a1_re;
00275     A1_re -= g11_im * a1_im;
00276     A1_re += g12_re * a2_re;
00277     A1_re -= g12_im * a2_im;
00278     spinorFloat A1_im = 0;
00279     A1_im += g10_re * a0_im;
00280     A1_im += g10_im * a0_re;
00281     A1_im += g11_re * a1_im;
00282     A1_im += g11_im * a1_re;
00283     A1_im += g12_re * a2_im;
00284     A1_im += g12_im * a2_re;
00285     spinorFloat B1_re = 0;
00286     B1_re += g10_re * b0_re;
00287     B1_re -= g10_im * b0_im;
00288     B1_re += g11_re * b1_re;
00289     B1_re -= g11_im * b1_im;
00290     B1_re += g12_re * b2_re;
00291     B1_re -= g12_im * b2_im;
00292     spinorFloat B1_im = 0;
00293     B1_im += g10_re * b0_im;
00294     B1_im += g10_im * b0_re;
00295     B1_im += g11_re * b1_im;
00296     B1_im += g11_im * b1_re;
00297     B1_im += g12_re * b2_im;
00298     B1_im += g12_im * b2_re;
00299     
00300     // multiply row 2
00301     spinorFloat A2_re = 0;
00302     A2_re += g20_re * a0_re;
00303     A2_re -= g20_im * a0_im;
00304     A2_re += g21_re * a1_re;
00305     A2_re -= g21_im * a1_im;
00306     A2_re += g22_re * a2_re;
00307     A2_re -= g22_im * a2_im;
00308     spinorFloat A2_im = 0;
00309     A2_im += g20_re * a0_im;
00310     A2_im += g20_im * a0_re;
00311     A2_im += g21_re * a1_im;
00312     A2_im += g21_im * a1_re;
00313     A2_im += g22_re * a2_im;
00314     A2_im += g22_im * a2_re;
00315     spinorFloat B2_re = 0;
00316     B2_re += g20_re * b0_re;
00317     B2_re -= g20_im * b0_im;
00318     B2_re += g21_re * b1_re;
00319     B2_re -= g21_im * b1_im;
00320     B2_re += g22_re * b2_re;
00321     B2_re -= g22_im * b2_im;
00322     spinorFloat B2_im = 0;
00323     B2_im += g20_re * b0_im;
00324     B2_im += g20_im * b0_re;
00325     B2_im += g21_re * b1_im;
00326     B2_im += g21_im * b1_re;
00327     B2_im += g22_re * b2_im;
00328     B2_im += g22_im * b2_re;
00329     
00330     o00_re += A0_re;
00331     o00_im += A0_im;
00332     o10_re += B0_re;
00333     o10_im += B0_im;
00334     o20_re -= B0_im;
00335     o20_im += B0_re;
00336     o30_re -= A0_im;
00337     o30_im += A0_re;
00338     
00339     o01_re += A1_re;
00340     o01_im += A1_im;
00341     o11_re += B1_re;
00342     o11_im += B1_im;
00343     o21_re -= B1_im;
00344     o21_im += B1_re;
00345     o31_re -= A1_im;
00346     o31_im += A1_re;
00347     
00348     o02_re += A2_re;
00349     o02_im += A2_im;
00350     o12_re += B2_re;
00351     o12_im += B2_im;
00352     o22_re -= B2_im;
00353     o22_im += B2_re;
00354     o32_re -= A2_im;
00355     o32_im += A2_re;
00356     
00357 }
00358 
00359 {
00360     // Projector P0+
00361     // 1 0 0 i 
00362     // 0 1 i 0 
00363     // 0 -i 1 0 
00364     // -i 0 0 1 
00365     
00366     int sp_idx = ((x1==0)    ? X+X1m1 : X-1) >> 1;
00367     int ga_idx = sp_idx;
00368     
00369     // read gauge matrix from device memory
00370     READ_GAUGE_MATRIX(GAUGE1TEX, 1);
00371     
00372     // read spinor from device memory
00373     READ_SPINOR(SPINORTEX);
00374     
00375     // reconstruct gauge matrix
00376     RECONSTRUCT_GAUGE_MATRIX(1);
00377     
00378     // project spinor into half spinors
00379     spinorFloat a0_re = +i00_re-i30_im;
00380     spinorFloat a0_im = +i00_im+i30_re;
00381     spinorFloat a1_re = +i01_re-i31_im;
00382     spinorFloat a1_im = +i01_im+i31_re;
00383     spinorFloat a2_re = +i02_re-i32_im;
00384     spinorFloat a2_im = +i02_im+i32_re;
00385     
00386     spinorFloat b0_re = +i10_re-i20_im;
00387     spinorFloat b0_im = +i10_im+i20_re;
00388     spinorFloat b1_re = +i11_re-i21_im;
00389     spinorFloat b1_im = +i11_im+i21_re;
00390     spinorFloat b2_re = +i12_re-i22_im;
00391     spinorFloat b2_im = +i12_im+i22_re;
00392     
00393     // multiply row 0
00394     spinorFloat A0_re = 0;
00395     A0_re += gT00_re * a0_re;
00396     A0_re -= gT00_im * a0_im;
00397     A0_re += gT01_re * a1_re;
00398     A0_re -= gT01_im * a1_im;
00399     A0_re += gT02_re * a2_re;
00400     A0_re -= gT02_im * a2_im;
00401     spinorFloat A0_im = 0;
00402     A0_im += gT00_re * a0_im;
00403     A0_im += gT00_im * a0_re;
00404     A0_im += gT01_re * a1_im;
00405     A0_im += gT01_im * a1_re;
00406     A0_im += gT02_re * a2_im;
00407     A0_im += gT02_im * a2_re;
00408     spinorFloat B0_re = 0;
00409     B0_re += gT00_re * b0_re;
00410     B0_re -= gT00_im * b0_im;
00411     B0_re += gT01_re * b1_re;
00412     B0_re -= gT01_im * b1_im;
00413     B0_re += gT02_re * b2_re;
00414     B0_re -= gT02_im * b2_im;
00415     spinorFloat B0_im = 0;
00416     B0_im += gT00_re * b0_im;
00417     B0_im += gT00_im * b0_re;
00418     B0_im += gT01_re * b1_im;
00419     B0_im += gT01_im * b1_re;
00420     B0_im += gT02_re * b2_im;
00421     B0_im += gT02_im * b2_re;
00422     
00423     // multiply row 1
00424     spinorFloat A1_re = 0;
00425     A1_re += gT10_re * a0_re;
00426     A1_re -= gT10_im * a0_im;
00427     A1_re += gT11_re * a1_re;
00428     A1_re -= gT11_im * a1_im;
00429     A1_re += gT12_re * a2_re;
00430     A1_re -= gT12_im * a2_im;
00431     spinorFloat A1_im = 0;
00432     A1_im += gT10_re * a0_im;
00433     A1_im += gT10_im * a0_re;
00434     A1_im += gT11_re * a1_im;
00435     A1_im += gT11_im * a1_re;
00436     A1_im += gT12_re * a2_im;
00437     A1_im += gT12_im * a2_re;
00438     spinorFloat B1_re = 0;
00439     B1_re += gT10_re * b0_re;
00440     B1_re -= gT10_im * b0_im;
00441     B1_re += gT11_re * b1_re;
00442     B1_re -= gT11_im * b1_im;
00443     B1_re += gT12_re * b2_re;
00444     B1_re -= gT12_im * b2_im;
00445     spinorFloat B1_im = 0;
00446     B1_im += gT10_re * b0_im;
00447     B1_im += gT10_im * b0_re;
00448     B1_im += gT11_re * b1_im;
00449     B1_im += gT11_im * b1_re;
00450     B1_im += gT12_re * b2_im;
00451     B1_im += gT12_im * b2_re;
00452     
00453     // multiply row 2
00454     spinorFloat A2_re = 0;
00455     A2_re += gT20_re * a0_re;
00456     A2_re -= gT20_im * a0_im;
00457     A2_re += gT21_re * a1_re;
00458     A2_re -= gT21_im * a1_im;
00459     A2_re += gT22_re * a2_re;
00460     A2_re -= gT22_im * a2_im;
00461     spinorFloat A2_im = 0;
00462     A2_im += gT20_re * a0_im;
00463     A2_im += gT20_im * a0_re;
00464     A2_im += gT21_re * a1_im;
00465     A2_im += gT21_im * a1_re;
00466     A2_im += gT22_re * a2_im;
00467     A2_im += gT22_im * a2_re;
00468     spinorFloat B2_re = 0;
00469     B2_re += gT20_re * b0_re;
00470     B2_re -= gT20_im * b0_im;
00471     B2_re += gT21_re * b1_re;
00472     B2_re -= gT21_im * b1_im;
00473     B2_re += gT22_re * b2_re;
00474     B2_re -= gT22_im * b2_im;
00475     spinorFloat B2_im = 0;
00476     B2_im += gT20_re * b0_im;
00477     B2_im += gT20_im * b0_re;
00478     B2_im += gT21_re * b1_im;
00479     B2_im += gT21_im * b1_re;
00480     B2_im += gT22_re * b2_im;
00481     B2_im += gT22_im * b2_re;
00482     
00483     o00_re += A0_re;
00484     o00_im += A0_im;
00485     o10_re += B0_re;
00486     o10_im += B0_im;
00487     o20_re += B0_im;
00488     o20_im -= B0_re;
00489     o30_re += A0_im;
00490     o30_im -= A0_re;
00491     
00492     o01_re += A1_re;
00493     o01_im += A1_im;
00494     o11_re += B1_re;
00495     o11_im += B1_im;
00496     o21_re += B1_im;
00497     o21_im -= B1_re;
00498     o31_re += A1_im;
00499     o31_im -= A1_re;
00500     
00501     o02_re += A2_re;
00502     o02_im += A2_im;
00503     o12_re += B2_re;
00504     o12_im += B2_im;
00505     o22_re += B2_im;
00506     o22_im -= B2_re;
00507     o32_re += A2_im;
00508     o32_im -= A2_re;
00509     
00510 }
00511 
00512 {
00513     // Projector P1-
00514     // 1 0 0 -1 
00515     // 0 1 1 0 
00516     // 0 1 1 0 
00517     // -1 0 0 1 
00518     
00519     int sp_idx = ((x2==X2m1) ? X-X2X1mX1 : X+X1) >> 1;
00520     int ga_idx = sid;
00521     
00522     // read gauge matrix from device memory
00523     READ_GAUGE_MATRIX(GAUGE0TEX, 2);
00524     
00525     // read spinor from device memory
00526     READ_SPINOR(SPINORTEX);
00527     
00528     // reconstruct gauge matrix
00529     RECONSTRUCT_GAUGE_MATRIX(2);
00530     
00531     // project spinor into half spinors
00532     spinorFloat a0_re = +i00_re-i30_re;
00533     spinorFloat a0_im = +i00_im-i30_im;
00534     spinorFloat a1_re = +i01_re-i31_re;
00535     spinorFloat a1_im = +i01_im-i31_im;
00536     spinorFloat a2_re = +i02_re-i32_re;
00537     spinorFloat a2_im = +i02_im-i32_im;
00538     
00539     spinorFloat b0_re = +i10_re+i20_re;
00540     spinorFloat b0_im = +i10_im+i20_im;
00541     spinorFloat b1_re = +i11_re+i21_re;
00542     spinorFloat b1_im = +i11_im+i21_im;
00543     spinorFloat b2_re = +i12_re+i22_re;
00544     spinorFloat b2_im = +i12_im+i22_im;
00545     
00546     // multiply row 0
00547     spinorFloat A0_re = 0;
00548     A0_re += g00_re * a0_re;
00549     A0_re -= g00_im * a0_im;
00550     A0_re += g01_re * a1_re;
00551     A0_re -= g01_im * a1_im;
00552     A0_re += g02_re * a2_re;
00553     A0_re -= g02_im * a2_im;
00554     spinorFloat A0_im = 0;
00555     A0_im += g00_re * a0_im;
00556     A0_im += g00_im * a0_re;
00557     A0_im += g01_re * a1_im;
00558     A0_im += g01_im * a1_re;
00559     A0_im += g02_re * a2_im;
00560     A0_im += g02_im * a2_re;
00561     spinorFloat B0_re = 0;
00562     B0_re += g00_re * b0_re;
00563     B0_re -= g00_im * b0_im;
00564     B0_re += g01_re * b1_re;
00565     B0_re -= g01_im * b1_im;
00566     B0_re += g02_re * b2_re;
00567     B0_re -= g02_im * b2_im;
00568     spinorFloat B0_im = 0;
00569     B0_im += g00_re * b0_im;
00570     B0_im += g00_im * b0_re;
00571     B0_im += g01_re * b1_im;
00572     B0_im += g01_im * b1_re;
00573     B0_im += g02_re * b2_im;
00574     B0_im += g02_im * b2_re;
00575     
00576     // multiply row 1
00577     spinorFloat A1_re = 0;
00578     A1_re += g10_re * a0_re;
00579     A1_re -= g10_im * a0_im;
00580     A1_re += g11_re * a1_re;
00581     A1_re -= g11_im * a1_im;
00582     A1_re += g12_re * a2_re;
00583     A1_re -= g12_im * a2_im;
00584     spinorFloat A1_im = 0;
00585     A1_im += g10_re * a0_im;
00586     A1_im += g10_im * a0_re;
00587     A1_im += g11_re * a1_im;
00588     A1_im += g11_im * a1_re;
00589     A1_im += g12_re * a2_im;
00590     A1_im += g12_im * a2_re;
00591     spinorFloat B1_re = 0;
00592     B1_re += g10_re * b0_re;
00593     B1_re -= g10_im * b0_im;
00594     B1_re += g11_re * b1_re;
00595     B1_re -= g11_im * b1_im;
00596     B1_re += g12_re * b2_re;
00597     B1_re -= g12_im * b2_im;
00598     spinorFloat B1_im = 0;
00599     B1_im += g10_re * b0_im;
00600     B1_im += g10_im * b0_re;
00601     B1_im += g11_re * b1_im;
00602     B1_im += g11_im * b1_re;
00603     B1_im += g12_re * b2_im;
00604     B1_im += g12_im * b2_re;
00605     
00606     // multiply row 2
00607     spinorFloat A2_re = 0;
00608     A2_re += g20_re * a0_re;
00609     A2_re -= g20_im * a0_im;
00610     A2_re += g21_re * a1_re;
00611     A2_re -= g21_im * a1_im;
00612     A2_re += g22_re * a2_re;
00613     A2_re -= g22_im * a2_im;
00614     spinorFloat A2_im = 0;
00615     A2_im += g20_re * a0_im;
00616     A2_im += g20_im * a0_re;
00617     A2_im += g21_re * a1_im;
00618     A2_im += g21_im * a1_re;
00619     A2_im += g22_re * a2_im;
00620     A2_im += g22_im * a2_re;
00621     spinorFloat B2_re = 0;
00622     B2_re += g20_re * b0_re;
00623     B2_re -= g20_im * b0_im;
00624     B2_re += g21_re * b1_re;
00625     B2_re -= g21_im * b1_im;
00626     B2_re += g22_re * b2_re;
00627     B2_re -= g22_im * b2_im;
00628     spinorFloat B2_im = 0;
00629     B2_im += g20_re * b0_im;
00630     B2_im += g20_im * b0_re;
00631     B2_im += g21_re * b1_im;
00632     B2_im += g21_im * b1_re;
00633     B2_im += g22_re * b2_im;
00634     B2_im += g22_im * b2_re;
00635     
00636     o00_re += A0_re;
00637     o00_im += A0_im;
00638     o10_re += B0_re;
00639     o10_im += B0_im;
00640     o20_re += B0_re;
00641     o20_im += B0_im;
00642     o30_re -= A0_re;
00643     o30_im -= A0_im;
00644     
00645     o01_re += A1_re;
00646     o01_im += A1_im;
00647     o11_re += B1_re;
00648     o11_im += B1_im;
00649     o21_re += B1_re;
00650     o21_im += B1_im;
00651     o31_re -= A1_re;
00652     o31_im -= A1_im;
00653     
00654     o02_re += A2_re;
00655     o02_im += A2_im;
00656     o12_re += B2_re;
00657     o12_im += B2_im;
00658     o22_re += B2_re;
00659     o22_im += B2_im;
00660     o32_re -= A2_re;
00661     o32_im -= A2_im;
00662     
00663 }
00664 
00665 {
00666     // Projector P1+
00667     // 1 0 0 1 
00668     // 0 1 -1 0 
00669     // 0 -1 1 0 
00670     // 1 0 0 1 
00671     
00672     int sp_idx = ((x2==0)    ? X+X2X1mX1 : X-X1) >> 1;
00673     int ga_idx = sp_idx;
00674     
00675     // read gauge matrix from device memory
00676     READ_GAUGE_MATRIX(GAUGE1TEX, 3);
00677     
00678     // read spinor from device memory
00679     READ_SPINOR(SPINORTEX);
00680     
00681     // reconstruct gauge matrix
00682     RECONSTRUCT_GAUGE_MATRIX(3);
00683     
00684     // project spinor into half spinors
00685     spinorFloat a0_re = +i00_re+i30_re;
00686     spinorFloat a0_im = +i00_im+i30_im;
00687     spinorFloat a1_re = +i01_re+i31_re;
00688     spinorFloat a1_im = +i01_im+i31_im;
00689     spinorFloat a2_re = +i02_re+i32_re;
00690     spinorFloat a2_im = +i02_im+i32_im;
00691     
00692     spinorFloat b0_re = +i10_re-i20_re;
00693     spinorFloat b0_im = +i10_im-i20_im;
00694     spinorFloat b1_re = +i11_re-i21_re;
00695     spinorFloat b1_im = +i11_im-i21_im;
00696     spinorFloat b2_re = +i12_re-i22_re;
00697     spinorFloat b2_im = +i12_im-i22_im;
00698     
00699     // multiply row 0
00700     spinorFloat A0_re = 0;
00701     A0_re += gT00_re * a0_re;
00702     A0_re -= gT00_im * a0_im;
00703     A0_re += gT01_re * a1_re;
00704     A0_re -= gT01_im * a1_im;
00705     A0_re += gT02_re * a2_re;
00706     A0_re -= gT02_im * a2_im;
00707     spinorFloat A0_im = 0;
00708     A0_im += gT00_re * a0_im;
00709     A0_im += gT00_im * a0_re;
00710     A0_im += gT01_re * a1_im;
00711     A0_im += gT01_im * a1_re;
00712     A0_im += gT02_re * a2_im;
00713     A0_im += gT02_im * a2_re;
00714     spinorFloat B0_re = 0;
00715     B0_re += gT00_re * b0_re;
00716     B0_re -= gT00_im * b0_im;
00717     B0_re += gT01_re * b1_re;
00718     B0_re -= gT01_im * b1_im;
00719     B0_re += gT02_re * b2_re;
00720     B0_re -= gT02_im * b2_im;
00721     spinorFloat B0_im = 0;
00722     B0_im += gT00_re * b0_im;
00723     B0_im += gT00_im * b0_re;
00724     B0_im += gT01_re * b1_im;
00725     B0_im += gT01_im * b1_re;
00726     B0_im += gT02_re * b2_im;
00727     B0_im += gT02_im * b2_re;
00728     
00729     // multiply row 1
00730     spinorFloat A1_re = 0;
00731     A1_re += gT10_re * a0_re;
00732     A1_re -= gT10_im * a0_im;
00733     A1_re += gT11_re * a1_re;
00734     A1_re -= gT11_im * a1_im;
00735     A1_re += gT12_re * a2_re;
00736     A1_re -= gT12_im * a2_im;
00737     spinorFloat A1_im = 0;
00738     A1_im += gT10_re * a0_im;
00739     A1_im += gT10_im * a0_re;
00740     A1_im += gT11_re * a1_im;
00741     A1_im += gT11_im * a1_re;
00742     A1_im += gT12_re * a2_im;
00743     A1_im += gT12_im * a2_re;
00744     spinorFloat B1_re = 0;
00745     B1_re += gT10_re * b0_re;
00746     B1_re -= gT10_im * b0_im;
00747     B1_re += gT11_re * b1_re;
00748     B1_re -= gT11_im * b1_im;
00749     B1_re += gT12_re * b2_re;
00750     B1_re -= gT12_im * b2_im;
00751     spinorFloat B1_im = 0;
00752     B1_im += gT10_re * b0_im;
00753     B1_im += gT10_im * b0_re;
00754     B1_im += gT11_re * b1_im;
00755     B1_im += gT11_im * b1_re;
00756     B1_im += gT12_re * b2_im;
00757     B1_im += gT12_im * b2_re;
00758     
00759     // multiply row 2
00760     spinorFloat A2_re = 0;
00761     A2_re += gT20_re * a0_re;
00762     A2_re -= gT20_im * a0_im;
00763     A2_re += gT21_re * a1_re;
00764     A2_re -= gT21_im * a1_im;
00765     A2_re += gT22_re * a2_re;
00766     A2_re -= gT22_im * a2_im;
00767     spinorFloat A2_im = 0;
00768     A2_im += gT20_re * a0_im;
00769     A2_im += gT20_im * a0_re;
00770     A2_im += gT21_re * a1_im;
00771     A2_im += gT21_im * a1_re;
00772     A2_im += gT22_re * a2_im;
00773     A2_im += gT22_im * a2_re;
00774     spinorFloat B2_re = 0;
00775     B2_re += gT20_re * b0_re;
00776     B2_re -= gT20_im * b0_im;
00777     B2_re += gT21_re * b1_re;
00778     B2_re -= gT21_im * b1_im;
00779     B2_re += gT22_re * b2_re;
00780     B2_re -= gT22_im * b2_im;
00781     spinorFloat B2_im = 0;
00782     B2_im += gT20_re * b0_im;
00783     B2_im += gT20_im * b0_re;
00784     B2_im += gT21_re * b1_im;
00785     B2_im += gT21_im * b1_re;
00786     B2_im += gT22_re * b2_im;
00787     B2_im += gT22_im * b2_re;
00788     
00789     o00_re += A0_re;
00790     o00_im += A0_im;
00791     o10_re += B0_re;
00792     o10_im += B0_im;
00793     o20_re -= B0_re;
00794     o20_im -= B0_im;
00795     o30_re += A0_re;
00796     o30_im += A0_im;
00797     
00798     o01_re += A1_re;
00799     o01_im += A1_im;
00800     o11_re += B1_re;
00801     o11_im += B1_im;
00802     o21_re -= B1_re;
00803     o21_im -= B1_im;
00804     o31_re += A1_re;
00805     o31_im += A1_im;
00806     
00807     o02_re += A2_re;
00808     o02_im += A2_im;
00809     o12_re += B2_re;
00810     o12_im += B2_im;
00811     o22_re -= B2_re;
00812     o22_im -= B2_im;
00813     o32_re += A2_re;
00814     o32_im += A2_im;
00815     
00816 }
00817 
00818 {
00819     // Projector P2-
00820     // 1 0 -i 0 
00821     // 0 1 0 i 
00822     // i 0 1 0 
00823     // 0 -i 0 1 
00824     
00825     int sp_idx = ((x3==X3m1) ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
00826     int ga_idx = sid;
00827     
00828     // read gauge matrix from device memory
00829     READ_GAUGE_MATRIX(GAUGE0TEX, 4);
00830     
00831     // read spinor from device memory
00832     READ_SPINOR(SPINORTEX);
00833     
00834     // reconstruct gauge matrix
00835     RECONSTRUCT_GAUGE_MATRIX(4);
00836     
00837     // project spinor into half spinors
00838     spinorFloat a0_re = +i00_re+i20_im;
00839     spinorFloat a0_im = +i00_im-i20_re;
00840     spinorFloat a1_re = +i01_re+i21_im;
00841     spinorFloat a1_im = +i01_im-i21_re;
00842     spinorFloat a2_re = +i02_re+i22_im;
00843     spinorFloat a2_im = +i02_im-i22_re;
00844     
00845     spinorFloat b0_re = +i10_re-i30_im;
00846     spinorFloat b0_im = +i10_im+i30_re;
00847     spinorFloat b1_re = +i11_re-i31_im;
00848     spinorFloat b1_im = +i11_im+i31_re;
00849     spinorFloat b2_re = +i12_re-i32_im;
00850     spinorFloat b2_im = +i12_im+i32_re;
00851     
00852     // multiply row 0
00853     spinorFloat A0_re = 0;
00854     A0_re += g00_re * a0_re;
00855     A0_re -= g00_im * a0_im;
00856     A0_re += g01_re * a1_re;
00857     A0_re -= g01_im * a1_im;
00858     A0_re += g02_re * a2_re;
00859     A0_re -= g02_im * a2_im;
00860     spinorFloat A0_im = 0;
00861     A0_im += g00_re * a0_im;
00862     A0_im += g00_im * a0_re;
00863     A0_im += g01_re * a1_im;
00864     A0_im += g01_im * a1_re;
00865     A0_im += g02_re * a2_im;
00866     A0_im += g02_im * a2_re;
00867     spinorFloat B0_re = 0;
00868     B0_re += g00_re * b0_re;
00869     B0_re -= g00_im * b0_im;
00870     B0_re += g01_re * b1_re;
00871     B0_re -= g01_im * b1_im;
00872     B0_re += g02_re * b2_re;
00873     B0_re -= g02_im * b2_im;
00874     spinorFloat B0_im = 0;
00875     B0_im += g00_re * b0_im;
00876     B0_im += g00_im * b0_re;
00877     B0_im += g01_re * b1_im;
00878     B0_im += g01_im * b1_re;
00879     B0_im += g02_re * b2_im;
00880     B0_im += g02_im * b2_re;
00881     
00882     // multiply row 1
00883     spinorFloat A1_re = 0;
00884     A1_re += g10_re * a0_re;
00885     A1_re -= g10_im * a0_im;
00886     A1_re += g11_re * a1_re;
00887     A1_re -= g11_im * a1_im;
00888     A1_re += g12_re * a2_re;
00889     A1_re -= g12_im * a2_im;
00890     spinorFloat A1_im = 0;
00891     A1_im += g10_re * a0_im;
00892     A1_im += g10_im * a0_re;
00893     A1_im += g11_re * a1_im;
00894     A1_im += g11_im * a1_re;
00895     A1_im += g12_re * a2_im;
00896     A1_im += g12_im * a2_re;
00897     spinorFloat B1_re = 0;
00898     B1_re += g10_re * b0_re;
00899     B1_re -= g10_im * b0_im;
00900     B1_re += g11_re * b1_re;
00901     B1_re -= g11_im * b1_im;
00902     B1_re += g12_re * b2_re;
00903     B1_re -= g12_im * b2_im;
00904     spinorFloat B1_im = 0;
00905     B1_im += g10_re * b0_im;
00906     B1_im += g10_im * b0_re;
00907     B1_im += g11_re * b1_im;
00908     B1_im += g11_im * b1_re;
00909     B1_im += g12_re * b2_im;
00910     B1_im += g12_im * b2_re;
00911     
00912     // multiply row 2
00913     spinorFloat A2_re = 0;
00914     A2_re += g20_re * a0_re;
00915     A2_re -= g20_im * a0_im;
00916     A2_re += g21_re * a1_re;
00917     A2_re -= g21_im * a1_im;
00918     A2_re += g22_re * a2_re;
00919     A2_re -= g22_im * a2_im;
00920     spinorFloat A2_im = 0;
00921     A2_im += g20_re * a0_im;
00922     A2_im += g20_im * a0_re;
00923     A2_im += g21_re * a1_im;
00924     A2_im += g21_im * a1_re;
00925     A2_im += g22_re * a2_im;
00926     A2_im += g22_im * a2_re;
00927     spinorFloat B2_re = 0;
00928     B2_re += g20_re * b0_re;
00929     B2_re -= g20_im * b0_im;
00930     B2_re += g21_re * b1_re;
00931     B2_re -= g21_im * b1_im;
00932     B2_re += g22_re * b2_re;
00933     B2_re -= g22_im * b2_im;
00934     spinorFloat B2_im = 0;
00935     B2_im += g20_re * b0_im;
00936     B2_im += g20_im * b0_re;
00937     B2_im += g21_re * b1_im;
00938     B2_im += g21_im * b1_re;
00939     B2_im += g22_re * b2_im;
00940     B2_im += g22_im * b2_re;
00941     
00942     o00_re += A0_re;
00943     o00_im += A0_im;
00944     o10_re += B0_re;
00945     o10_im += B0_im;
00946     o20_re -= A0_im;
00947     o20_im += A0_re;
00948     o30_re += B0_im;
00949     o30_im -= B0_re;
00950     
00951     o01_re += A1_re;
00952     o01_im += A1_im;
00953     o11_re += B1_re;
00954     o11_im += B1_im;
00955     o21_re -= A1_im;
00956     o21_im += A1_re;
00957     o31_re += B1_im;
00958     o31_im -= B1_re;
00959     
00960     o02_re += A2_re;
00961     o02_im += A2_im;
00962     o12_re += B2_re;
00963     o12_im += B2_im;
00964     o22_re -= A2_im;
00965     o22_im += A2_re;
00966     o32_re += B2_im;
00967     o32_im -= B2_re;
00968     
00969 }
00970 
00971 {
00972     // Projector P2+
00973     // 1 0 i 0 
00974     // 0 1 0 -i 
00975     // -i 0 1 0 
00976     // 0 i 0 1 
00977     
00978     int sp_idx = ((x3==0)    ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
00979     int ga_idx = sp_idx;
00980     
00981     // read gauge matrix from device memory
00982     READ_GAUGE_MATRIX(GAUGE1TEX, 5);
00983     
00984     // read spinor from device memory
00985     READ_SPINOR(SPINORTEX);
00986     
00987     // reconstruct gauge matrix
00988     RECONSTRUCT_GAUGE_MATRIX(5);
00989     
00990     // project spinor into half spinors
00991     spinorFloat a0_re = +i00_re-i20_im;
00992     spinorFloat a0_im = +i00_im+i20_re;
00993     spinorFloat a1_re = +i01_re-i21_im;
00994     spinorFloat a1_im = +i01_im+i21_re;
00995     spinorFloat a2_re = +i02_re-i22_im;
00996     spinorFloat a2_im = +i02_im+i22_re;
00997     
00998     spinorFloat b0_re = +i10_re+i30_im;
00999     spinorFloat b0_im = +i10_im-i30_re;
01000     spinorFloat b1_re = +i11_re+i31_im;
01001     spinorFloat b1_im = +i11_im-i31_re;
01002     spinorFloat b2_re = +i12_re+i32_im;
01003     spinorFloat b2_im = +i12_im-i32_re;
01004     
01005     // multiply row 0
01006     spinorFloat A0_re = 0;
01007     A0_re += gT00_re * a0_re;
01008     A0_re -= gT00_im * a0_im;
01009     A0_re += gT01_re * a1_re;
01010     A0_re -= gT01_im * a1_im;
01011     A0_re += gT02_re * a2_re;
01012     A0_re -= gT02_im * a2_im;
01013     spinorFloat A0_im = 0;
01014     A0_im += gT00_re * a0_im;
01015     A0_im += gT00_im * a0_re;
01016     A0_im += gT01_re * a1_im;
01017     A0_im += gT01_im * a1_re;
01018     A0_im += gT02_re * a2_im;
01019     A0_im += gT02_im * a2_re;
01020     spinorFloat B0_re = 0;
01021     B0_re += gT00_re * b0_re;
01022     B0_re -= gT00_im * b0_im;
01023     B0_re += gT01_re * b1_re;
01024     B0_re -= gT01_im * b1_im;
01025     B0_re += gT02_re * b2_re;
01026     B0_re -= gT02_im * b2_im;
01027     spinorFloat B0_im = 0;
01028     B0_im += gT00_re * b0_im;
01029     B0_im += gT00_im * b0_re;
01030     B0_im += gT01_re * b1_im;
01031     B0_im += gT01_im * b1_re;
01032     B0_im += gT02_re * b2_im;
01033     B0_im += gT02_im * b2_re;
01034     
01035     // multiply row 1
01036     spinorFloat A1_re = 0;
01037     A1_re += gT10_re * a0_re;
01038     A1_re -= gT10_im * a0_im;
01039     A1_re += gT11_re * a1_re;
01040     A1_re -= gT11_im * a1_im;
01041     A1_re += gT12_re * a2_re;
01042     A1_re -= gT12_im * a2_im;
01043     spinorFloat A1_im = 0;
01044     A1_im += gT10_re * a0_im;
01045     A1_im += gT10_im * a0_re;
01046     A1_im += gT11_re * a1_im;
01047     A1_im += gT11_im * a1_re;
01048     A1_im += gT12_re * a2_im;
01049     A1_im += gT12_im * a2_re;
01050     spinorFloat B1_re = 0;
01051     B1_re += gT10_re * b0_re;
01052     B1_re -= gT10_im * b0_im;
01053     B1_re += gT11_re * b1_re;
01054     B1_re -= gT11_im * b1_im;
01055     B1_re += gT12_re * b2_re;
01056     B1_re -= gT12_im * b2_im;
01057     spinorFloat B1_im = 0;
01058     B1_im += gT10_re * b0_im;
01059     B1_im += gT10_im * b0_re;
01060     B1_im += gT11_re * b1_im;
01061     B1_im += gT11_im * b1_re;
01062     B1_im += gT12_re * b2_im;
01063     B1_im += gT12_im * b2_re;
01064     
01065     // multiply row 2
01066     spinorFloat A2_re = 0;
01067     A2_re += gT20_re * a0_re;
01068     A2_re -= gT20_im * a0_im;
01069     A2_re += gT21_re * a1_re;
01070     A2_re -= gT21_im * a1_im;
01071     A2_re += gT22_re * a2_re;
01072     A2_re -= gT22_im * a2_im;
01073     spinorFloat A2_im = 0;
01074     A2_im += gT20_re * a0_im;
01075     A2_im += gT20_im * a0_re;
01076     A2_im += gT21_re * a1_im;
01077     A2_im += gT21_im * a1_re;
01078     A2_im += gT22_re * a2_im;
01079     A2_im += gT22_im * a2_re;
01080     spinorFloat B2_re = 0;
01081     B2_re += gT20_re * b0_re;
01082     B2_re -= gT20_im * b0_im;
01083     B2_re += gT21_re * b1_re;
01084     B2_re -= gT21_im * b1_im;
01085     B2_re += gT22_re * b2_re;
01086     B2_re -= gT22_im * b2_im;
01087     spinorFloat B2_im = 0;
01088     B2_im += gT20_re * b0_im;
01089     B2_im += gT20_im * b0_re;
01090     B2_im += gT21_re * b1_im;
01091     B2_im += gT21_im * b1_re;
01092     B2_im += gT22_re * b2_im;
01093     B2_im += gT22_im * b2_re;
01094     
01095     o00_re += A0_re;
01096     o00_im += A0_im;
01097     o10_re += B0_re;
01098     o10_im += B0_im;
01099     o20_re += A0_im;
01100     o20_im -= A0_re;
01101     o30_re -= B0_im;
01102     o30_im += B0_re;
01103     
01104     o01_re += A1_re;
01105     o01_im += A1_im;
01106     o11_re += B1_re;
01107     o11_im += B1_im;
01108     o21_re += A1_im;
01109     o21_im -= A1_re;
01110     o31_re -= B1_im;
01111     o31_im += B1_re;
01112     
01113     o02_re += A2_re;
01114     o02_im += A2_im;
01115     o12_re += B2_re;
01116     o12_im += B2_im;
01117     o22_re += A2_im;
01118     o22_im -= A2_re;
01119     o32_re -= B2_im;
01120     o32_im += B2_re;
01121     
01122 }
01123 
01124 {
01125     // Projector P3-
01126     // 0 0 0 0 
01127     // 0 0 0 0 
01128     // 0 0 2 0 
01129     // 0 0 0 2 
01130     
01131     int sp_idx = ((x4==X4m1) ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
01132     int ga_idx = sid;
01133     
01134     if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
01135         // read spinor from device memory
01136         READ_SPINOR_DOWN(SPINORTEX);
01137         
01138         // project spinor into half spinors
01139         spinorFloat a0_re = +2*i20_re;
01140         spinorFloat a0_im = +2*i20_im;
01141         spinorFloat a1_re = +2*i21_re;
01142         spinorFloat a1_im = +2*i21_im;
01143         spinorFloat a2_re = +2*i22_re;
01144         spinorFloat a2_im = +2*i22_im;
01145         
01146         spinorFloat b0_re = +2*i30_re;
01147         spinorFloat b0_im = +2*i30_im;
01148         spinorFloat b1_re = +2*i31_re;
01149         spinorFloat b1_im = +2*i31_im;
01150         spinorFloat b2_re = +2*i32_re;
01151         spinorFloat b2_im = +2*i32_im;
01152         
01153         // identity gauge matrix
01154         spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
01155         spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
01156         spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
01157         spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
01158         spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
01159         spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
01160         
01161         o20_re += A0_re;
01162         o20_im += A0_im;
01163         o30_re += B0_re;
01164         o30_im += B0_im;
01165         
01166         o21_re += A1_re;
01167         o21_im += A1_im;
01168         o31_re += B1_re;
01169         o31_im += B1_im;
01170         
01171         o22_re += A2_re;
01172         o22_im += A2_im;
01173         o32_re += B2_re;
01174         o32_im += B2_im;
01175         
01176     }
01177     else {
01178         // read gauge matrix from device memory
01179         READ_GAUGE_MATRIX(GAUGE0TEX, 6);
01180         
01181         // read spinor from device memory
01182         READ_SPINOR_DOWN(SPINORTEX);
01183         
01184         // reconstruct gauge matrix
01185         RECONSTRUCT_GAUGE_MATRIX(6);
01186         
01187         // project spinor into half spinors
01188         spinorFloat a0_re = +2*i20_re;
01189         spinorFloat a0_im = +2*i20_im;
01190         spinorFloat a1_re = +2*i21_re;
01191         spinorFloat a1_im = +2*i21_im;
01192         spinorFloat a2_re = +2*i22_re;
01193         spinorFloat a2_im = +2*i22_im;
01194         
01195         spinorFloat b0_re = +2*i30_re;
01196         spinorFloat b0_im = +2*i30_im;
01197         spinorFloat b1_re = +2*i31_re;
01198         spinorFloat b1_im = +2*i31_im;
01199         spinorFloat b2_re = +2*i32_re;
01200         spinorFloat b2_im = +2*i32_im;
01201         
01202         // multiply row 0
01203         spinorFloat A0_re = 0;
01204         A0_re += g00_re * a0_re;
01205         A0_re -= g00_im * a0_im;
01206         A0_re += g01_re * a1_re;
01207         A0_re -= g01_im * a1_im;
01208         A0_re += g02_re * a2_re;
01209         A0_re -= g02_im * a2_im;
01210         spinorFloat A0_im = 0;
01211         A0_im += g00_re * a0_im;
01212         A0_im += g00_im * a0_re;
01213         A0_im += g01_re * a1_im;
01214         A0_im += g01_im * a1_re;
01215         A0_im += g02_re * a2_im;
01216         A0_im += g02_im * a2_re;
01217         spinorFloat B0_re = 0;
01218         B0_re += g00_re * b0_re;
01219         B0_re -= g00_im * b0_im;
01220         B0_re += g01_re * b1_re;
01221         B0_re -= g01_im * b1_im;
01222         B0_re += g02_re * b2_re;
01223         B0_re -= g02_im * b2_im;
01224         spinorFloat B0_im = 0;
01225         B0_im += g00_re * b0_im;
01226         B0_im += g00_im * b0_re;
01227         B0_im += g01_re * b1_im;
01228         B0_im += g01_im * b1_re;
01229         B0_im += g02_re * b2_im;
01230         B0_im += g02_im * b2_re;
01231         
01232         // multiply row 1
01233         spinorFloat A1_re = 0;
01234         A1_re += g10_re * a0_re;
01235         A1_re -= g10_im * a0_im;
01236         A1_re += g11_re * a1_re;
01237         A1_re -= g11_im * a1_im;
01238         A1_re += g12_re * a2_re;
01239         A1_re -= g12_im * a2_im;
01240         spinorFloat A1_im = 0;
01241         A1_im += g10_re * a0_im;
01242         A1_im += g10_im * a0_re;
01243         A1_im += g11_re * a1_im;
01244         A1_im += g11_im * a1_re;
01245         A1_im += g12_re * a2_im;
01246         A1_im += g12_im * a2_re;
01247         spinorFloat B1_re = 0;
01248         B1_re += g10_re * b0_re;
01249         B1_re -= g10_im * b0_im;
01250         B1_re += g11_re * b1_re;
01251         B1_re -= g11_im * b1_im;
01252         B1_re += g12_re * b2_re;
01253         B1_re -= g12_im * b2_im;
01254         spinorFloat B1_im = 0;
01255         B1_im += g10_re * b0_im;
01256         B1_im += g10_im * b0_re;
01257         B1_im += g11_re * b1_im;
01258         B1_im += g11_im * b1_re;
01259         B1_im += g12_re * b2_im;
01260         B1_im += g12_im * b2_re;
01261         
01262         // multiply row 2
01263         spinorFloat A2_re = 0;
01264         A2_re += g20_re * a0_re;
01265         A2_re -= g20_im * a0_im;
01266         A2_re += g21_re * a1_re;
01267         A2_re -= g21_im * a1_im;
01268         A2_re += g22_re * a2_re;
01269         A2_re -= g22_im * a2_im;
01270         spinorFloat A2_im = 0;
01271         A2_im += g20_re * a0_im;
01272         A2_im += g20_im * a0_re;
01273         A2_im += g21_re * a1_im;
01274         A2_im += g21_im * a1_re;
01275         A2_im += g22_re * a2_im;
01276         A2_im += g22_im * a2_re;
01277         spinorFloat B2_re = 0;
01278         B2_re += g20_re * b0_re;
01279         B2_re -= g20_im * b0_im;
01280         B2_re += g21_re * b1_re;
01281         B2_re -= g21_im * b1_im;
01282         B2_re += g22_re * b2_re;
01283         B2_re -= g22_im * b2_im;
01284         spinorFloat B2_im = 0;
01285         B2_im += g20_re * b0_im;
01286         B2_im += g20_im * b0_re;
01287         B2_im += g21_re * b1_im;
01288         B2_im += g21_im * b1_re;
01289         B2_im += g22_re * b2_im;
01290         B2_im += g22_im * b2_re;
01291         
01292         o20_re += A0_re;
01293         o20_im += A0_im;
01294         o30_re += B0_re;
01295         o30_im += B0_im;
01296         
01297         o21_re += A1_re;
01298         o21_im += A1_im;
01299         o31_re += B1_re;
01300         o31_im += B1_im;
01301         
01302         o22_re += A2_re;
01303         o22_im += A2_im;
01304         o32_re += B2_re;
01305         o32_im += B2_im;
01306         
01307     }
01308 }
01309 
01310 {
01311     // Projector P3+
01312     // 2 0 0 0 
01313     // 0 2 0 0 
01314     // 0 0 0 0 
01315     // 0 0 0 0 
01316     
01317     int sp_idx = ((x4==0)    ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
01318     int ga_idx = sp_idx;
01319     
01320     if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h) {
01321         // read spinor from device memory
01322         READ_SPINOR_UP(SPINORTEX);
01323         
01324         // project spinor into half spinors
01325         spinorFloat a0_re = +2*i00_re;
01326         spinorFloat a0_im = +2*i00_im;
01327         spinorFloat a1_re = +2*i01_re;
01328         spinorFloat a1_im = +2*i01_im;
01329         spinorFloat a2_re = +2*i02_re;
01330         spinorFloat a2_im = +2*i02_im;
01331         
01332         spinorFloat b0_re = +2*i10_re;
01333         spinorFloat b0_im = +2*i10_im;
01334         spinorFloat b1_re = +2*i11_re;
01335         spinorFloat b1_im = +2*i11_im;
01336         spinorFloat b2_re = +2*i12_re;
01337         spinorFloat b2_im = +2*i12_im;
01338         
01339         // identity gauge matrix
01340         spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
01341         spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
01342         spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
01343         spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
01344         spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
01345         spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
01346         
01347         o00_re += A0_re;
01348         o00_im += A0_im;
01349         o10_re += B0_re;
01350         o10_im += B0_im;
01351         
01352         o01_re += A1_re;
01353         o01_im += A1_im;
01354         o11_re += B1_re;
01355         o11_im += B1_im;
01356         
01357         o02_re += A2_re;
01358         o02_im += A2_im;
01359         o12_re += B2_re;
01360         o12_im += B2_im;
01361         
01362     }
01363     else {
01364         // read gauge matrix from device memory
01365         READ_GAUGE_MATRIX(GAUGE1TEX, 7);
01366         
01367         // read spinor from device memory
01368         READ_SPINOR_UP(SPINORTEX);
01369         
01370         // reconstruct gauge matrix
01371         RECONSTRUCT_GAUGE_MATRIX(7);
01372         
01373         // project spinor into half spinors
01374         spinorFloat a0_re = +2*i00_re;
01375         spinorFloat a0_im = +2*i00_im;
01376         spinorFloat a1_re = +2*i01_re;
01377         spinorFloat a1_im = +2*i01_im;
01378         spinorFloat a2_re = +2*i02_re;
01379         spinorFloat a2_im = +2*i02_im;
01380         
01381         spinorFloat b0_re = +2*i10_re;
01382         spinorFloat b0_im = +2*i10_im;
01383         spinorFloat b1_re = +2*i11_re;
01384         spinorFloat b1_im = +2*i11_im;
01385         spinorFloat b2_re = +2*i12_re;
01386         spinorFloat b2_im = +2*i12_im;
01387         
01388         // multiply row 0
01389         spinorFloat A0_re = 0;
01390         A0_re += gT00_re * a0_re;
01391         A0_re -= gT00_im * a0_im;
01392         A0_re += gT01_re * a1_re;
01393         A0_re -= gT01_im * a1_im;
01394         A0_re += gT02_re * a2_re;
01395         A0_re -= gT02_im * a2_im;
01396         spinorFloat A0_im = 0;
01397         A0_im += gT00_re * a0_im;
01398         A0_im += gT00_im * a0_re;
01399         A0_im += gT01_re * a1_im;
01400         A0_im += gT01_im * a1_re;
01401         A0_im += gT02_re * a2_im;
01402         A0_im += gT02_im * a2_re;
01403         spinorFloat B0_re = 0;
01404         B0_re += gT00_re * b0_re;
01405         B0_re -= gT00_im * b0_im;
01406         B0_re += gT01_re * b1_re;
01407         B0_re -= gT01_im * b1_im;
01408         B0_re += gT02_re * b2_re;
01409         B0_re -= gT02_im * b2_im;
01410         spinorFloat B0_im = 0;
01411         B0_im += gT00_re * b0_im;
01412         B0_im += gT00_im * b0_re;
01413         B0_im += gT01_re * b1_im;
01414         B0_im += gT01_im * b1_re;
01415         B0_im += gT02_re * b2_im;
01416         B0_im += gT02_im * b2_re;
01417         
01418         // multiply row 1
01419         spinorFloat A1_re = 0;
01420         A1_re += gT10_re * a0_re;
01421         A1_re -= gT10_im * a0_im;
01422         A1_re += gT11_re * a1_re;
01423         A1_re -= gT11_im * a1_im;
01424         A1_re += gT12_re * a2_re;
01425         A1_re -= gT12_im * a2_im;
01426         spinorFloat A1_im = 0;
01427         A1_im += gT10_re * a0_im;
01428         A1_im += gT10_im * a0_re;
01429         A1_im += gT11_re * a1_im;
01430         A1_im += gT11_im * a1_re;
01431         A1_im += gT12_re * a2_im;
01432         A1_im += gT12_im * a2_re;
01433         spinorFloat B1_re = 0;
01434         B1_re += gT10_re * b0_re;
01435         B1_re -= gT10_im * b0_im;
01436         B1_re += gT11_re * b1_re;
01437         B1_re -= gT11_im * b1_im;
01438         B1_re += gT12_re * b2_re;
01439         B1_re -= gT12_im * b2_im;
01440         spinorFloat B1_im = 0;
01441         B1_im += gT10_re * b0_im;
01442         B1_im += gT10_im * b0_re;
01443         B1_im += gT11_re * b1_im;
01444         B1_im += gT11_im * b1_re;
01445         B1_im += gT12_re * b2_im;
01446         B1_im += gT12_im * b2_re;
01447         
01448         // multiply row 2
01449         spinorFloat A2_re = 0;
01450         A2_re += gT20_re * a0_re;
01451         A2_re -= gT20_im * a0_im;
01452         A2_re += gT21_re * a1_re;
01453         A2_re -= gT21_im * a1_im;
01454         A2_re += gT22_re * a2_re;
01455         A2_re -= gT22_im * a2_im;
01456         spinorFloat A2_im = 0;
01457         A2_im += gT20_re * a0_im;
01458         A2_im += gT20_im * a0_re;
01459         A2_im += gT21_re * a1_im;
01460         A2_im += gT21_im * a1_re;
01461         A2_im += gT22_re * a2_im;
01462         A2_im += gT22_im * a2_re;
01463         spinorFloat B2_re = 0;
01464         B2_re += gT20_re * b0_re;
01465         B2_re -= gT20_im * b0_im;
01466         B2_re += gT21_re * b1_re;
01467         B2_re -= gT21_im * b1_im;
01468         B2_re += gT22_re * b2_re;
01469         B2_re -= gT22_im * b2_im;
01470         spinorFloat B2_im = 0;
01471         B2_im += gT20_re * b0_im;
01472         B2_im += gT20_im * b0_re;
01473         B2_im += gT21_re * b1_im;
01474         B2_im += gT21_im * b1_re;
01475         B2_im += gT22_re * b2_im;
01476         B2_im += gT22_im * b2_re;
01477         
01478         o00_re += A0_re;
01479         o00_im += A0_im;
01480         o10_re += B0_re;
01481         o10_im += B0_im;
01482         
01483         o01_re += A1_re;
01484         o01_im += A1_im;
01485         o11_re += B1_re;
01486         o11_im += B1_im;
01487         
01488         o02_re += A2_re;
01489         o02_im += A2_im;
01490         o12_re += B2_re;
01491         o12_im += B2_im;
01492         
01493     }
01494 }
01495 
01496 {
01497     // apply twisted mass rotation
01498     volatile spinorFloat tmp00_re = +o00_re-o20_im*a;
01499     volatile spinorFloat tmp00_im = +o00_im+o20_re*a;
01500     volatile spinorFloat tmp01_re = +o01_re-o21_im*a;
01501     volatile spinorFloat tmp01_im = +o01_im+o21_re*a;
01502     volatile spinorFloat tmp02_re = +o02_re-o22_im*a;
01503     volatile spinorFloat tmp02_im = +o02_im+o22_re*a;
01504     
01505     volatile spinorFloat tmp10_re = +o10_re-o30_im*a;
01506     volatile spinorFloat tmp10_im = +o10_im+o30_re*a;
01507     volatile spinorFloat tmp11_re = +o11_re-o31_im*a;
01508     volatile spinorFloat tmp11_im = +o11_im+o31_re*a;
01509     volatile spinorFloat tmp12_re = +o12_re-o32_im*a;
01510     volatile spinorFloat tmp12_im = +o12_im+o32_re*a;
01511     
01512     volatile spinorFloat tmp20_re = -o00_im*a+o20_re;
01513     volatile spinorFloat tmp20_im = +o00_re*a+o20_im;
01514     volatile spinorFloat tmp21_re = -o01_im*a+o21_re;
01515     volatile spinorFloat tmp21_im = +o01_re*a+o21_im;
01516     volatile spinorFloat tmp22_re = -o02_im*a+o22_re;
01517     volatile spinorFloat tmp22_im = +o02_re*a+o22_im;
01518     
01519     volatile spinorFloat tmp30_re = -o10_im*a+o30_re;
01520     volatile spinorFloat tmp30_im = +o10_re*a+o30_im;
01521     volatile spinorFloat tmp31_re = -o11_im*a+o31_re;
01522     volatile spinorFloat tmp31_im = +o11_re*a+o31_im;
01523     volatile spinorFloat tmp32_re = -o12_im*a+o32_re;
01524     volatile spinorFloat tmp32_im = +o12_re*a+o32_im;
01525     
01526     
01527     #ifndef DSLASH_XPAY
01528     //scale by b = 1/(1 + a*a) 
01529     o00_re = b*tmp00_re;
01530     o00_im = b*tmp00_im;
01531     o01_re = b*tmp01_re;
01532     o01_im = b*tmp01_im;
01533     o02_re = b*tmp02_re;
01534     o02_im = b*tmp02_im;
01535     o10_re = b*tmp10_re;
01536     o10_im = b*tmp10_im;
01537     o11_re = b*tmp11_re;
01538     o11_im = b*tmp11_im;
01539     o12_re = b*tmp12_re;
01540     o12_im = b*tmp12_im;
01541     o20_re = b*tmp20_re;
01542     o20_im = b*tmp20_im;
01543     o21_re = b*tmp21_re;
01544     o21_im = b*tmp21_im;
01545     o22_re = b*tmp22_re;
01546     o22_im = b*tmp22_im;
01547     o30_re = b*tmp30_re;
01548     o30_im = b*tmp30_im;
01549     o31_re = b*tmp31_re;
01550     o31_im = b*tmp31_im;
01551     o32_re = b*tmp32_re;
01552     o32_im = b*tmp32_im;
01553     #else
01554     o00_re = tmp00_re;
01555     o00_im = tmp00_im;
01556     o01_re = tmp01_re;
01557     o01_im = tmp01_im;
01558     o02_re = tmp02_re;
01559     o02_im = tmp02_im;
01560     o10_re = tmp10_re;
01561     o10_im = tmp10_im;
01562     o11_re = tmp11_re;
01563     o11_im = tmp11_im;
01564     o12_re = tmp12_re;
01565     o12_im = tmp12_im;
01566     o20_re = tmp20_re;
01567     o20_im = tmp20_im;
01568     o21_re = tmp21_re;
01569     o21_im = tmp21_im;
01570     o22_re = tmp22_re;
01571     o22_im = tmp22_im;
01572     o30_re = tmp30_re;
01573     o30_im = tmp30_im;
01574     o31_re = tmp31_re;
01575     o31_im = tmp31_im;
01576     o32_re = tmp32_re;
01577     o32_im = tmp32_im;
01578     #endif // DSLASH_XPAY
01579     
01580 }
01581 
01582 
01583 #ifdef DSLASH_XPAY
01584     READ_ACCUM(ACCUMTEX)
01585 #ifdef SPINOR_DOUBLE
01586     o00_re = b*o00_re + accum0.x;
01587     o00_im = b*o00_im + accum0.y;
01588     o01_re = b*o01_re + accum1.x;
01589     o01_im = b*o01_im + accum1.y;
01590     o02_re = b*o02_re + accum2.x;
01591     o02_im = b*o02_im + accum2.y;
01592     o10_re = b*o10_re + accum3.x;
01593     o10_im = b*o10_im + accum3.y;
01594     o11_re = b*o11_re + accum4.x;
01595     o11_im = b*o11_im + accum4.y;
01596     o12_re = b*o12_re + accum5.x;
01597     o12_im = b*o12_im + accum5.y;
01598     o20_re = b*o20_re + accum6.x;
01599     o20_im = b*o20_im + accum6.y;
01600     o21_re = b*o21_re + accum7.x;
01601     o21_im = b*o21_im + accum7.y;
01602     o22_re = b*o22_re + accum8.x;
01603     o22_im = b*o22_im + accum8.y;
01604     o30_re = b*o30_re + accum9.x;
01605     o30_im = b*o30_im + accum9.y;
01606     o31_re = b*o31_re + accum10.x;
01607     o31_im = b*o31_im + accum10.y;
01608     o32_re = b*o32_re + accum11.x;
01609     o32_im = b*o32_im + accum11.y;
01610 #else
01611     o00_re = b*o00_re + accum0.x;
01612     o00_im = b*o00_im + accum0.y;
01613     o01_re = b*o01_re + accum0.z;
01614     o01_im = b*o01_im + accum0.w;
01615     o02_re = b*o02_re + accum1.x;
01616     o02_im = b*o02_im + accum1.y;
01617     o10_re = b*o10_re + accum1.z;
01618     o10_im = b*o10_im + accum1.w;
01619     o11_re = b*o11_re + accum2.x;
01620     o11_im = b*o11_im + accum2.y;
01621     o12_re = b*o12_re + accum2.z;
01622     o12_im = b*o12_im + accum2.w;
01623     o20_re = b*o20_re + accum3.x;
01624     o20_im = b*o20_im + accum3.y;
01625     o21_re = b*o21_re + accum3.z;
01626     o21_im = b*o21_im + accum3.w;
01627     o22_re = b*o22_re + accum4.x;
01628     o22_im = b*o22_im + accum4.y;
01629     o30_re = b*o30_re + accum4.z;
01630     o30_im = b*o30_im + accum4.w;
01631     o31_re = b*o31_re + accum5.x;
01632     o31_im = b*o31_im + accum5.y;
01633     o32_re = b*o32_re + accum5.z;
01634     o32_im = b*o32_im + accum5.w;
01635 #endif // SPINOR_DOUBLE
01636 #endif // DSLASH_XPAY
01637 
01638 
01639     // write spinor field back to device memory
01640     WRITE_SPINOR();
01641 
01642 // undefine to prevent warning when precision is changed
01643 #undef spinorFloat
01644 #undef SHARED_STRIDE
01645 
01646 #undef A_re
01647 #undef A_im
01648 
01649 #undef g00_re
01650 #undef g00_im
01651 #undef g01_re
01652 #undef g01_im
01653 #undef g02_re
01654 #undef g02_im
01655 #undef g10_re
01656 #undef g10_im
01657 #undef g11_re
01658 #undef g11_im
01659 #undef g12_re
01660 #undef g12_im
01661 #undef g20_re
01662 #undef g20_im
01663 #undef g21_re
01664 #undef g21_im
01665 #undef g22_re
01666 #undef g22_im
01667 
01668 #undef i00_re
01669 #undef i00_im
01670 #undef i01_re
01671 #undef i01_im
01672 #undef i02_re
01673 #undef i02_im
01674 #undef i10_re
01675 #undef i10_im
01676 #undef i11_re
01677 #undef i11_im
01678 #undef i12_re
01679 #undef i12_im
01680 #undef i20_re
01681 #undef i20_im
01682 #undef i21_re
01683 #undef i21_im
01684 #undef i22_re
01685 #undef i22_im
01686 #undef i30_re
01687 #undef i30_im
01688 #undef i31_re
01689 #undef i31_im
01690 #undef i32_re
01691 #undef i32_im
01692 
01693 #undef o00_re
01694 #undef o00_im
01695 #undef o01_re
01696 #undef o01_im
01697 #undef o02_re
01698 #undef o02_im
01699 #undef o10_re
01700 #undef o10_im
01701 
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines