QUDA v0.3.2
A library for QCD on GPUs

quda/lib/dslash_core/clover_core.h

Go to the documentation of this file.
00001 // *** CUDA CLOVER ***
00002 
00003 #define SHARED_FLOATS_PER_THREAD 8
00004 
00005 // input spinor
00006 #ifdef SPINOR_DOUBLE
00007 #define spinorFloat double
00008 #define i00_re I0.x
00009 #define i00_im I0.y
00010 #define i01_re I1.x
00011 #define i01_im I1.y
00012 #define i02_re I2.x
00013 #define i02_im I2.y
00014 #define i10_re I3.x
00015 #define i10_im I3.y
00016 #define i11_re I4.x
00017 #define i11_im I4.y
00018 #define i12_re I5.x
00019 #define i12_im I5.y
00020 #define i20_re I6.x
00021 #define i20_im I6.y
00022 #define i21_re I7.x
00023 #define i21_im I7.y
00024 #define i22_re I8.x
00025 #define i22_im I8.y
00026 #define i30_re I9.x
00027 #define i30_im I9.y
00028 #define i31_re I10.x
00029 #define i31_im I10.y
00030 #define i32_re I11.x
00031 #define i32_im I11.y
00032 #else
00033 #define spinorFloat float
00034 #define i00_re I0.x
00035 #define i00_im I0.y
00036 #define i01_re I0.z
00037 #define i01_im I0.w
00038 #define i02_re I1.x
00039 #define i02_im I1.y
00040 #define i10_re I1.z
00041 #define i10_im I1.w
00042 #define i11_re I2.x
00043 #define i11_im I2.y
00044 #define i12_re I2.z
00045 #define i12_im I2.w
00046 #define i20_re I3.x
00047 #define i20_im I3.y
00048 #define i21_re I3.z
00049 #define i21_im I3.w
00050 #define i22_re I4.x
00051 #define i22_im I4.y
00052 #define i30_re I4.z
00053 #define i30_im I4.w
00054 #define i31_re I5.x
00055 #define i31_im I5.y
00056 #define i32_re I5.z
00057 #define i32_im I5.w
00058 #endif // SPINOR_DOUBLE
00059 
00060 // first chiral block of inverted clover term
00061 #ifdef CLOVER_DOUBLE
00062 #define c00_00_re C0.x
00063 #define c01_01_re C0.y
00064 #define c02_02_re C1.x
00065 #define c10_10_re C1.y
00066 #define c11_11_re C2.x
00067 #define c12_12_re C2.y
00068 #define c01_00_re C3.x
00069 #define c01_00_im C3.y
00070 #define c02_00_re C4.x
00071 #define c02_00_im C4.y
00072 #define c10_00_re C5.x
00073 #define c10_00_im C5.y
00074 #define c11_00_re C6.x
00075 #define c11_00_im C6.y
00076 #define c12_00_re C7.x
00077 #define c12_00_im C7.y
00078 #define c02_01_re C8.x
00079 #define c02_01_im C8.y
00080 #define c10_01_re C9.x
00081 #define c10_01_im C9.y
00082 #define c11_01_re C10.x
00083 #define c11_01_im C10.y
00084 #define c12_01_re C11.x
00085 #define c12_01_im C11.y
00086 #define c10_02_re C12.x
00087 #define c10_02_im C12.y
00088 #define c11_02_re C13.x
00089 #define c11_02_im C13.y
00090 #define c12_02_re C14.x
00091 #define c12_02_im C14.y
00092 #define c11_10_re C15.x
00093 #define c11_10_im C15.y
00094 #define c12_10_re C16.x
00095 #define c12_10_im C16.y
00096 #define c12_11_re C17.x
00097 #define c12_11_im C17.y
00098 #else
00099 #define c00_00_re C0.x
00100 #define c01_01_re C0.y
00101 #define c02_02_re C0.z
00102 #define c10_10_re C0.w
00103 #define c11_11_re C1.x
00104 #define c12_12_re C1.y
00105 #define c01_00_re C1.z
00106 #define c01_00_im C1.w
00107 #define c02_00_re C2.x
00108 #define c02_00_im C2.y
00109 #define c10_00_re C2.z
00110 #define c10_00_im C2.w
00111 #define c11_00_re C3.x
00112 #define c11_00_im C3.y
00113 #define c12_00_re C3.z
00114 #define c12_00_im C3.w
00115 #define c02_01_re C4.x
00116 #define c02_01_im C4.y
00117 #define c10_01_re C4.z
00118 #define c10_01_im C4.w
00119 #define c11_01_re C5.x
00120 #define c11_01_im C5.y
00121 #define c12_01_re C5.z
00122 #define c12_01_im C5.w
00123 #define c10_02_re C6.x
00124 #define c10_02_im C6.y
00125 #define c11_02_re C6.z
00126 #define c11_02_im C6.w
00127 #define c12_02_re C7.x
00128 #define c12_02_im C7.y
00129 #define c11_10_re C7.z
00130 #define c11_10_im C7.w
00131 #define c12_10_re C8.x
00132 #define c12_10_im C8.y
00133 #define c12_11_re C8.z
00134 #define c12_11_im C8.w
00135 #endif // CLOVER_DOUBLE
00136 
00137 #define c00_01_re (+c01_00_re)
00138 #define c00_01_im (-c01_00_im)
00139 #define c00_02_re (+c02_00_re)
00140 #define c00_02_im (-c02_00_im)
00141 #define c01_02_re (+c02_01_re)
00142 #define c01_02_im (-c02_01_im)
00143 #define c00_10_re (+c10_00_re)
00144 #define c00_10_im (-c10_00_im)
00145 #define c01_10_re (+c10_01_re)
00146 #define c01_10_im (-c10_01_im)
00147 #define c02_10_re (+c10_02_re)
00148 #define c02_10_im (-c10_02_im)
00149 #define c00_11_re (+c11_00_re)
00150 #define c00_11_im (-c11_00_im)
00151 #define c01_11_re (+c11_01_re)
00152 #define c01_11_im (-c11_01_im)
00153 #define c02_11_re (+c11_02_re)
00154 #define c02_11_im (-c11_02_im)
00155 #define c10_11_re (+c11_10_re)
00156 #define c10_11_im (-c11_10_im)
00157 #define c00_12_re (+c12_00_re)
00158 #define c00_12_im (-c12_00_im)
00159 #define c01_12_re (+c12_01_re)
00160 #define c01_12_im (-c12_01_im)
00161 #define c02_12_re (+c12_02_re)
00162 #define c02_12_im (-c12_02_im)
00163 #define c10_12_re (+c12_10_re)
00164 #define c10_12_im (-c12_10_im)
00165 #define c11_12_re (+c12_11_re)
00166 #define c11_12_im (-c12_11_im)
00167 
00168 // second chiral block of inverted clover term (reuses C0,...,C9)
00169 #define c20_20_re c00_00_re
00170 #define c21_20_re c01_00_re
00171 #define c21_20_im c01_00_im
00172 #define c22_20_re c02_00_re
00173 #define c22_20_im c02_00_im
00174 #define c30_20_re c10_00_re
00175 #define c30_20_im c10_00_im
00176 #define c31_20_re c11_00_re
00177 #define c31_20_im c11_00_im
00178 #define c32_20_re c12_00_re
00179 #define c32_20_im c12_00_im
00180 #define c20_21_re c00_01_re
00181 #define c20_21_im c00_01_im
00182 #define c21_21_re c01_01_re
00183 #define c22_21_re c02_01_re
00184 #define c22_21_im c02_01_im
00185 #define c30_21_re c10_01_re
00186 #define c30_21_im c10_01_im
00187 #define c31_21_re c11_01_re
00188 #define c31_21_im c11_01_im
00189 #define c32_21_re c12_01_re
00190 #define c32_21_im c12_01_im
00191 #define c20_22_re c00_02_re
00192 #define c20_22_im c00_02_im
00193 #define c21_22_re c01_02_re
00194 #define c21_22_im c01_02_im
00195 #define c22_22_re c02_02_re
00196 #define c30_22_re c10_02_re
00197 #define c30_22_im c10_02_im
00198 #define c31_22_re c11_02_re
00199 #define c31_22_im c11_02_im
00200 #define c32_22_re c12_02_re
00201 #define c32_22_im c12_02_im
00202 #define c20_30_re c00_10_re
00203 #define c20_30_im c00_10_im
00204 #define c21_30_re c01_10_re
00205 #define c21_30_im c01_10_im
00206 #define c22_30_re c02_10_re
00207 #define c22_30_im c02_10_im
00208 #define c30_30_re c10_10_re
00209 #define c31_30_re c11_10_re
00210 #define c31_30_im c11_10_im
00211 #define c32_30_re c12_10_re
00212 #define c32_30_im c12_10_im
00213 #define c20_31_re c00_11_re
00214 #define c20_31_im c00_11_im
00215 #define c21_31_re c01_11_re
00216 #define c21_31_im c01_11_im
00217 #define c22_31_re c02_11_re
00218 #define c22_31_im c02_11_im
00219 #define c30_31_re c10_11_re
00220 #define c30_31_im c10_11_im
00221 #define c31_31_re c11_11_re
00222 #define c32_31_re c12_11_re
00223 #define c32_31_im c12_11_im
00224 #define c20_32_re c00_12_re
00225 #define c20_32_im c00_12_im
00226 #define c21_32_re c01_12_re
00227 #define c21_32_im c01_12_im
00228 #define c22_32_re c02_12_re
00229 #define c22_32_im c02_12_im
00230 #define c30_32_re c10_12_re
00231 #define c30_32_im c10_12_im
00232 #define c31_32_re c11_12_re
00233 #define c31_32_im c11_12_im
00234 #define c32_32_re c12_12_re
00235 
00236 // output spinor
00237 #define o00_re s[0*SHARED_STRIDE]
00238 #define o00_im s[1*SHARED_STRIDE]
00239 #define o01_re s[2*SHARED_STRIDE]
00240 #define o01_im s[3*SHARED_STRIDE]
00241 #define o02_re s[4*SHARED_STRIDE]
00242 #define o02_im s[5*SHARED_STRIDE]
00243 #define o10_re s[6*SHARED_STRIDE]
00244 #define o10_im s[7*SHARED_STRIDE]
00245 volatile spinorFloat o11_re;
00246 volatile spinorFloat o11_im;
00247 volatile spinorFloat o12_re;
00248 volatile spinorFloat o12_im;
00249 volatile spinorFloat o20_re;
00250 volatile spinorFloat o20_im;
00251 volatile spinorFloat o21_re;
00252 volatile spinorFloat o21_im;
00253 volatile spinorFloat o22_re;
00254 volatile spinorFloat o22_im;
00255 volatile spinorFloat o30_re;
00256 volatile spinorFloat o30_im;
00257 volatile spinorFloat o31_re;
00258 volatile spinorFloat o31_im;
00259 volatile spinorFloat o32_re;
00260 volatile spinorFloat o32_im;
00261 
00262 #include "read_clover.h"
00263 #include "io_spinor.h"
00264 
00265 #define sp_idx sid // alias needed by READ_SPINOR()
00266 
00267 int sid = blockIdx.x*blockDim.x + threadIdx.x;
00268 
00269 #ifdef SPINOR_DOUBLE
00270 #if (__CUDA_ARCH__ >= 200)
00271 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
00272 #else
00273 #define SHARED_STRIDE  8 // to avoid bank conflicts on G80 and GT200
00274 #endif
00275 extern __shared__ spinorFloat sd_data[];
00276 volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
00277                                   + (threadIdx.x % SHARED_STRIDE);
00278 #else
00279 #if (__CUDA_ARCH__ >= 200)
00280 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
00281 #else
00282 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
00283 #endif
00284 extern __shared__ spinorFloat ss_data[];
00285 volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
00286                                   + (threadIdx.x % SHARED_STRIDE);
00287 #endif
00288 
00289 // read spinor from device memory                                                                                                           
00290 READ_SPINOR(SPINORTEX);
00291 
00292 // change to chiral basis
00293 {
00294     o00_re = -i10_re - i30_re;
00295     o00_im = -i10_im - i30_im;
00296     o10_re =  i00_re + i20_re;
00297     o10_im =  i00_im + i20_im;
00298     o20_re = -i10_re + i30_re;
00299     o20_im = -i10_im + i30_im;
00300     o30_re =  i00_re - i20_re;
00301     o30_im =  i00_im - i20_im;
00302 
00303     o01_re = -i11_re - i31_re;
00304     o01_im = -i11_im - i31_im;
00305     o11_re =  i01_re + i21_re;
00306     o11_im =  i01_im + i21_im;
00307     o21_re = -i11_re + i31_re;
00308     o21_im = -i11_im + i31_im;
00309     o31_re =  i01_re - i21_re;
00310     o31_im =  i01_im - i21_im;
00311 
00312     o02_re = -i12_re - i32_re;
00313     o02_im = -i12_im - i32_im;
00314     o12_re =  i02_re + i22_re;
00315     o12_im =  i02_im + i22_im;
00316     o22_re = -i12_re + i32_re;
00317     o22_im = -i12_im + i32_im;
00318     o32_re =  i02_re - i22_re;
00319     o32_im =  i02_im - i22_im;
00320 }
00321 
00322 // apply first chiral block
00323 {
00324     READ_CLOVER(CLOVERTEX, 0)
00325     
00326     spinorFloat a00_re = 0; spinorFloat a00_im = 0;
00327     spinorFloat a01_re = 0; spinorFloat a01_im = 0;
00328     spinorFloat a02_re = 0; spinorFloat a02_im = 0;
00329     spinorFloat a10_re = 0; spinorFloat a10_im = 0;
00330     spinorFloat a11_re = 0; spinorFloat a11_im = 0;
00331     spinorFloat a12_re = 0; spinorFloat a12_im = 0;
00332     
00333     a00_re += c00_00_re * o00_re;
00334     a00_im += c00_00_re * o00_im;
00335     a00_re += c00_01_re * o01_re - c00_01_im * o01_im;
00336     a00_im += c00_01_re * o01_im + c00_01_im * o01_re;
00337     a00_re += c00_02_re * o02_re - c00_02_im * o02_im;
00338     a00_im += c00_02_re * o02_im + c00_02_im * o02_re;
00339     a00_re += c00_10_re * o10_re - c00_10_im * o10_im;
00340     a00_im += c00_10_re * o10_im + c00_10_im * o10_re;
00341     a00_re += c00_11_re * o11_re - c00_11_im * o11_im;
00342     a00_im += c00_11_re * o11_im + c00_11_im * o11_re;
00343     a00_re += c00_12_re * o12_re - c00_12_im * o12_im;
00344     a00_im += c00_12_re * o12_im + c00_12_im * o12_re;
00345     
00346     a01_re += c01_00_re * o00_re - c01_00_im * o00_im;
00347     a01_im += c01_00_re * o00_im + c01_00_im * o00_re;
00348     a01_re += c01_01_re * o01_re;
00349     a01_im += c01_01_re * o01_im;
00350     a01_re += c01_02_re * o02_re - c01_02_im * o02_im;
00351     a01_im += c01_02_re * o02_im + c01_02_im * o02_re;
00352     a01_re += c01_10_re * o10_re - c01_10_im * o10_im;
00353     a01_im += c01_10_re * o10_im + c01_10_im * o10_re;
00354     a01_re += c01_11_re * o11_re - c01_11_im * o11_im;
00355     a01_im += c01_11_re * o11_im + c01_11_im * o11_re;
00356     a01_re += c01_12_re * o12_re - c01_12_im * o12_im;
00357     a01_im += c01_12_re * o12_im + c01_12_im * o12_re;
00358     
00359     a02_re += c02_00_re * o00_re - c02_00_im * o00_im;
00360     a02_im += c02_00_re * o00_im + c02_00_im * o00_re;
00361     a02_re += c02_01_re * o01_re - c02_01_im * o01_im;
00362     a02_im += c02_01_re * o01_im + c02_01_im * o01_re;
00363     a02_re += c02_02_re * o02_re;
00364     a02_im += c02_02_re * o02_im;
00365     a02_re += c02_10_re * o10_re - c02_10_im * o10_im;
00366     a02_im += c02_10_re * o10_im + c02_10_im * o10_re;
00367     a02_re += c02_11_re * o11_re - c02_11_im * o11_im;
00368     a02_im += c02_11_re * o11_im + c02_11_im * o11_re;
00369     a02_re += c02_12_re * o12_re - c02_12_im * o12_im;
00370     a02_im += c02_12_re * o12_im + c02_12_im * o12_re;
00371     
00372     a10_re += c10_00_re * o00_re - c10_00_im * o00_im;
00373     a10_im += c10_00_re * o00_im + c10_00_im * o00_re;
00374     a10_re += c10_01_re * o01_re - c10_01_im * o01_im;
00375     a10_im += c10_01_re * o01_im + c10_01_im * o01_re;
00376     a10_re += c10_02_re * o02_re - c10_02_im * o02_im;
00377     a10_im += c10_02_re * o02_im + c10_02_im * o02_re;
00378     a10_re += c10_10_re * o10_re;
00379     a10_im += c10_10_re * o10_im;
00380     a10_re += c10_11_re * o11_re - c10_11_im * o11_im;
00381     a10_im += c10_11_re * o11_im + c10_11_im * o11_re;
00382     a10_re += c10_12_re * o12_re - c10_12_im * o12_im;
00383     a10_im += c10_12_re * o12_im + c10_12_im * o12_re;
00384     
00385     a11_re += c11_00_re * o00_re - c11_00_im * o00_im;
00386     a11_im += c11_00_re * o00_im + c11_00_im * o00_re;
00387     a11_re += c11_01_re * o01_re - c11_01_im * o01_im;
00388     a11_im += c11_01_re * o01_im + c11_01_im * o01_re;
00389     a11_re += c11_02_re * o02_re - c11_02_im * o02_im;
00390     a11_im += c11_02_re * o02_im + c11_02_im * o02_re;
00391     a11_re += c11_10_re * o10_re - c11_10_im * o10_im;
00392     a11_im += c11_10_re * o10_im + c11_10_im * o10_re;
00393     a11_re += c11_11_re * o11_re;
00394     a11_im += c11_11_re * o11_im;
00395     a11_re += c11_12_re * o12_re - c11_12_im * o12_im;
00396     a11_im += c11_12_re * o12_im + c11_12_im * o12_re;
00397     
00398     a12_re += c12_00_re * o00_re - c12_00_im * o00_im;
00399     a12_im += c12_00_re * o00_im + c12_00_im * o00_re;
00400     a12_re += c12_01_re * o01_re - c12_01_im * o01_im;
00401     a12_im += c12_01_re * o01_im + c12_01_im * o01_re;
00402     a12_re += c12_02_re * o02_re - c12_02_im * o02_im;
00403     a12_im += c12_02_re * o02_im + c12_02_im * o02_re;
00404     a12_re += c12_10_re * o10_re - c12_10_im * o10_im;
00405     a12_im += c12_10_re * o10_im + c12_10_im * o10_re;
00406     a12_re += c12_11_re * o11_re - c12_11_im * o11_im;
00407     a12_im += c12_11_re * o11_im + c12_11_im * o11_re;
00408     a12_re += c12_12_re * o12_re;
00409     a12_im += c12_12_re * o12_im;
00410     
00411     o00_re = a00_re;  o00_im = a00_im;
00412     o01_re = a01_re;  o01_im = a01_im;
00413     o02_re = a02_re;  o02_im = a02_im;
00414     o10_re = a10_re;  o10_im = a10_im;
00415     o11_re = a11_re;  o11_im = a11_im;
00416     o12_re = a12_re;  o12_im = a12_im;
00417 }
00418 
00419 // apply second chiral block
00420 {
00421     READ_CLOVER(CLOVERTEX, 1)
00422     
00423     spinorFloat a20_re = 0; spinorFloat a20_im = 0;
00424     spinorFloat a21_re = 0; spinorFloat a21_im = 0;
00425     spinorFloat a22_re = 0; spinorFloat a22_im = 0;
00426     spinorFloat a30_re = 0; spinorFloat a30_im = 0;
00427     spinorFloat a31_re = 0; spinorFloat a31_im = 0;
00428     spinorFloat a32_re = 0; spinorFloat a32_im = 0;
00429     
00430     a20_re += c20_20_re * o20_re;
00431     a20_im += c20_20_re * o20_im;
00432     a20_re += c20_21_re * o21_re - c20_21_im * o21_im;
00433     a20_im += c20_21_re * o21_im + c20_21_im * o21_re;
00434     a20_re += c20_22_re * o22_re - c20_22_im * o22_im;
00435     a20_im += c20_22_re * o22_im + c20_22_im * o22_re;
00436     a20_re += c20_30_re * o30_re - c20_30_im * o30_im;
00437     a20_im += c20_30_re * o30_im + c20_30_im * o30_re;
00438     a20_re += c20_31_re * o31_re - c20_31_im * o31_im;
00439     a20_im += c20_31_re * o31_im + c20_31_im * o31_re;
00440     a20_re += c20_32_re * o32_re - c20_32_im * o32_im;
00441     a20_im += c20_32_re * o32_im + c20_32_im * o32_re;
00442     
00443     a21_re += c21_20_re * o20_re - c21_20_im * o20_im;
00444     a21_im += c21_20_re * o20_im + c21_20_im * o20_re;
00445     a21_re += c21_21_re * o21_re;
00446     a21_im += c21_21_re * o21_im;
00447     a21_re += c21_22_re * o22_re - c21_22_im * o22_im;
00448     a21_im += c21_22_re * o22_im + c21_22_im * o22_re;
00449     a21_re += c21_30_re * o30_re - c21_30_im * o30_im;
00450     a21_im += c21_30_re * o30_im + c21_30_im * o30_re;
00451     a21_re += c21_31_re * o31_re - c21_31_im * o31_im;
00452     a21_im += c21_31_re * o31_im + c21_31_im * o31_re;
00453     a21_re += c21_32_re * o32_re - c21_32_im * o32_im;
00454     a21_im += c21_32_re * o32_im + c21_32_im * o32_re;
00455     
00456     a22_re += c22_20_re * o20_re - c22_20_im * o20_im;
00457     a22_im += c22_20_re * o20_im + c22_20_im * o20_re;
00458     a22_re += c22_21_re * o21_re - c22_21_im * o21_im;
00459     a22_im += c22_21_re * o21_im + c22_21_im * o21_re;
00460     a22_re += c22_22_re * o22_re;
00461     a22_im += c22_22_re * o22_im;
00462     a22_re += c22_30_re * o30_re - c22_30_im * o30_im;
00463     a22_im += c22_30_re * o30_im + c22_30_im * o30_re;
00464     a22_re += c22_31_re * o31_re - c22_31_im * o31_im;
00465     a22_im += c22_31_re * o31_im + c22_31_im * o31_re;
00466     a22_re += c22_32_re * o32_re - c22_32_im * o32_im;
00467     a22_im += c22_32_re * o32_im + c22_32_im * o32_re;
00468     
00469     a30_re += c30_20_re * o20_re - c30_20_im * o20_im;
00470     a30_im += c30_20_re * o20_im + c30_20_im * o20_re;
00471     a30_re += c30_21_re * o21_re - c30_21_im * o21_im;
00472     a30_im += c30_21_re * o21_im + c30_21_im * o21_re;
00473     a30_re += c30_22_re * o22_re - c30_22_im * o22_im;
00474     a30_im += c30_22_re * o22_im + c30_22_im * o22_re;
00475     a30_re += c30_30_re * o30_re;
00476     a30_im += c30_30_re * o30_im;
00477     a30_re += c30_31_re * o31_re - c30_31_im * o31_im;
00478     a30_im += c30_31_re * o31_im + c30_31_im * o31_re;
00479     a30_re += c30_32_re * o32_re - c30_32_im * o32_im;
00480     a30_im += c30_32_re * o32_im + c30_32_im * o32_re;
00481     
00482     a31_re += c31_20_re * o20_re - c31_20_im * o20_im;
00483     a31_im += c31_20_re * o20_im + c31_20_im * o20_re;
00484     a31_re += c31_21_re * o21_re - c31_21_im * o21_im;
00485     a31_im += c31_21_re * o21_im + c31_21_im * o21_re;
00486     a31_re += c31_22_re * o22_re - c31_22_im * o22_im;
00487     a31_im += c31_22_re * o22_im + c31_22_im * o22_re;
00488     a31_re += c31_30_re * o30_re - c31_30_im * o30_im;
00489     a31_im += c31_30_re * o30_im + c31_30_im * o30_re;
00490     a31_re += c31_31_re * o31_re;
00491     a31_im += c31_31_re * o31_im;
00492     a31_re += c31_32_re * o32_re - c31_32_im * o32_im;
00493     a31_im += c31_32_re * o32_im + c31_32_im * o32_re;
00494     
00495     a32_re += c32_20_re * o20_re - c32_20_im * o20_im;
00496     a32_im += c32_20_re * o20_im + c32_20_im * o20_re;
00497     a32_re += c32_21_re * o21_re - c32_21_im * o21_im;
00498     a32_im += c32_21_re * o21_im + c32_21_im * o21_re;
00499     a32_re += c32_22_re * o22_re - c32_22_im * o22_im;
00500     a32_im += c32_22_re * o22_im + c32_22_im * o22_re;
00501     a32_re += c32_30_re * o30_re - c32_30_im * o30_im;
00502     a32_im += c32_30_re * o30_im + c32_30_im * o30_re;
00503     a32_re += c32_31_re * o31_re - c32_31_im * o31_im;
00504     a32_im += c32_31_re * o31_im + c32_31_im * o31_re;
00505     a32_re += c32_32_re * o32_re;
00506     a32_im += c32_32_re * o32_im;
00507     
00508     o20_re = a20_re;  o20_im = a20_im;
00509     o21_re = a21_re;  o21_im = a21_im;
00510     o22_re = a22_re;  o22_im = a22_im;
00511     o30_re = a30_re;  o30_im = a30_im;
00512     o31_re = a31_re;  o31_im = a31_im;
00513     o32_re = a32_re;  o32_im = a32_im;    
00514 }
00515 
00516 // change back from chiral basis
00517 // (note: required factor of 1/2 is included in clover term normalization)
00518 {
00519     spinorFloat a00_re =  o10_re + o30_re;
00520     spinorFloat a00_im =  o10_im + o30_im;
00521     spinorFloat a10_re = -o00_re - o20_re;
00522     spinorFloat a10_im = -o00_im - o20_im;
00523     spinorFloat a20_re =  o10_re - o30_re;
00524     spinorFloat a20_im =  o10_im - o30_im;
00525     spinorFloat a30_re = -o00_re + o20_re;
00526     spinorFloat a30_im = -o00_im + o20_im;
00527     
00528     o00_re = a00_re;  o00_im = a00_im;
00529     o10_re = a10_re;  o10_im = a10_im;
00530     o20_re = a20_re;  o20_im = a20_im;
00531     o30_re = a30_re;  o30_im = a30_im;
00532 }
00533 {
00534     spinorFloat a01_re =  o11_re + o31_re;
00535     spinorFloat a01_im =  o11_im + o31_im;
00536     spinorFloat a11_re = -o01_re - o21_re;
00537     spinorFloat a11_im = -o01_im - o21_im;
00538     spinorFloat a21_re =  o11_re - o31_re;
00539     spinorFloat a21_im =  o11_im - o31_im;
00540     spinorFloat a31_re = -o01_re + o21_re;
00541     spinorFloat a31_im = -o01_im + o21_im;
00542     
00543     o01_re = a01_re;  o01_im = a01_im;
00544     o11_re = a11_re;  o11_im = a11_im;
00545     o21_re = a21_re;  o21_im = a21_im;
00546     o31_re = a31_re;  o31_im = a31_im;
00547 }
00548 {
00549     spinorFloat a02_re =  o12_re + o32_re;
00550     spinorFloat a02_im =  o12_im + o32_im;
00551     spinorFloat a12_re = -o02_re - o22_re;
00552     spinorFloat a12_im = -o02_im - o22_im;
00553     spinorFloat a22_re =  o12_re - o32_re;
00554     spinorFloat a22_im =  o12_im - o32_im;
00555     spinorFloat a32_re = -o02_re + o22_re;
00556     spinorFloat a32_im = -o02_im + o22_im;
00557     
00558     o02_re = a02_re;  o02_im = a02_im;
00559     o12_re = a12_re;  o12_im = a12_im;
00560     o22_re = a22_re;  o22_im = a22_im;
00561     o32_re = a32_re;  o32_im = a32_im;
00562 }
00563 
00564 #ifdef DSLASH_XPAY
00565     READ_ACCUM(ACCUMTEX)
00566 #ifdef SPINOR_DOUBLE
00567     o00_re = a*o00_re + accum0.x;
00568     o00_im = a*o00_im + accum0.y;
00569     o01_re = a*o01_re + accum1.x;
00570     o01_im = a*o01_im + accum1.y;
00571     o02_re = a*o02_re + accum2.x;
00572     o02_im = a*o02_im + accum2.y;
00573     o10_re = a*o10_re + accum3.x;
00574     o10_im = a*o10_im + accum3.y;
00575     o11_re = a*o11_re + accum4.x;
00576     o11_im = a*o11_im + accum4.y;
00577     o12_re = a*o12_re + accum5.x;
00578     o12_im = a*o12_im + accum5.y;
00579     o20_re = a*o20_re + accum6.x;
00580     o20_im = a*o20_im + accum6.y;
00581     o21_re = a*o21_re + accum7.x;
00582     o21_im = a*o21_im + accum7.y;
00583     o22_re = a*o22_re + accum8.x;
00584     o22_im = a*o22_im + accum8.y;
00585     o30_re = a*o30_re + accum9.x;
00586     o30_im = a*o30_im + accum9.y;
00587     o31_re = a*o31_re + accum10.x;
00588     o31_im = a*o31_im + accum10.y;
00589     o32_re = a*o32_re + accum11.x;
00590     o32_im = a*o32_im + accum11.y;
00591 #else
00592     o00_re = a*o00_re + accum0.x;
00593     o00_im = a*o00_im + accum0.y;
00594     o01_re = a*o01_re + accum0.z;
00595     o01_im = a*o01_im + accum0.w;
00596     o02_re = a*o02_re + accum1.x;
00597     o02_im = a*o02_im + accum1.y;
00598     o10_re = a*o10_re + accum1.z;
00599     o10_im = a*o10_im + accum1.w;
00600     o11_re = a*o11_re + accum2.x;
00601     o11_im = a*o11_im + accum2.y;
00602     o12_re = a*o12_re + accum2.z;
00603     o12_im = a*o12_im + accum2.w;
00604     o20_re = a*o20_re + accum3.x;
00605     o20_im = a*o20_im + accum3.y;
00606     o21_re = a*o21_re + accum3.z;
00607     o21_im = a*o21_im + accum3.w;
00608     o22_re = a*o22_re + accum4.x;
00609     o22_im = a*o22_im + accum4.y;
00610     o30_re = a*o30_re + accum4.z;
00611     o30_im = a*o30_im + accum4.w;
00612     o31_re = a*o31_re + accum5.x;
00613     o31_im = a*o31_im + accum5.y;
00614     o32_re = a*o32_re + accum5.z;
00615     o32_im = a*o32_im + accum5.w;
00616 #endif // SPINOR_DOUBLE
00617 #endif // DSLASH_XPAY
00618 
00619     // write spinor field back to device memory
00620     WRITE_SPINOR();
00621 
00622 // undefine to prevent warning when precision is changed
00623 #undef spinorFloat
00624 #undef SHARED_STRIDE
00625 
00626 #undef i00_re
00627 #undef i00_im
00628 #undef i01_re
00629 #undef i01_im
00630 #undef i02_re
00631 #undef i02_im
00632 #undef i10_re
00633 #undef i10_im
00634 #undef i11_re
00635 #undef i11_im
00636 #undef i12_re
00637 #undef i12_im
00638 #undef i20_re
00639 #undef i20_im
00640 #undef i21_re
00641 #undef i21_im
00642 #undef i22_re
00643 #undef i22_im
00644 #undef i30_re
00645 #undef i30_im
00646 #undef i31_re
00647 #undef i31_im
00648 #undef i32_re
00649 #undef i32_im
00650 
00651 #undef c00_00_re
00652 #undef c01_01_re
00653 #undef c02_02_re
00654 #undef c10_10_re
00655 #undef c11_11_re
00656 #undef c12_12_re
00657 #undef c01_00_re
00658 #undef c01_00_im
00659 #undef c02_00_re
00660 #undef c02_00_im
00661 #undef c10_00_re
00662 #undef c10_00_im
00663 #undef c11_00_re
00664 #undef c11_00_im
00665 #undef c12_00_re
00666 #undef c12_00_im
00667 #undef c02_01_re
00668 #undef c02_01_im
00669 #undef c10_01_re
00670 #undef c10_01_im
00671 #undef c11_01_re
00672 #undef c11_01_im
00673 #undef c12_01_re
00674 #undef c12_01_im
00675 #undef c10_02_re
00676 #undef c10_02_im
00677 #undef c11_02_re
00678 #undef c11_02_im
00679 #undef c12_02_re
00680 #undef c12_02_im
00681 #undef c11_10_re
00682 #undef c11_10_im
00683 #undef c12_10_re
00684 #undef c12_10_im
00685 #undef c12_11_re
00686 #undef c12_11_im
00687 
00688 #undef o00_re
00689 #undef o00_im
00690 #undef o01_re
00691 #undef o01_im
00692 #undef o02_re
00693 #undef o02_im
00694 #undef o10_re
00695 #undef o10_im
00696 
00697 #undef sp_idx // alias needed by READ_SPINOR()
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines