QUDA v0.4.0
A library for QCD on GPUs
quda/lib/dslash_core/clover_core.h
Go to the documentation of this file.
00001 // *** CUDA CLOVER ***
00002 
00003 #define CLOVER_SHARED_FLOATS_PER_THREAD 0
00004 
00005 
00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
00007 #define VOLATILE
00008 #else // Open64 compiler
00009 #define VOLATILE volatile
00010 #endif
00011 // input spinor
00012 #ifdef SPINOR_DOUBLE
00013 #define spinorFloat double
00014 #define i00_re I0.x
00015 #define i00_im I0.y
00016 #define i01_re I1.x
00017 #define i01_im I1.y
00018 #define i02_re I2.x
00019 #define i02_im I2.y
00020 #define i10_re I3.x
00021 #define i10_im I3.y
00022 #define i11_re I4.x
00023 #define i11_im I4.y
00024 #define i12_re I5.x
00025 #define i12_im I5.y
00026 #define i20_re I6.x
00027 #define i20_im I6.y
00028 #define i21_re I7.x
00029 #define i21_im I7.y
00030 #define i22_re I8.x
00031 #define i22_im I8.y
00032 #define i30_re I9.x
00033 #define i30_im I9.y
00034 #define i31_re I10.x
00035 #define i31_im I10.y
00036 #define i32_re I11.x
00037 #define i32_im I11.y
00038 #else
00039 #define spinorFloat float
00040 #define i00_re I0.x
00041 #define i00_im I0.y
00042 #define i01_re I0.z
00043 #define i01_im I0.w
00044 #define i02_re I1.x
00045 #define i02_im I1.y
00046 #define i10_re I1.z
00047 #define i10_im I1.w
00048 #define i11_re I2.x
00049 #define i11_im I2.y
00050 #define i12_re I2.z
00051 #define i12_im I2.w
00052 #define i20_re I3.x
00053 #define i20_im I3.y
00054 #define i21_re I3.z
00055 #define i21_im I3.w
00056 #define i22_re I4.x
00057 #define i22_im I4.y
00058 #define i30_re I4.z
00059 #define i30_im I4.w
00060 #define i31_re I5.x
00061 #define i31_im I5.y
00062 #define i32_re I5.z
00063 #define i32_im I5.w
00064 #endif // SPINOR_DOUBLE
00065 
00066 // first chiral block of inverted clover term
00067 #ifdef CLOVER_DOUBLE
00068 #define c00_00_re C0.x
00069 #define c01_01_re C0.y
00070 #define c02_02_re C1.x
00071 #define c10_10_re C1.y
00072 #define c11_11_re C2.x
00073 #define c12_12_re C2.y
00074 #define c01_00_re C3.x
00075 #define c01_00_im C3.y
00076 #define c02_00_re C4.x
00077 #define c02_00_im C4.y
00078 #define c10_00_re C5.x
00079 #define c10_00_im C5.y
00080 #define c11_00_re C6.x
00081 #define c11_00_im C6.y
00082 #define c12_00_re C7.x
00083 #define c12_00_im C7.y
00084 #define c02_01_re C8.x
00085 #define c02_01_im C8.y
00086 #define c10_01_re C9.x
00087 #define c10_01_im C9.y
00088 #define c11_01_re C10.x
00089 #define c11_01_im C10.y
00090 #define c12_01_re C11.x
00091 #define c12_01_im C11.y
00092 #define c10_02_re C12.x
00093 #define c10_02_im C12.y
00094 #define c11_02_re C13.x
00095 #define c11_02_im C13.y
00096 #define c12_02_re C14.x
00097 #define c12_02_im C14.y
00098 #define c11_10_re C15.x
00099 #define c11_10_im C15.y
00100 #define c12_10_re C16.x
00101 #define c12_10_im C16.y
00102 #define c12_11_re C17.x
00103 #define c12_11_im C17.y
00104 #else
00105 #define c00_00_re C0.x
00106 #define c01_01_re C0.y
00107 #define c02_02_re C0.z
00108 #define c10_10_re C0.w
00109 #define c11_11_re C1.x
00110 #define c12_12_re C1.y
00111 #define c01_00_re C1.z
00112 #define c01_00_im C1.w
00113 #define c02_00_re C2.x
00114 #define c02_00_im C2.y
00115 #define c10_00_re C2.z
00116 #define c10_00_im C2.w
00117 #define c11_00_re C3.x
00118 #define c11_00_im C3.y
00119 #define c12_00_re C3.z
00120 #define c12_00_im C3.w
00121 #define c02_01_re C4.x
00122 #define c02_01_im C4.y
00123 #define c10_01_re C4.z
00124 #define c10_01_im C4.w
00125 #define c11_01_re C5.x
00126 #define c11_01_im C5.y
00127 #define c12_01_re C5.z
00128 #define c12_01_im C5.w
00129 #define c10_02_re C6.x
00130 #define c10_02_im C6.y
00131 #define c11_02_re C6.z
00132 #define c11_02_im C6.w
00133 #define c12_02_re C7.x
00134 #define c12_02_im C7.y
00135 #define c11_10_re C7.z
00136 #define c11_10_im C7.w
00137 #define c12_10_re C8.x
00138 #define c12_10_im C8.y
00139 #define c12_11_re C8.z
00140 #define c12_11_im C8.w
00141 #endif // CLOVER_DOUBLE
00142 
00143 #define c00_01_re (+c01_00_re)
00144 #define c00_01_im (-c01_00_im)
00145 #define c00_02_re (+c02_00_re)
00146 #define c00_02_im (-c02_00_im)
00147 #define c01_02_re (+c02_01_re)
00148 #define c01_02_im (-c02_01_im)
00149 #define c00_10_re (+c10_00_re)
00150 #define c00_10_im (-c10_00_im)
00151 #define c01_10_re (+c10_01_re)
00152 #define c01_10_im (-c10_01_im)
00153 #define c02_10_re (+c10_02_re)
00154 #define c02_10_im (-c10_02_im)
00155 #define c00_11_re (+c11_00_re)
00156 #define c00_11_im (-c11_00_im)
00157 #define c01_11_re (+c11_01_re)
00158 #define c01_11_im (-c11_01_im)
00159 #define c02_11_re (+c11_02_re)
00160 #define c02_11_im (-c11_02_im)
00161 #define c10_11_re (+c11_10_re)
00162 #define c10_11_im (-c11_10_im)
00163 #define c00_12_re (+c12_00_re)
00164 #define c00_12_im (-c12_00_im)
00165 #define c01_12_re (+c12_01_re)
00166 #define c01_12_im (-c12_01_im)
00167 #define c02_12_re (+c12_02_re)
00168 #define c02_12_im (-c12_02_im)
00169 #define c10_12_re (+c12_10_re)
00170 #define c10_12_im (-c12_10_im)
00171 #define c11_12_re (+c12_11_re)
00172 #define c11_12_im (-c12_11_im)
00173 
00174 // second chiral block of inverted clover term (reuses C0,...,C9)
00175 #define c20_20_re c00_00_re
00176 #define c21_20_re c01_00_re
00177 #define c21_20_im c01_00_im
00178 #define c22_20_re c02_00_re
00179 #define c22_20_im c02_00_im
00180 #define c30_20_re c10_00_re
00181 #define c30_20_im c10_00_im
00182 #define c31_20_re c11_00_re
00183 #define c31_20_im c11_00_im
00184 #define c32_20_re c12_00_re
00185 #define c32_20_im c12_00_im
00186 #define c20_21_re c00_01_re
00187 #define c20_21_im c00_01_im
00188 #define c21_21_re c01_01_re
00189 #define c22_21_re c02_01_re
00190 #define c22_21_im c02_01_im
00191 #define c30_21_re c10_01_re
00192 #define c30_21_im c10_01_im
00193 #define c31_21_re c11_01_re
00194 #define c31_21_im c11_01_im
00195 #define c32_21_re c12_01_re
00196 #define c32_21_im c12_01_im
00197 #define c20_22_re c00_02_re
00198 #define c20_22_im c00_02_im
00199 #define c21_22_re c01_02_re
00200 #define c21_22_im c01_02_im
00201 #define c22_22_re c02_02_re
00202 #define c30_22_re c10_02_re
00203 #define c30_22_im c10_02_im
00204 #define c31_22_re c11_02_re
00205 #define c31_22_im c11_02_im
00206 #define c32_22_re c12_02_re
00207 #define c32_22_im c12_02_im
00208 #define c20_30_re c00_10_re
00209 #define c20_30_im c00_10_im
00210 #define c21_30_re c01_10_re
00211 #define c21_30_im c01_10_im
00212 #define c22_30_re c02_10_re
00213 #define c22_30_im c02_10_im
00214 #define c30_30_re c10_10_re
00215 #define c31_30_re c11_10_re
00216 #define c31_30_im c11_10_im
00217 #define c32_30_re c12_10_re
00218 #define c32_30_im c12_10_im
00219 #define c20_31_re c00_11_re
00220 #define c20_31_im c00_11_im
00221 #define c21_31_re c01_11_re
00222 #define c21_31_im c01_11_im
00223 #define c22_31_re c02_11_re
00224 #define c22_31_im c02_11_im
00225 #define c30_31_re c10_11_re
00226 #define c30_31_im c10_11_im
00227 #define c31_31_re c11_11_re
00228 #define c32_31_re c12_11_re
00229 #define c32_31_im c12_11_im
00230 #define c20_32_re c00_12_re
00231 #define c20_32_im c00_12_im
00232 #define c21_32_re c01_12_re
00233 #define c21_32_im c01_12_im
00234 #define c22_32_re c02_12_re
00235 #define c22_32_im c02_12_im
00236 #define c30_32_re c10_12_re
00237 #define c30_32_im c10_12_im
00238 #define c31_32_re c11_12_re
00239 #define c31_32_im c11_12_im
00240 #define c32_32_re c12_12_re
00241 
00242 // output spinor
00243 VOLATILE spinorFloat o00_re;
00244 VOLATILE spinorFloat o00_im;
00245 VOLATILE spinorFloat o01_re;
00246 VOLATILE spinorFloat o01_im;
00247 VOLATILE spinorFloat o02_re;
00248 VOLATILE spinorFloat o02_im;
00249 VOLATILE spinorFloat o10_re;
00250 VOLATILE spinorFloat o10_im;
00251 VOLATILE spinorFloat o11_re;
00252 VOLATILE spinorFloat o11_im;
00253 VOLATILE spinorFloat o12_re;
00254 VOLATILE spinorFloat o12_im;
00255 VOLATILE spinorFloat o20_re;
00256 VOLATILE spinorFloat o20_im;
00257 VOLATILE spinorFloat o21_re;
00258 VOLATILE spinorFloat o21_im;
00259 VOLATILE spinorFloat o22_re;
00260 VOLATILE spinorFloat o22_im;
00261 VOLATILE spinorFloat o30_re;
00262 VOLATILE spinorFloat o30_im;
00263 VOLATILE spinorFloat o31_re;
00264 VOLATILE spinorFloat o31_im;
00265 VOLATILE spinorFloat o32_re;
00266 VOLATILE spinorFloat o32_im;
00267 
00268 #include "read_clover.h"
00269 #include "io_spinor.h"
00270 
00271 int sid = blockIdx.x*blockDim.x + threadIdx.x;
00272 if (sid >= param.threads) return;
00273 
00274 // read spinor from device memory
00275 READ_SPINOR(SPINORTEX, sp_stride, sid, sid);
00276 {
00277   
00278   // change to chiral basis
00279   {
00280     spinorFloat a00_re = -i10_re - i30_re;
00281     spinorFloat a00_im = -i10_im - i30_im;
00282     spinorFloat a10_re =  i00_re + i20_re;
00283     spinorFloat a10_im =  i00_im + i20_im;
00284     spinorFloat a20_re = -i10_re + i30_re;
00285     spinorFloat a20_im = -i10_im + i30_im;
00286     spinorFloat a30_re =  i00_re - i20_re;
00287     spinorFloat a30_im =  i00_im - i20_im;
00288     
00289     o00_re = a00_re;  o00_im = a00_im;
00290     o10_re = a10_re;  o10_im = a10_im;
00291     o20_re = a20_re;  o20_im = a20_im;
00292     o30_re = a30_re;  o30_im = a30_im;
00293   }
00294   
00295   {
00296     spinorFloat a01_re = -i11_re - i31_re;
00297     spinorFloat a01_im = -i11_im - i31_im;
00298     spinorFloat a11_re =  i01_re + i21_re;
00299     spinorFloat a11_im =  i01_im + i21_im;
00300     spinorFloat a21_re = -i11_re + i31_re;
00301     spinorFloat a21_im = -i11_im + i31_im;
00302     spinorFloat a31_re =  i01_re - i21_re;
00303     spinorFloat a31_im =  i01_im - i21_im;
00304     
00305     o01_re = a01_re;  o01_im = a01_im;
00306     o11_re = a11_re;  o11_im = a11_im;
00307     o21_re = a21_re;  o21_im = a21_im;
00308     o31_re = a31_re;  o31_im = a31_im;
00309   }
00310   
00311   {
00312     spinorFloat a02_re = -i12_re - i32_re;
00313     spinorFloat a02_im = -i12_im - i32_im;
00314     spinorFloat a12_re =  i02_re + i22_re;
00315     spinorFloat a12_im =  i02_im + i22_im;
00316     spinorFloat a22_re = -i12_re + i32_re;
00317     spinorFloat a22_im = -i12_im + i32_im;
00318     spinorFloat a32_re =  i02_re - i22_re;
00319     spinorFloat a32_im =  i02_im - i22_im;
00320     
00321     o02_re = a02_re;  o02_im = a02_im;
00322     o12_re = a12_re;  o12_im = a12_im;
00323     o22_re = a22_re;  o22_im = a22_im;
00324     o32_re = a32_re;  o32_im = a32_im;
00325   }
00326   
00327   // apply first chiral block
00328   {
00329     READ_CLOVER(CLOVERTEX, 0)
00330     
00331     spinorFloat a00_re = 0; spinorFloat a00_im = 0;
00332     spinorFloat a01_re = 0; spinorFloat a01_im = 0;
00333     spinorFloat a02_re = 0; spinorFloat a02_im = 0;
00334     spinorFloat a10_re = 0; spinorFloat a10_im = 0;
00335     spinorFloat a11_re = 0; spinorFloat a11_im = 0;
00336     spinorFloat a12_re = 0; spinorFloat a12_im = 0;
00337     
00338     a00_re += c00_00_re * o00_re;
00339     a00_im += c00_00_re * o00_im;
00340     a00_re += c00_01_re * o01_re;
00341     a00_re -= c00_01_im * o01_im;
00342     a00_im += c00_01_re * o01_im;
00343     a00_im += c00_01_im * o01_re;
00344     a00_re += c00_02_re * o02_re;
00345     a00_re -= c00_02_im * o02_im;
00346     a00_im += c00_02_re * o02_im;
00347     a00_im += c00_02_im * o02_re;
00348     a00_re += c00_10_re * o10_re;
00349     a00_re -= c00_10_im * o10_im;
00350     a00_im += c00_10_re * o10_im;
00351     a00_im += c00_10_im * o10_re;
00352     a00_re += c00_11_re * o11_re;
00353     a00_re -= c00_11_im * o11_im;
00354     a00_im += c00_11_re * o11_im;
00355     a00_im += c00_11_im * o11_re;
00356     a00_re += c00_12_re * o12_re;
00357     a00_re -= c00_12_im * o12_im;
00358     a00_im += c00_12_re * o12_im;
00359     a00_im += c00_12_im * o12_re;
00360     
00361     a01_re += c01_00_re * o00_re;
00362     a01_re -= c01_00_im * o00_im;
00363     a01_im += c01_00_re * o00_im;
00364     a01_im += c01_00_im * o00_re;
00365     a01_re += c01_01_re * o01_re;
00366     a01_im += c01_01_re * o01_im;
00367     a01_re += c01_02_re * o02_re;
00368     a01_re -= c01_02_im * o02_im;
00369     a01_im += c01_02_re * o02_im;
00370     a01_im += c01_02_im * o02_re;
00371     a01_re += c01_10_re * o10_re;
00372     a01_re -= c01_10_im * o10_im;
00373     a01_im += c01_10_re * o10_im;
00374     a01_im += c01_10_im * o10_re;
00375     a01_re += c01_11_re * o11_re;
00376     a01_re -= c01_11_im * o11_im;
00377     a01_im += c01_11_re * o11_im;
00378     a01_im += c01_11_im * o11_re;
00379     a01_re += c01_12_re * o12_re;
00380     a01_re -= c01_12_im * o12_im;
00381     a01_im += c01_12_re * o12_im;
00382     a01_im += c01_12_im * o12_re;
00383     
00384     a02_re += c02_00_re * o00_re;
00385     a02_re -= c02_00_im * o00_im;
00386     a02_im += c02_00_re * o00_im;
00387     a02_im += c02_00_im * o00_re;
00388     a02_re += c02_01_re * o01_re;
00389     a02_re -= c02_01_im * o01_im;
00390     a02_im += c02_01_re * o01_im;
00391     a02_im += c02_01_im * o01_re;
00392     a02_re += c02_02_re * o02_re;
00393     a02_im += c02_02_re * o02_im;
00394     a02_re += c02_10_re * o10_re;
00395     a02_re -= c02_10_im * o10_im;
00396     a02_im += c02_10_re * o10_im;
00397     a02_im += c02_10_im * o10_re;
00398     a02_re += c02_11_re * o11_re;
00399     a02_re -= c02_11_im * o11_im;
00400     a02_im += c02_11_re * o11_im;
00401     a02_im += c02_11_im * o11_re;
00402     a02_re += c02_12_re * o12_re;
00403     a02_re -= c02_12_im * o12_im;
00404     a02_im += c02_12_re * o12_im;
00405     a02_im += c02_12_im * o12_re;
00406     
00407     a10_re += c10_00_re * o00_re;
00408     a10_re -= c10_00_im * o00_im;
00409     a10_im += c10_00_re * o00_im;
00410     a10_im += c10_00_im * o00_re;
00411     a10_re += c10_01_re * o01_re;
00412     a10_re -= c10_01_im * o01_im;
00413     a10_im += c10_01_re * o01_im;
00414     a10_im += c10_01_im * o01_re;
00415     a10_re += c10_02_re * o02_re;
00416     a10_re -= c10_02_im * o02_im;
00417     a10_im += c10_02_re * o02_im;
00418     a10_im += c10_02_im * o02_re;
00419     a10_re += c10_10_re * o10_re;
00420     a10_im += c10_10_re * o10_im;
00421     a10_re += c10_11_re * o11_re;
00422     a10_re -= c10_11_im * o11_im;
00423     a10_im += c10_11_re * o11_im;
00424     a10_im += c10_11_im * o11_re;
00425     a10_re += c10_12_re * o12_re;
00426     a10_re -= c10_12_im * o12_im;
00427     a10_im += c10_12_re * o12_im;
00428     a10_im += c10_12_im * o12_re;
00429     
00430     a11_re += c11_00_re * o00_re;
00431     a11_re -= c11_00_im * o00_im;
00432     a11_im += c11_00_re * o00_im;
00433     a11_im += c11_00_im * o00_re;
00434     a11_re += c11_01_re * o01_re;
00435     a11_re -= c11_01_im * o01_im;
00436     a11_im += c11_01_re * o01_im;
00437     a11_im += c11_01_im * o01_re;
00438     a11_re += c11_02_re * o02_re;
00439     a11_re -= c11_02_im * o02_im;
00440     a11_im += c11_02_re * o02_im;
00441     a11_im += c11_02_im * o02_re;
00442     a11_re += c11_10_re * o10_re;
00443     a11_re -= c11_10_im * o10_im;
00444     a11_im += c11_10_re * o10_im;
00445     a11_im += c11_10_im * o10_re;
00446     a11_re += c11_11_re * o11_re;
00447     a11_im += c11_11_re * o11_im;
00448     a11_re += c11_12_re * o12_re;
00449     a11_re -= c11_12_im * o12_im;
00450     a11_im += c11_12_re * o12_im;
00451     a11_im += c11_12_im * o12_re;
00452     
00453     a12_re += c12_00_re * o00_re;
00454     a12_re -= c12_00_im * o00_im;
00455     a12_im += c12_00_re * o00_im;
00456     a12_im += c12_00_im * o00_re;
00457     a12_re += c12_01_re * o01_re;
00458     a12_re -= c12_01_im * o01_im;
00459     a12_im += c12_01_re * o01_im;
00460     a12_im += c12_01_im * o01_re;
00461     a12_re += c12_02_re * o02_re;
00462     a12_re -= c12_02_im * o02_im;
00463     a12_im += c12_02_re * o02_im;
00464     a12_im += c12_02_im * o02_re;
00465     a12_re += c12_10_re * o10_re;
00466     a12_re -= c12_10_im * o10_im;
00467     a12_im += c12_10_re * o10_im;
00468     a12_im += c12_10_im * o10_re;
00469     a12_re += c12_11_re * o11_re;
00470     a12_re -= c12_11_im * o11_im;
00471     a12_im += c12_11_re * o11_im;
00472     a12_im += c12_11_im * o11_re;
00473     a12_re += c12_12_re * o12_re;
00474     a12_im += c12_12_re * o12_im;
00475     
00476     o00_re = a00_re;  o00_im = a00_im;
00477     o01_re = a01_re;  o01_im = a01_im;
00478     o02_re = a02_re;  o02_im = a02_im;
00479     o10_re = a10_re;  o10_im = a10_im;
00480     o11_re = a11_re;  o11_im = a11_im;
00481     o12_re = a12_re;  o12_im = a12_im;
00482     
00483   }
00484   
00485   // apply second chiral block
00486   {
00487     READ_CLOVER(CLOVERTEX, 1)
00488     
00489     spinorFloat a20_re = 0; spinorFloat a20_im = 0;
00490     spinorFloat a21_re = 0; spinorFloat a21_im = 0;
00491     spinorFloat a22_re = 0; spinorFloat a22_im = 0;
00492     spinorFloat a30_re = 0; spinorFloat a30_im = 0;
00493     spinorFloat a31_re = 0; spinorFloat a31_im = 0;
00494     spinorFloat a32_re = 0; spinorFloat a32_im = 0;
00495     
00496     a20_re += c20_20_re * o20_re;
00497     a20_im += c20_20_re * o20_im;
00498     a20_re += c20_21_re * o21_re;
00499     a20_re -= c20_21_im * o21_im;
00500     a20_im += c20_21_re * o21_im;
00501     a20_im += c20_21_im * o21_re;
00502     a20_re += c20_22_re * o22_re;
00503     a20_re -= c20_22_im * o22_im;
00504     a20_im += c20_22_re * o22_im;
00505     a20_im += c20_22_im * o22_re;
00506     a20_re += c20_30_re * o30_re;
00507     a20_re -= c20_30_im * o30_im;
00508     a20_im += c20_30_re * o30_im;
00509     a20_im += c20_30_im * o30_re;
00510     a20_re += c20_31_re * o31_re;
00511     a20_re -= c20_31_im * o31_im;
00512     a20_im += c20_31_re * o31_im;
00513     a20_im += c20_31_im * o31_re;
00514     a20_re += c20_32_re * o32_re;
00515     a20_re -= c20_32_im * o32_im;
00516     a20_im += c20_32_re * o32_im;
00517     a20_im += c20_32_im * o32_re;
00518     
00519     a21_re += c21_20_re * o20_re;
00520     a21_re -= c21_20_im * o20_im;
00521     a21_im += c21_20_re * o20_im;
00522     a21_im += c21_20_im * o20_re;
00523     a21_re += c21_21_re * o21_re;
00524     a21_im += c21_21_re * o21_im;
00525     a21_re += c21_22_re * o22_re;
00526     a21_re -= c21_22_im * o22_im;
00527     a21_im += c21_22_re * o22_im;
00528     a21_im += c21_22_im * o22_re;
00529     a21_re += c21_30_re * o30_re;
00530     a21_re -= c21_30_im * o30_im;
00531     a21_im += c21_30_re * o30_im;
00532     a21_im += c21_30_im * o30_re;
00533     a21_re += c21_31_re * o31_re;
00534     a21_re -= c21_31_im * o31_im;
00535     a21_im += c21_31_re * o31_im;
00536     a21_im += c21_31_im * o31_re;
00537     a21_re += c21_32_re * o32_re;
00538     a21_re -= c21_32_im * o32_im;
00539     a21_im += c21_32_re * o32_im;
00540     a21_im += c21_32_im * o32_re;
00541     
00542     a22_re += c22_20_re * o20_re;
00543     a22_re -= c22_20_im * o20_im;
00544     a22_im += c22_20_re * o20_im;
00545     a22_im += c22_20_im * o20_re;
00546     a22_re += c22_21_re * o21_re;
00547     a22_re -= c22_21_im * o21_im;
00548     a22_im += c22_21_re * o21_im;
00549     a22_im += c22_21_im * o21_re;
00550     a22_re += c22_22_re * o22_re;
00551     a22_im += c22_22_re * o22_im;
00552     a22_re += c22_30_re * o30_re;
00553     a22_re -= c22_30_im * o30_im;
00554     a22_im += c22_30_re * o30_im;
00555     a22_im += c22_30_im * o30_re;
00556     a22_re += c22_31_re * o31_re;
00557     a22_re -= c22_31_im * o31_im;
00558     a22_im += c22_31_re * o31_im;
00559     a22_im += c22_31_im * o31_re;
00560     a22_re += c22_32_re * o32_re;
00561     a22_re -= c22_32_im * o32_im;
00562     a22_im += c22_32_re * o32_im;
00563     a22_im += c22_32_im * o32_re;
00564     
00565     a30_re += c30_20_re * o20_re;
00566     a30_re -= c30_20_im * o20_im;
00567     a30_im += c30_20_re * o20_im;
00568     a30_im += c30_20_im * o20_re;
00569     a30_re += c30_21_re * o21_re;
00570     a30_re -= c30_21_im * o21_im;
00571     a30_im += c30_21_re * o21_im;
00572     a30_im += c30_21_im * o21_re;
00573     a30_re += c30_22_re * o22_re;
00574     a30_re -= c30_22_im * o22_im;
00575     a30_im += c30_22_re * o22_im;
00576     a30_im += c30_22_im * o22_re;
00577     a30_re += c30_30_re * o30_re;
00578     a30_im += c30_30_re * o30_im;
00579     a30_re += c30_31_re * o31_re;
00580     a30_re -= c30_31_im * o31_im;
00581     a30_im += c30_31_re * o31_im;
00582     a30_im += c30_31_im * o31_re;
00583     a30_re += c30_32_re * o32_re;
00584     a30_re -= c30_32_im * o32_im;
00585     a30_im += c30_32_re * o32_im;
00586     a30_im += c30_32_im * o32_re;
00587     
00588     a31_re += c31_20_re * o20_re;
00589     a31_re -= c31_20_im * o20_im;
00590     a31_im += c31_20_re * o20_im;
00591     a31_im += c31_20_im * o20_re;
00592     a31_re += c31_21_re * o21_re;
00593     a31_re -= c31_21_im * o21_im;
00594     a31_im += c31_21_re * o21_im;
00595     a31_im += c31_21_im * o21_re;
00596     a31_re += c31_22_re * o22_re;
00597     a31_re -= c31_22_im * o22_im;
00598     a31_im += c31_22_re * o22_im;
00599     a31_im += c31_22_im * o22_re;
00600     a31_re += c31_30_re * o30_re;
00601     a31_re -= c31_30_im * o30_im;
00602     a31_im += c31_30_re * o30_im;
00603     a31_im += c31_30_im * o30_re;
00604     a31_re += c31_31_re * o31_re;
00605     a31_im += c31_31_re * o31_im;
00606     a31_re += c31_32_re * o32_re;
00607     a31_re -= c31_32_im * o32_im;
00608     a31_im += c31_32_re * o32_im;
00609     a31_im += c31_32_im * o32_re;
00610     
00611     a32_re += c32_20_re * o20_re;
00612     a32_re -= c32_20_im * o20_im;
00613     a32_im += c32_20_re * o20_im;
00614     a32_im += c32_20_im * o20_re;
00615     a32_re += c32_21_re * o21_re;
00616     a32_re -= c32_21_im * o21_im;
00617     a32_im += c32_21_re * o21_im;
00618     a32_im += c32_21_im * o21_re;
00619     a32_re += c32_22_re * o22_re;
00620     a32_re -= c32_22_im * o22_im;
00621     a32_im += c32_22_re * o22_im;
00622     a32_im += c32_22_im * o22_re;
00623     a32_re += c32_30_re * o30_re;
00624     a32_re -= c32_30_im * o30_im;
00625     a32_im += c32_30_re * o30_im;
00626     a32_im += c32_30_im * o30_re;
00627     a32_re += c32_31_re * o31_re;
00628     a32_re -= c32_31_im * o31_im;
00629     a32_im += c32_31_re * o31_im;
00630     a32_im += c32_31_im * o31_re;
00631     a32_re += c32_32_re * o32_re;
00632     a32_im += c32_32_re * o32_im;
00633     
00634     o20_re = a20_re;  o20_im = a20_im;
00635     o21_re = a21_re;  o21_im = a21_im;
00636     o22_re = a22_re;  o22_im = a22_im;
00637     o30_re = a30_re;  o30_im = a30_im;
00638     o31_re = a31_re;  o31_im = a31_im;
00639     o32_re = a32_re;  o32_im = a32_im;
00640     
00641   }
00642   
00643   // change back from chiral basis
00644   // (note: required factor of 1/2 is included in clover term normalization)
00645   {
00646     spinorFloat a00_re =  o10_re + o30_re;
00647     spinorFloat a00_im =  o10_im + o30_im;
00648     spinorFloat a10_re = -o00_re - o20_re;
00649     spinorFloat a10_im = -o00_im - o20_im;
00650     spinorFloat a20_re =  o10_re - o30_re;
00651     spinorFloat a20_im =  o10_im - o30_im;
00652     spinorFloat a30_re = -o00_re + o20_re;
00653     spinorFloat a30_im = -o00_im + o20_im;
00654     
00655     o00_re = a00_re;  o00_im = a00_im;
00656     o10_re = a10_re;  o10_im = a10_im;
00657     o20_re = a20_re;  o20_im = a20_im;
00658     o30_re = a30_re;  o30_im = a30_im;
00659   }
00660   
00661   {
00662     spinorFloat a01_re =  o11_re + o31_re;
00663     spinorFloat a01_im =  o11_im + o31_im;
00664     spinorFloat a11_re = -o01_re - o21_re;
00665     spinorFloat a11_im = -o01_im - o21_im;
00666     spinorFloat a21_re =  o11_re - o31_re;
00667     spinorFloat a21_im =  o11_im - o31_im;
00668     spinorFloat a31_re = -o01_re + o21_re;
00669     spinorFloat a31_im = -o01_im + o21_im;
00670     
00671     o01_re = a01_re;  o01_im = a01_im;
00672     o11_re = a11_re;  o11_im = a11_im;
00673     o21_re = a21_re;  o21_im = a21_im;
00674     o31_re = a31_re;  o31_im = a31_im;
00675   }
00676   
00677   {
00678     spinorFloat a02_re =  o12_re + o32_re;
00679     spinorFloat a02_im =  o12_im + o32_im;
00680     spinorFloat a12_re = -o02_re - o22_re;
00681     spinorFloat a12_im = -o02_im - o22_im;
00682     spinorFloat a22_re =  o12_re - o32_re;
00683     spinorFloat a22_im =  o12_im - o32_im;
00684     spinorFloat a32_re = -o02_re + o22_re;
00685     spinorFloat a32_im = -o02_im + o22_im;
00686     
00687     o02_re = a02_re;  o02_im = a02_im;
00688     o12_re = a12_re;  o12_im = a12_im;
00689     o22_re = a22_re;  o22_im = a22_im;
00690     o32_re = a32_re;  o32_im = a32_im;
00691   }
00692   
00693 #ifdef DSLASH_XPAY
00694   
00695   READ_ACCUM(ACCUMTEX, sp_stride)
00696   
00697 #ifdef SPINOR_DOUBLE
00698   o00_re = a*o00_re + accum0.x;
00699   o00_im = a*o00_im + accum0.y;
00700   o01_re = a*o01_re + accum1.x;
00701   o01_im = a*o01_im + accum1.y;
00702   o02_re = a*o02_re + accum2.x;
00703   o02_im = a*o02_im + accum2.y;
00704   o10_re = a*o10_re + accum3.x;
00705   o10_im = a*o10_im + accum3.y;
00706   o11_re = a*o11_re + accum4.x;
00707   o11_im = a*o11_im + accum4.y;
00708   o12_re = a*o12_re + accum5.x;
00709   o12_im = a*o12_im + accum5.y;
00710   o20_re = a*o20_re + accum6.x;
00711   o20_im = a*o20_im + accum6.y;
00712   o21_re = a*o21_re + accum7.x;
00713   o21_im = a*o21_im + accum7.y;
00714   o22_re = a*o22_re + accum8.x;
00715   o22_im = a*o22_im + accum8.y;
00716   o30_re = a*o30_re + accum9.x;
00717   o30_im = a*o30_im + accum9.y;
00718   o31_re = a*o31_re + accum10.x;
00719   o31_im = a*o31_im + accum10.y;
00720   o32_re = a*o32_re + accum11.x;
00721   o32_im = a*o32_im + accum11.y;
00722 #else
00723   o00_re = a*o00_re + accum0.x;
00724   o00_im = a*o00_im + accum0.y;
00725   o01_re = a*o01_re + accum0.z;
00726   o01_im = a*o01_im + accum0.w;
00727   o02_re = a*o02_re + accum1.x;
00728   o02_im = a*o02_im + accum1.y;
00729   o10_re = a*o10_re + accum1.z;
00730   o10_im = a*o10_im + accum1.w;
00731   o11_re = a*o11_re + accum2.x;
00732   o11_im = a*o11_im + accum2.y;
00733   o12_re = a*o12_re + accum2.z;
00734   o12_im = a*o12_im + accum2.w;
00735   o20_re = a*o20_re + accum3.x;
00736   o20_im = a*o20_im + accum3.y;
00737   o21_re = a*o21_re + accum3.z;
00738   o21_im = a*o21_im + accum3.w;
00739   o22_re = a*o22_re + accum4.x;
00740   o22_im = a*o22_im + accum4.y;
00741   o30_re = a*o30_re + accum4.z;
00742   o30_im = a*o30_im + accum4.w;
00743   o31_re = a*o31_re + accum5.x;
00744   o31_im = a*o31_im + accum5.y;
00745   o32_re = a*o32_re + accum5.z;
00746   o32_im = a*o32_im + accum5.w;
00747 #endif // SPINOR_DOUBLE
00748   
00749 #endif // DSLASH_XPAY
00750 }
00751 
00752 // write spinor field back to device memory
00753 WRITE_SPINOR(sp_stride);
00754 
00755 // undefine to prevent warning when precision is changed
00756 #undef spinorFloat
00757 #undef i00_re
00758 #undef i00_im
00759 #undef i01_re
00760 #undef i01_im
00761 #undef i02_re
00762 #undef i02_im
00763 #undef i10_re
00764 #undef i10_im
00765 #undef i11_re
00766 #undef i11_im
00767 #undef i12_re
00768 #undef i12_im
00769 #undef i20_re
00770 #undef i20_im
00771 #undef i21_re
00772 #undef i21_im
00773 #undef i22_re
00774 #undef i22_im
00775 #undef i30_re
00776 #undef i30_im
00777 #undef i31_re
00778 #undef i31_im
00779 #undef i32_re
00780 #undef i32_im
00781 
00782 #undef c00_00_re
00783 #undef c01_01_re
00784 #undef c02_02_re
00785 #undef c10_10_re
00786 #undef c11_11_re
00787 #undef c12_12_re
00788 #undef c01_00_re
00789 #undef c01_00_im
00790 #undef c02_00_re
00791 #undef c02_00_im
00792 #undef c10_00_re
00793 #undef c10_00_im
00794 #undef c11_00_re
00795 #undef c11_00_im
00796 #undef c12_00_re
00797 #undef c12_00_im
00798 #undef c02_01_re
00799 #undef c02_01_im
00800 #undef c10_01_re
00801 #undef c10_01_im
00802 #undef c11_01_re
00803 #undef c11_01_im
00804 #undef c12_01_re
00805 #undef c12_01_im
00806 #undef c10_02_re
00807 #undef c10_02_im
00808 #undef c11_02_re
00809 #undef c11_02_im
00810 #undef c12_02_re
00811 #undef c12_02_im
00812 #undef c11_10_re
00813 #undef c11_10_im
00814 #undef c12_10_re
00815 #undef c12_10_im
00816 #undef c12_11_re
00817 #undef c12_11_im
00818 
00819 
00820 #undef VOLATILE
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines