|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 // *** CUDA CLOVER *** 00002 00003 #define SHARED_FLOATS_PER_THREAD 8 00004 00005 // input spinor 00006 #ifdef SPINOR_DOUBLE 00007 #define spinorFloat double 00008 #define i00_re I0.x 00009 #define i00_im I0.y 00010 #define i01_re I1.x 00011 #define i01_im I1.y 00012 #define i02_re I2.x 00013 #define i02_im I2.y 00014 #define i10_re I3.x 00015 #define i10_im I3.y 00016 #define i11_re I4.x 00017 #define i11_im I4.y 00018 #define i12_re I5.x 00019 #define i12_im I5.y 00020 #define i20_re I6.x 00021 #define i20_im I6.y 00022 #define i21_re I7.x 00023 #define i21_im I7.y 00024 #define i22_re I8.x 00025 #define i22_im I8.y 00026 #define i30_re I9.x 00027 #define i30_im I9.y 00028 #define i31_re I10.x 00029 #define i31_im I10.y 00030 #define i32_re I11.x 00031 #define i32_im I11.y 00032 #else 00033 #define spinorFloat float 00034 #define i00_re I0.x 00035 #define i00_im I0.y 00036 #define i01_re I0.z 00037 #define i01_im I0.w 00038 #define i02_re I1.x 00039 #define i02_im I1.y 00040 #define i10_re I1.z 00041 #define i10_im I1.w 00042 #define i11_re I2.x 00043 #define i11_im I2.y 00044 #define i12_re I2.z 00045 #define i12_im I2.w 00046 #define i20_re I3.x 00047 #define i20_im I3.y 00048 #define i21_re I3.z 00049 #define i21_im I3.w 00050 #define i22_re I4.x 00051 #define i22_im I4.y 00052 #define i30_re I4.z 00053 #define i30_im I4.w 00054 #define i31_re I5.x 00055 #define i31_im I5.y 00056 #define i32_re I5.z 00057 #define i32_im I5.w 00058 #endif // SPINOR_DOUBLE 00059 00060 // first chiral block of inverted clover term 00061 #ifdef CLOVER_DOUBLE 00062 #define c00_00_re C0.x 00063 #define c01_01_re C0.y 00064 #define c02_02_re C1.x 00065 #define c10_10_re C1.y 00066 #define c11_11_re C2.x 00067 #define c12_12_re C2.y 00068 #define c01_00_re C3.x 00069 #define c01_00_im C3.y 00070 #define c02_00_re C4.x 00071 #define c02_00_im C4.y 00072 #define c10_00_re C5.x 00073 #define c10_00_im C5.y 00074 #define c11_00_re C6.x 00075 #define c11_00_im C6.y 00076 #define c12_00_re C7.x 00077 #define c12_00_im C7.y 00078 #define c02_01_re C8.x 00079 #define c02_01_im C8.y 00080 #define c10_01_re C9.x 00081 #define c10_01_im C9.y 00082 #define c11_01_re C10.x 00083 #define c11_01_im C10.y 00084 #define c12_01_re C11.x 00085 #define c12_01_im C11.y 00086 #define c10_02_re C12.x 00087 #define c10_02_im C12.y 00088 #define c11_02_re C13.x 00089 #define c11_02_im C13.y 00090 #define c12_02_re C14.x 00091 #define c12_02_im C14.y 00092 #define c11_10_re C15.x 00093 #define c11_10_im C15.y 00094 #define c12_10_re C16.x 00095 #define c12_10_im C16.y 00096 #define c12_11_re C17.x 00097 #define c12_11_im C17.y 00098 #else 00099 #define c00_00_re C0.x 00100 #define c01_01_re C0.y 00101 #define c02_02_re C0.z 00102 #define c10_10_re C0.w 00103 #define c11_11_re C1.x 00104 #define c12_12_re C1.y 00105 #define c01_00_re C1.z 00106 #define c01_00_im C1.w 00107 #define c02_00_re C2.x 00108 #define c02_00_im C2.y 00109 #define c10_00_re C2.z 00110 #define c10_00_im C2.w 00111 #define c11_00_re C3.x 00112 #define c11_00_im C3.y 00113 #define c12_00_re C3.z 00114 #define c12_00_im C3.w 00115 #define c02_01_re C4.x 00116 #define c02_01_im C4.y 00117 #define c10_01_re C4.z 00118 #define c10_01_im C4.w 00119 #define c11_01_re C5.x 00120 #define c11_01_im C5.y 00121 #define c12_01_re C5.z 00122 #define c12_01_im C5.w 00123 #define c10_02_re C6.x 00124 #define c10_02_im C6.y 00125 #define c11_02_re C6.z 00126 #define c11_02_im C6.w 00127 #define c12_02_re C7.x 00128 #define c12_02_im C7.y 00129 #define c11_10_re C7.z 00130 #define c11_10_im C7.w 00131 #define c12_10_re C8.x 00132 #define c12_10_im C8.y 00133 #define c12_11_re C8.z 00134 #define c12_11_im C8.w 00135 #endif // CLOVER_DOUBLE 00136 00137 #define c00_01_re (+c01_00_re) 00138 #define c00_01_im (-c01_00_im) 00139 #define c00_02_re (+c02_00_re) 00140 #define c00_02_im (-c02_00_im) 00141 #define c01_02_re (+c02_01_re) 00142 #define c01_02_im (-c02_01_im) 00143 #define c00_10_re (+c10_00_re) 00144 #define c00_10_im (-c10_00_im) 00145 #define c01_10_re (+c10_01_re) 00146 #define c01_10_im (-c10_01_im) 00147 #define c02_10_re (+c10_02_re) 00148 #define c02_10_im (-c10_02_im) 00149 #define c00_11_re (+c11_00_re) 00150 #define c00_11_im (-c11_00_im) 00151 #define c01_11_re (+c11_01_re) 00152 #define c01_11_im (-c11_01_im) 00153 #define c02_11_re (+c11_02_re) 00154 #define c02_11_im (-c11_02_im) 00155 #define c10_11_re (+c11_10_re) 00156 #define c10_11_im (-c11_10_im) 00157 #define c00_12_re (+c12_00_re) 00158 #define c00_12_im (-c12_00_im) 00159 #define c01_12_re (+c12_01_re) 00160 #define c01_12_im (-c12_01_im) 00161 #define c02_12_re (+c12_02_re) 00162 #define c02_12_im (-c12_02_im) 00163 #define c10_12_re (+c12_10_re) 00164 #define c10_12_im (-c12_10_im) 00165 #define c11_12_re (+c12_11_re) 00166 #define c11_12_im (-c12_11_im) 00167 00168 // second chiral block of inverted clover term (reuses C0,...,C9) 00169 #define c20_20_re c00_00_re 00170 #define c21_20_re c01_00_re 00171 #define c21_20_im c01_00_im 00172 #define c22_20_re c02_00_re 00173 #define c22_20_im c02_00_im 00174 #define c30_20_re c10_00_re 00175 #define c30_20_im c10_00_im 00176 #define c31_20_re c11_00_re 00177 #define c31_20_im c11_00_im 00178 #define c32_20_re c12_00_re 00179 #define c32_20_im c12_00_im 00180 #define c20_21_re c00_01_re 00181 #define c20_21_im c00_01_im 00182 #define c21_21_re c01_01_re 00183 #define c22_21_re c02_01_re 00184 #define c22_21_im c02_01_im 00185 #define c30_21_re c10_01_re 00186 #define c30_21_im c10_01_im 00187 #define c31_21_re c11_01_re 00188 #define c31_21_im c11_01_im 00189 #define c32_21_re c12_01_re 00190 #define c32_21_im c12_01_im 00191 #define c20_22_re c00_02_re 00192 #define c20_22_im c00_02_im 00193 #define c21_22_re c01_02_re 00194 #define c21_22_im c01_02_im 00195 #define c22_22_re c02_02_re 00196 #define c30_22_re c10_02_re 00197 #define c30_22_im c10_02_im 00198 #define c31_22_re c11_02_re 00199 #define c31_22_im c11_02_im 00200 #define c32_22_re c12_02_re 00201 #define c32_22_im c12_02_im 00202 #define c20_30_re c00_10_re 00203 #define c20_30_im c00_10_im 00204 #define c21_30_re c01_10_re 00205 #define c21_30_im c01_10_im 00206 #define c22_30_re c02_10_re 00207 #define c22_30_im c02_10_im 00208 #define c30_30_re c10_10_re 00209 #define c31_30_re c11_10_re 00210 #define c31_30_im c11_10_im 00211 #define c32_30_re c12_10_re 00212 #define c32_30_im c12_10_im 00213 #define c20_31_re c00_11_re 00214 #define c20_31_im c00_11_im 00215 #define c21_31_re c01_11_re 00216 #define c21_31_im c01_11_im 00217 #define c22_31_re c02_11_re 00218 #define c22_31_im c02_11_im 00219 #define c30_31_re c10_11_re 00220 #define c30_31_im c10_11_im 00221 #define c31_31_re c11_11_re 00222 #define c32_31_re c12_11_re 00223 #define c32_31_im c12_11_im 00224 #define c20_32_re c00_12_re 00225 #define c20_32_im c00_12_im 00226 #define c21_32_re c01_12_re 00227 #define c21_32_im c01_12_im 00228 #define c22_32_re c02_12_re 00229 #define c22_32_im c02_12_im 00230 #define c30_32_re c10_12_re 00231 #define c30_32_im c10_12_im 00232 #define c31_32_re c11_12_re 00233 #define c31_32_im c11_12_im 00234 #define c32_32_re c12_12_re 00235 00236 // output spinor 00237 #define o00_re s[0*SHARED_STRIDE] 00238 #define o00_im s[1*SHARED_STRIDE] 00239 #define o01_re s[2*SHARED_STRIDE] 00240 #define o01_im s[3*SHARED_STRIDE] 00241 #define o02_re s[4*SHARED_STRIDE] 00242 #define o02_im s[5*SHARED_STRIDE] 00243 #define o10_re s[6*SHARED_STRIDE] 00244 #define o10_im s[7*SHARED_STRIDE] 00245 volatile spinorFloat o11_re; 00246 volatile spinorFloat o11_im; 00247 volatile spinorFloat o12_re; 00248 volatile spinorFloat o12_im; 00249 volatile spinorFloat o20_re; 00250 volatile spinorFloat o20_im; 00251 volatile spinorFloat o21_re; 00252 volatile spinorFloat o21_im; 00253 volatile spinorFloat o22_re; 00254 volatile spinorFloat o22_im; 00255 volatile spinorFloat o30_re; 00256 volatile spinorFloat o30_im; 00257 volatile spinorFloat o31_re; 00258 volatile spinorFloat o31_im; 00259 volatile spinorFloat o32_re; 00260 volatile spinorFloat o32_im; 00261 00262 #include "read_clover.h" 00263 #include "io_spinor.h" 00264 00265 #define sp_idx sid // alias needed by READ_SPINOR() 00266 00267 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00268 00269 #ifdef SPINOR_DOUBLE 00270 #if (__CUDA_ARCH__ >= 200) 00271 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 00272 #else 00273 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 00274 #endif 00275 extern __shared__ spinorFloat sd_data[]; 00276 volatile spinorFloat *s = sd_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 00277 + (threadIdx.x % SHARED_STRIDE); 00278 #else 00279 #if (__CUDA_ARCH__ >= 200) 00280 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 00281 #else 00282 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 00283 #endif 00284 extern __shared__ spinorFloat ss_data[]; 00285 volatile spinorFloat *s = ss_data + SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 00286 + (threadIdx.x % SHARED_STRIDE); 00287 #endif 00288 00289 // read spinor from device memory 00290 READ_SPINOR(SPINORTEX); 00291 00292 // change to chiral basis 00293 { 00294 o00_re = -i10_re - i30_re; 00295 o00_im = -i10_im - i30_im; 00296 o10_re = i00_re + i20_re; 00297 o10_im = i00_im + i20_im; 00298 o20_re = -i10_re + i30_re; 00299 o20_im = -i10_im + i30_im; 00300 o30_re = i00_re - i20_re; 00301 o30_im = i00_im - i20_im; 00302 00303 o01_re = -i11_re - i31_re; 00304 o01_im = -i11_im - i31_im; 00305 o11_re = i01_re + i21_re; 00306 o11_im = i01_im + i21_im; 00307 o21_re = -i11_re + i31_re; 00308 o21_im = -i11_im + i31_im; 00309 o31_re = i01_re - i21_re; 00310 o31_im = i01_im - i21_im; 00311 00312 o02_re = -i12_re - i32_re; 00313 o02_im = -i12_im - i32_im; 00314 o12_re = i02_re + i22_re; 00315 o12_im = i02_im + i22_im; 00316 o22_re = -i12_re + i32_re; 00317 o22_im = -i12_im + i32_im; 00318 o32_re = i02_re - i22_re; 00319 o32_im = i02_im - i22_im; 00320 } 00321 00322 // apply first chiral block 00323 { 00324 READ_CLOVER(CLOVERTEX, 0) 00325 00326 spinorFloat a00_re = 0; spinorFloat a00_im = 0; 00327 spinorFloat a01_re = 0; spinorFloat a01_im = 0; 00328 spinorFloat a02_re = 0; spinorFloat a02_im = 0; 00329 spinorFloat a10_re = 0; spinorFloat a10_im = 0; 00330 spinorFloat a11_re = 0; spinorFloat a11_im = 0; 00331 spinorFloat a12_re = 0; spinorFloat a12_im = 0; 00332 00333 a00_re += c00_00_re * o00_re; 00334 a00_im += c00_00_re * o00_im; 00335 a00_re += c00_01_re * o01_re - c00_01_im * o01_im; 00336 a00_im += c00_01_re * o01_im + c00_01_im * o01_re; 00337 a00_re += c00_02_re * o02_re - c00_02_im * o02_im; 00338 a00_im += c00_02_re * o02_im + c00_02_im * o02_re; 00339 a00_re += c00_10_re * o10_re - c00_10_im * o10_im; 00340 a00_im += c00_10_re * o10_im + c00_10_im * o10_re; 00341 a00_re += c00_11_re * o11_re - c00_11_im * o11_im; 00342 a00_im += c00_11_re * o11_im + c00_11_im * o11_re; 00343 a00_re += c00_12_re * o12_re - c00_12_im * o12_im; 00344 a00_im += c00_12_re * o12_im + c00_12_im * o12_re; 00345 00346 a01_re += c01_00_re * o00_re - c01_00_im * o00_im; 00347 a01_im += c01_00_re * o00_im + c01_00_im * o00_re; 00348 a01_re += c01_01_re * o01_re; 00349 a01_im += c01_01_re * o01_im; 00350 a01_re += c01_02_re * o02_re - c01_02_im * o02_im; 00351 a01_im += c01_02_re * o02_im + c01_02_im * o02_re; 00352 a01_re += c01_10_re * o10_re - c01_10_im * o10_im; 00353 a01_im += c01_10_re * o10_im + c01_10_im * o10_re; 00354 a01_re += c01_11_re * o11_re - c01_11_im * o11_im; 00355 a01_im += c01_11_re * o11_im + c01_11_im * o11_re; 00356 a01_re += c01_12_re * o12_re - c01_12_im * o12_im; 00357 a01_im += c01_12_re * o12_im + c01_12_im * o12_re; 00358 00359 a02_re += c02_00_re * o00_re - c02_00_im * o00_im; 00360 a02_im += c02_00_re * o00_im + c02_00_im * o00_re; 00361 a02_re += c02_01_re * o01_re - c02_01_im * o01_im; 00362 a02_im += c02_01_re * o01_im + c02_01_im * o01_re; 00363 a02_re += c02_02_re * o02_re; 00364 a02_im += c02_02_re * o02_im; 00365 a02_re += c02_10_re * o10_re - c02_10_im * o10_im; 00366 a02_im += c02_10_re * o10_im + c02_10_im * o10_re; 00367 a02_re += c02_11_re * o11_re - c02_11_im * o11_im; 00368 a02_im += c02_11_re * o11_im + c02_11_im * o11_re; 00369 a02_re += c02_12_re * o12_re - c02_12_im * o12_im; 00370 a02_im += c02_12_re * o12_im + c02_12_im * o12_re; 00371 00372 a10_re += c10_00_re * o00_re - c10_00_im * o00_im; 00373 a10_im += c10_00_re * o00_im + c10_00_im * o00_re; 00374 a10_re += c10_01_re * o01_re - c10_01_im * o01_im; 00375 a10_im += c10_01_re * o01_im + c10_01_im * o01_re; 00376 a10_re += c10_02_re * o02_re - c10_02_im * o02_im; 00377 a10_im += c10_02_re * o02_im + c10_02_im * o02_re; 00378 a10_re += c10_10_re * o10_re; 00379 a10_im += c10_10_re * o10_im; 00380 a10_re += c10_11_re * o11_re - c10_11_im * o11_im; 00381 a10_im += c10_11_re * o11_im + c10_11_im * o11_re; 00382 a10_re += c10_12_re * o12_re - c10_12_im * o12_im; 00383 a10_im += c10_12_re * o12_im + c10_12_im * o12_re; 00384 00385 a11_re += c11_00_re * o00_re - c11_00_im * o00_im; 00386 a11_im += c11_00_re * o00_im + c11_00_im * o00_re; 00387 a11_re += c11_01_re * o01_re - c11_01_im * o01_im; 00388 a11_im += c11_01_re * o01_im + c11_01_im * o01_re; 00389 a11_re += c11_02_re * o02_re - c11_02_im * o02_im; 00390 a11_im += c11_02_re * o02_im + c11_02_im * o02_re; 00391 a11_re += c11_10_re * o10_re - c11_10_im * o10_im; 00392 a11_im += c11_10_re * o10_im + c11_10_im * o10_re; 00393 a11_re += c11_11_re * o11_re; 00394 a11_im += c11_11_re * o11_im; 00395 a11_re += c11_12_re * o12_re - c11_12_im * o12_im; 00396 a11_im += c11_12_re * o12_im + c11_12_im * o12_re; 00397 00398 a12_re += c12_00_re * o00_re - c12_00_im * o00_im; 00399 a12_im += c12_00_re * o00_im + c12_00_im * o00_re; 00400 a12_re += c12_01_re * o01_re - c12_01_im * o01_im; 00401 a12_im += c12_01_re * o01_im + c12_01_im * o01_re; 00402 a12_re += c12_02_re * o02_re - c12_02_im * o02_im; 00403 a12_im += c12_02_re * o02_im + c12_02_im * o02_re; 00404 a12_re += c12_10_re * o10_re - c12_10_im * o10_im; 00405 a12_im += c12_10_re * o10_im + c12_10_im * o10_re; 00406 a12_re += c12_11_re * o11_re - c12_11_im * o11_im; 00407 a12_im += c12_11_re * o11_im + c12_11_im * o11_re; 00408 a12_re += c12_12_re * o12_re; 00409 a12_im += c12_12_re * o12_im; 00410 00411 o00_re = a00_re; o00_im = a00_im; 00412 o01_re = a01_re; o01_im = a01_im; 00413 o02_re = a02_re; o02_im = a02_im; 00414 o10_re = a10_re; o10_im = a10_im; 00415 o11_re = a11_re; o11_im = a11_im; 00416 o12_re = a12_re; o12_im = a12_im; 00417 } 00418 00419 // apply second chiral block 00420 { 00421 READ_CLOVER(CLOVERTEX, 1) 00422 00423 spinorFloat a20_re = 0; spinorFloat a20_im = 0; 00424 spinorFloat a21_re = 0; spinorFloat a21_im = 0; 00425 spinorFloat a22_re = 0; spinorFloat a22_im = 0; 00426 spinorFloat a30_re = 0; spinorFloat a30_im = 0; 00427 spinorFloat a31_re = 0; spinorFloat a31_im = 0; 00428 spinorFloat a32_re = 0; spinorFloat a32_im = 0; 00429 00430 a20_re += c20_20_re * o20_re; 00431 a20_im += c20_20_re * o20_im; 00432 a20_re += c20_21_re * o21_re - c20_21_im * o21_im; 00433 a20_im += c20_21_re * o21_im + c20_21_im * o21_re; 00434 a20_re += c20_22_re * o22_re - c20_22_im * o22_im; 00435 a20_im += c20_22_re * o22_im + c20_22_im * o22_re; 00436 a20_re += c20_30_re * o30_re - c20_30_im * o30_im; 00437 a20_im += c20_30_re * o30_im + c20_30_im * o30_re; 00438 a20_re += c20_31_re * o31_re - c20_31_im * o31_im; 00439 a20_im += c20_31_re * o31_im + c20_31_im * o31_re; 00440 a20_re += c20_32_re * o32_re - c20_32_im * o32_im; 00441 a20_im += c20_32_re * o32_im + c20_32_im * o32_re; 00442 00443 a21_re += c21_20_re * o20_re - c21_20_im * o20_im; 00444 a21_im += c21_20_re * o20_im + c21_20_im * o20_re; 00445 a21_re += c21_21_re * o21_re; 00446 a21_im += c21_21_re * o21_im; 00447 a21_re += c21_22_re * o22_re - c21_22_im * o22_im; 00448 a21_im += c21_22_re * o22_im + c21_22_im * o22_re; 00449 a21_re += c21_30_re * o30_re - c21_30_im * o30_im; 00450 a21_im += c21_30_re * o30_im + c21_30_im * o30_re; 00451 a21_re += c21_31_re * o31_re - c21_31_im * o31_im; 00452 a21_im += c21_31_re * o31_im + c21_31_im * o31_re; 00453 a21_re += c21_32_re * o32_re - c21_32_im * o32_im; 00454 a21_im += c21_32_re * o32_im + c21_32_im * o32_re; 00455 00456 a22_re += c22_20_re * o20_re - c22_20_im * o20_im; 00457 a22_im += c22_20_re * o20_im + c22_20_im * o20_re; 00458 a22_re += c22_21_re * o21_re - c22_21_im * o21_im; 00459 a22_im += c22_21_re * o21_im + c22_21_im * o21_re; 00460 a22_re += c22_22_re * o22_re; 00461 a22_im += c22_22_re * o22_im; 00462 a22_re += c22_30_re * o30_re - c22_30_im * o30_im; 00463 a22_im += c22_30_re * o30_im + c22_30_im * o30_re; 00464 a22_re += c22_31_re * o31_re - c22_31_im * o31_im; 00465 a22_im += c22_31_re * o31_im + c22_31_im * o31_re; 00466 a22_re += c22_32_re * o32_re - c22_32_im * o32_im; 00467 a22_im += c22_32_re * o32_im + c22_32_im * o32_re; 00468 00469 a30_re += c30_20_re * o20_re - c30_20_im * o20_im; 00470 a30_im += c30_20_re * o20_im + c30_20_im * o20_re; 00471 a30_re += c30_21_re * o21_re - c30_21_im * o21_im; 00472 a30_im += c30_21_re * o21_im + c30_21_im * o21_re; 00473 a30_re += c30_22_re * o22_re - c30_22_im * o22_im; 00474 a30_im += c30_22_re * o22_im + c30_22_im * o22_re; 00475 a30_re += c30_30_re * o30_re; 00476 a30_im += c30_30_re * o30_im; 00477 a30_re += c30_31_re * o31_re - c30_31_im * o31_im; 00478 a30_im += c30_31_re * o31_im + c30_31_im * o31_re; 00479 a30_re += c30_32_re * o32_re - c30_32_im * o32_im; 00480 a30_im += c30_32_re * o32_im + c30_32_im * o32_re; 00481 00482 a31_re += c31_20_re * o20_re - c31_20_im * o20_im; 00483 a31_im += c31_20_re * o20_im + c31_20_im * o20_re; 00484 a31_re += c31_21_re * o21_re - c31_21_im * o21_im; 00485 a31_im += c31_21_re * o21_im + c31_21_im * o21_re; 00486 a31_re += c31_22_re * o22_re - c31_22_im * o22_im; 00487 a31_im += c31_22_re * o22_im + c31_22_im * o22_re; 00488 a31_re += c31_30_re * o30_re - c31_30_im * o30_im; 00489 a31_im += c31_30_re * o30_im + c31_30_im * o30_re; 00490 a31_re += c31_31_re * o31_re; 00491 a31_im += c31_31_re * o31_im; 00492 a31_re += c31_32_re * o32_re - c31_32_im * o32_im; 00493 a31_im += c31_32_re * o32_im + c31_32_im * o32_re; 00494 00495 a32_re += c32_20_re * o20_re - c32_20_im * o20_im; 00496 a32_im += c32_20_re * o20_im + c32_20_im * o20_re; 00497 a32_re += c32_21_re * o21_re - c32_21_im * o21_im; 00498 a32_im += c32_21_re * o21_im + c32_21_im * o21_re; 00499 a32_re += c32_22_re * o22_re - c32_22_im * o22_im; 00500 a32_im += c32_22_re * o22_im + c32_22_im * o22_re; 00501 a32_re += c32_30_re * o30_re - c32_30_im * o30_im; 00502 a32_im += c32_30_re * o30_im + c32_30_im * o30_re; 00503 a32_re += c32_31_re * o31_re - c32_31_im * o31_im; 00504 a32_im += c32_31_re * o31_im + c32_31_im * o31_re; 00505 a32_re += c32_32_re * o32_re; 00506 a32_im += c32_32_re * o32_im; 00507 00508 o20_re = a20_re; o20_im = a20_im; 00509 o21_re = a21_re; o21_im = a21_im; 00510 o22_re = a22_re; o22_im = a22_im; 00511 o30_re = a30_re; o30_im = a30_im; 00512 o31_re = a31_re; o31_im = a31_im; 00513 o32_re = a32_re; o32_im = a32_im; 00514 } 00515 00516 // change back from chiral basis 00517 // (note: required factor of 1/2 is included in clover term normalization) 00518 { 00519 spinorFloat a00_re = o10_re + o30_re; 00520 spinorFloat a00_im = o10_im + o30_im; 00521 spinorFloat a10_re = -o00_re - o20_re; 00522 spinorFloat a10_im = -o00_im - o20_im; 00523 spinorFloat a20_re = o10_re - o30_re; 00524 spinorFloat a20_im = o10_im - o30_im; 00525 spinorFloat a30_re = -o00_re + o20_re; 00526 spinorFloat a30_im = -o00_im + o20_im; 00527 00528 o00_re = a00_re; o00_im = a00_im; 00529 o10_re = a10_re; o10_im = a10_im; 00530 o20_re = a20_re; o20_im = a20_im; 00531 o30_re = a30_re; o30_im = a30_im; 00532 } 00533 { 00534 spinorFloat a01_re = o11_re + o31_re; 00535 spinorFloat a01_im = o11_im + o31_im; 00536 spinorFloat a11_re = -o01_re - o21_re; 00537 spinorFloat a11_im = -o01_im - o21_im; 00538 spinorFloat a21_re = o11_re - o31_re; 00539 spinorFloat a21_im = o11_im - o31_im; 00540 spinorFloat a31_re = -o01_re + o21_re; 00541 spinorFloat a31_im = -o01_im + o21_im; 00542 00543 o01_re = a01_re; o01_im = a01_im; 00544 o11_re = a11_re; o11_im = a11_im; 00545 o21_re = a21_re; o21_im = a21_im; 00546 o31_re = a31_re; o31_im = a31_im; 00547 } 00548 { 00549 spinorFloat a02_re = o12_re + o32_re; 00550 spinorFloat a02_im = o12_im + o32_im; 00551 spinorFloat a12_re = -o02_re - o22_re; 00552 spinorFloat a12_im = -o02_im - o22_im; 00553 spinorFloat a22_re = o12_re - o32_re; 00554 spinorFloat a22_im = o12_im - o32_im; 00555 spinorFloat a32_re = -o02_re + o22_re; 00556 spinorFloat a32_im = -o02_im + o22_im; 00557 00558 o02_re = a02_re; o02_im = a02_im; 00559 o12_re = a12_re; o12_im = a12_im; 00560 o22_re = a22_re; o22_im = a22_im; 00561 o32_re = a32_re; o32_im = a32_im; 00562 } 00563 00564 #ifdef DSLASH_XPAY 00565 READ_ACCUM(ACCUMTEX) 00566 #ifdef SPINOR_DOUBLE 00567 o00_re = a*o00_re + accum0.x; 00568 o00_im = a*o00_im + accum0.y; 00569 o01_re = a*o01_re + accum1.x; 00570 o01_im = a*o01_im + accum1.y; 00571 o02_re = a*o02_re + accum2.x; 00572 o02_im = a*o02_im + accum2.y; 00573 o10_re = a*o10_re + accum3.x; 00574 o10_im = a*o10_im + accum3.y; 00575 o11_re = a*o11_re + accum4.x; 00576 o11_im = a*o11_im + accum4.y; 00577 o12_re = a*o12_re + accum5.x; 00578 o12_im = a*o12_im + accum5.y; 00579 o20_re = a*o20_re + accum6.x; 00580 o20_im = a*o20_im + accum6.y; 00581 o21_re = a*o21_re + accum7.x; 00582 o21_im = a*o21_im + accum7.y; 00583 o22_re = a*o22_re + accum8.x; 00584 o22_im = a*o22_im + accum8.y; 00585 o30_re = a*o30_re + accum9.x; 00586 o30_im = a*o30_im + accum9.y; 00587 o31_re = a*o31_re + accum10.x; 00588 o31_im = a*o31_im + accum10.y; 00589 o32_re = a*o32_re + accum11.x; 00590 o32_im = a*o32_im + accum11.y; 00591 #else 00592 o00_re = a*o00_re + accum0.x; 00593 o00_im = a*o00_im + accum0.y; 00594 o01_re = a*o01_re + accum0.z; 00595 o01_im = a*o01_im + accum0.w; 00596 o02_re = a*o02_re + accum1.x; 00597 o02_im = a*o02_im + accum1.y; 00598 o10_re = a*o10_re + accum1.z; 00599 o10_im = a*o10_im + accum1.w; 00600 o11_re = a*o11_re + accum2.x; 00601 o11_im = a*o11_im + accum2.y; 00602 o12_re = a*o12_re + accum2.z; 00603 o12_im = a*o12_im + accum2.w; 00604 o20_re = a*o20_re + accum3.x; 00605 o20_im = a*o20_im + accum3.y; 00606 o21_re = a*o21_re + accum3.z; 00607 o21_im = a*o21_im + accum3.w; 00608 o22_re = a*o22_re + accum4.x; 00609 o22_im = a*o22_im + accum4.y; 00610 o30_re = a*o30_re + accum4.z; 00611 o30_im = a*o30_im + accum4.w; 00612 o31_re = a*o31_re + accum5.x; 00613 o31_im = a*o31_im + accum5.y; 00614 o32_re = a*o32_re + accum5.z; 00615 o32_im = a*o32_im + accum5.w; 00616 #endif // SPINOR_DOUBLE 00617 #endif // DSLASH_XPAY 00618 00619 // write spinor field back to device memory 00620 WRITE_SPINOR(); 00621 00622 // undefine to prevent warning when precision is changed 00623 #undef spinorFloat 00624 #undef SHARED_STRIDE 00625 00626 #undef i00_re 00627 #undef i00_im 00628 #undef i01_re 00629 #undef i01_im 00630 #undef i02_re 00631 #undef i02_im 00632 #undef i10_re 00633 #undef i10_im 00634 #undef i11_re 00635 #undef i11_im 00636 #undef i12_re 00637 #undef i12_im 00638 #undef i20_re 00639 #undef i20_im 00640 #undef i21_re 00641 #undef i21_im 00642 #undef i22_re 00643 #undef i22_im 00644 #undef i30_re 00645 #undef i30_im 00646 #undef i31_re 00647 #undef i31_im 00648 #undef i32_re 00649 #undef i32_im 00650 00651 #undef c00_00_re 00652 #undef c01_01_re 00653 #undef c02_02_re 00654 #undef c10_10_re 00655 #undef c11_11_re 00656 #undef c12_12_re 00657 #undef c01_00_re 00658 #undef c01_00_im 00659 #undef c02_00_re 00660 #undef c02_00_im 00661 #undef c10_00_re 00662 #undef c10_00_im 00663 #undef c11_00_re 00664 #undef c11_00_im 00665 #undef c12_00_re 00666 #undef c12_00_im 00667 #undef c02_01_re 00668 #undef c02_01_im 00669 #undef c10_01_re 00670 #undef c10_01_im 00671 #undef c11_01_re 00672 #undef c11_01_im 00673 #undef c12_01_re 00674 #undef c12_01_im 00675 #undef c10_02_re 00676 #undef c10_02_im 00677 #undef c11_02_re 00678 #undef c11_02_im 00679 #undef c12_02_re 00680 #undef c12_02_im 00681 #undef c11_10_re 00682 #undef c11_10_im 00683 #undef c12_10_re 00684 #undef c12_10_im 00685 #undef c12_11_re 00686 #undef c12_11_im 00687 00688 #undef o00_re 00689 #undef o00_im 00690 #undef o01_re 00691 #undef o01_im 00692 #undef o02_re 00693 #undef o02_im 00694 #undef o10_re 00695 #undef o10_im 00696 00697 #undef sp_idx // alias needed by READ_SPINOR()
1.7.3