QUDA v0.4.0
A library for QCD on GPUs
|
00001 // *** CUDA CLOVER *** 00002 00003 #define CLOVER_SHARED_FLOATS_PER_THREAD 0 00004 00005 00006 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 00007 #define VOLATILE 00008 #else // Open64 compiler 00009 #define VOLATILE volatile 00010 #endif 00011 // input spinor 00012 #ifdef SPINOR_DOUBLE 00013 #define spinorFloat double 00014 #define i00_re I0.x 00015 #define i00_im I0.y 00016 #define i01_re I1.x 00017 #define i01_im I1.y 00018 #define i02_re I2.x 00019 #define i02_im I2.y 00020 #define i10_re I3.x 00021 #define i10_im I3.y 00022 #define i11_re I4.x 00023 #define i11_im I4.y 00024 #define i12_re I5.x 00025 #define i12_im I5.y 00026 #define i20_re I6.x 00027 #define i20_im I6.y 00028 #define i21_re I7.x 00029 #define i21_im I7.y 00030 #define i22_re I8.x 00031 #define i22_im I8.y 00032 #define i30_re I9.x 00033 #define i30_im I9.y 00034 #define i31_re I10.x 00035 #define i31_im I10.y 00036 #define i32_re I11.x 00037 #define i32_im I11.y 00038 #else 00039 #define spinorFloat float 00040 #define i00_re I0.x 00041 #define i00_im I0.y 00042 #define i01_re I0.z 00043 #define i01_im I0.w 00044 #define i02_re I1.x 00045 #define i02_im I1.y 00046 #define i10_re I1.z 00047 #define i10_im I1.w 00048 #define i11_re I2.x 00049 #define i11_im I2.y 00050 #define i12_re I2.z 00051 #define i12_im I2.w 00052 #define i20_re I3.x 00053 #define i20_im I3.y 00054 #define i21_re I3.z 00055 #define i21_im I3.w 00056 #define i22_re I4.x 00057 #define i22_im I4.y 00058 #define i30_re I4.z 00059 #define i30_im I4.w 00060 #define i31_re I5.x 00061 #define i31_im I5.y 00062 #define i32_re I5.z 00063 #define i32_im I5.w 00064 #endif // SPINOR_DOUBLE 00065 00066 // first chiral block of inverted clover term 00067 #ifdef CLOVER_DOUBLE 00068 #define c00_00_re C0.x 00069 #define c01_01_re C0.y 00070 #define c02_02_re C1.x 00071 #define c10_10_re C1.y 00072 #define c11_11_re C2.x 00073 #define c12_12_re C2.y 00074 #define c01_00_re C3.x 00075 #define c01_00_im C3.y 00076 #define c02_00_re C4.x 00077 #define c02_00_im C4.y 00078 #define c10_00_re C5.x 00079 #define c10_00_im C5.y 00080 #define c11_00_re C6.x 00081 #define c11_00_im C6.y 00082 #define c12_00_re C7.x 00083 #define c12_00_im C7.y 00084 #define c02_01_re C8.x 00085 #define c02_01_im C8.y 00086 #define c10_01_re C9.x 00087 #define c10_01_im C9.y 00088 #define c11_01_re C10.x 00089 #define c11_01_im C10.y 00090 #define c12_01_re C11.x 00091 #define c12_01_im C11.y 00092 #define c10_02_re C12.x 00093 #define c10_02_im C12.y 00094 #define c11_02_re C13.x 00095 #define c11_02_im C13.y 00096 #define c12_02_re C14.x 00097 #define c12_02_im C14.y 00098 #define c11_10_re C15.x 00099 #define c11_10_im C15.y 00100 #define c12_10_re C16.x 00101 #define c12_10_im C16.y 00102 #define c12_11_re C17.x 00103 #define c12_11_im C17.y 00104 #else 00105 #define c00_00_re C0.x 00106 #define c01_01_re C0.y 00107 #define c02_02_re C0.z 00108 #define c10_10_re C0.w 00109 #define c11_11_re C1.x 00110 #define c12_12_re C1.y 00111 #define c01_00_re C1.z 00112 #define c01_00_im C1.w 00113 #define c02_00_re C2.x 00114 #define c02_00_im C2.y 00115 #define c10_00_re C2.z 00116 #define c10_00_im C2.w 00117 #define c11_00_re C3.x 00118 #define c11_00_im C3.y 00119 #define c12_00_re C3.z 00120 #define c12_00_im C3.w 00121 #define c02_01_re C4.x 00122 #define c02_01_im C4.y 00123 #define c10_01_re C4.z 00124 #define c10_01_im C4.w 00125 #define c11_01_re C5.x 00126 #define c11_01_im C5.y 00127 #define c12_01_re C5.z 00128 #define c12_01_im C5.w 00129 #define c10_02_re C6.x 00130 #define c10_02_im C6.y 00131 #define c11_02_re C6.z 00132 #define c11_02_im C6.w 00133 #define c12_02_re C7.x 00134 #define c12_02_im C7.y 00135 #define c11_10_re C7.z 00136 #define c11_10_im C7.w 00137 #define c12_10_re C8.x 00138 #define c12_10_im C8.y 00139 #define c12_11_re C8.z 00140 #define c12_11_im C8.w 00141 #endif // CLOVER_DOUBLE 00142 00143 #define c00_01_re (+c01_00_re) 00144 #define c00_01_im (-c01_00_im) 00145 #define c00_02_re (+c02_00_re) 00146 #define c00_02_im (-c02_00_im) 00147 #define c01_02_re (+c02_01_re) 00148 #define c01_02_im (-c02_01_im) 00149 #define c00_10_re (+c10_00_re) 00150 #define c00_10_im (-c10_00_im) 00151 #define c01_10_re (+c10_01_re) 00152 #define c01_10_im (-c10_01_im) 00153 #define c02_10_re (+c10_02_re) 00154 #define c02_10_im (-c10_02_im) 00155 #define c00_11_re (+c11_00_re) 00156 #define c00_11_im (-c11_00_im) 00157 #define c01_11_re (+c11_01_re) 00158 #define c01_11_im (-c11_01_im) 00159 #define c02_11_re (+c11_02_re) 00160 #define c02_11_im (-c11_02_im) 00161 #define c10_11_re (+c11_10_re) 00162 #define c10_11_im (-c11_10_im) 00163 #define c00_12_re (+c12_00_re) 00164 #define c00_12_im (-c12_00_im) 00165 #define c01_12_re (+c12_01_re) 00166 #define c01_12_im (-c12_01_im) 00167 #define c02_12_re (+c12_02_re) 00168 #define c02_12_im (-c12_02_im) 00169 #define c10_12_re (+c12_10_re) 00170 #define c10_12_im (-c12_10_im) 00171 #define c11_12_re (+c12_11_re) 00172 #define c11_12_im (-c12_11_im) 00173 00174 // second chiral block of inverted clover term (reuses C0,...,C9) 00175 #define c20_20_re c00_00_re 00176 #define c21_20_re c01_00_re 00177 #define c21_20_im c01_00_im 00178 #define c22_20_re c02_00_re 00179 #define c22_20_im c02_00_im 00180 #define c30_20_re c10_00_re 00181 #define c30_20_im c10_00_im 00182 #define c31_20_re c11_00_re 00183 #define c31_20_im c11_00_im 00184 #define c32_20_re c12_00_re 00185 #define c32_20_im c12_00_im 00186 #define c20_21_re c00_01_re 00187 #define c20_21_im c00_01_im 00188 #define c21_21_re c01_01_re 00189 #define c22_21_re c02_01_re 00190 #define c22_21_im c02_01_im 00191 #define c30_21_re c10_01_re 00192 #define c30_21_im c10_01_im 00193 #define c31_21_re c11_01_re 00194 #define c31_21_im c11_01_im 00195 #define c32_21_re c12_01_re 00196 #define c32_21_im c12_01_im 00197 #define c20_22_re c00_02_re 00198 #define c20_22_im c00_02_im 00199 #define c21_22_re c01_02_re 00200 #define c21_22_im c01_02_im 00201 #define c22_22_re c02_02_re 00202 #define c30_22_re c10_02_re 00203 #define c30_22_im c10_02_im 00204 #define c31_22_re c11_02_re 00205 #define c31_22_im c11_02_im 00206 #define c32_22_re c12_02_re 00207 #define c32_22_im c12_02_im 00208 #define c20_30_re c00_10_re 00209 #define c20_30_im c00_10_im 00210 #define c21_30_re c01_10_re 00211 #define c21_30_im c01_10_im 00212 #define c22_30_re c02_10_re 00213 #define c22_30_im c02_10_im 00214 #define c30_30_re c10_10_re 00215 #define c31_30_re c11_10_re 00216 #define c31_30_im c11_10_im 00217 #define c32_30_re c12_10_re 00218 #define c32_30_im c12_10_im 00219 #define c20_31_re c00_11_re 00220 #define c20_31_im c00_11_im 00221 #define c21_31_re c01_11_re 00222 #define c21_31_im c01_11_im 00223 #define c22_31_re c02_11_re 00224 #define c22_31_im c02_11_im 00225 #define c30_31_re c10_11_re 00226 #define c30_31_im c10_11_im 00227 #define c31_31_re c11_11_re 00228 #define c32_31_re c12_11_re 00229 #define c32_31_im c12_11_im 00230 #define c20_32_re c00_12_re 00231 #define c20_32_im c00_12_im 00232 #define c21_32_re c01_12_re 00233 #define c21_32_im c01_12_im 00234 #define c22_32_re c02_12_re 00235 #define c22_32_im c02_12_im 00236 #define c30_32_re c10_12_re 00237 #define c30_32_im c10_12_im 00238 #define c31_32_re c11_12_re 00239 #define c31_32_im c11_12_im 00240 #define c32_32_re c12_12_re 00241 00242 // output spinor 00243 VOLATILE spinorFloat o00_re; 00244 VOLATILE spinorFloat o00_im; 00245 VOLATILE spinorFloat o01_re; 00246 VOLATILE spinorFloat o01_im; 00247 VOLATILE spinorFloat o02_re; 00248 VOLATILE spinorFloat o02_im; 00249 VOLATILE spinorFloat o10_re; 00250 VOLATILE spinorFloat o10_im; 00251 VOLATILE spinorFloat o11_re; 00252 VOLATILE spinorFloat o11_im; 00253 VOLATILE spinorFloat o12_re; 00254 VOLATILE spinorFloat o12_im; 00255 VOLATILE spinorFloat o20_re; 00256 VOLATILE spinorFloat o20_im; 00257 VOLATILE spinorFloat o21_re; 00258 VOLATILE spinorFloat o21_im; 00259 VOLATILE spinorFloat o22_re; 00260 VOLATILE spinorFloat o22_im; 00261 VOLATILE spinorFloat o30_re; 00262 VOLATILE spinorFloat o30_im; 00263 VOLATILE spinorFloat o31_re; 00264 VOLATILE spinorFloat o31_im; 00265 VOLATILE spinorFloat o32_re; 00266 VOLATILE spinorFloat o32_im; 00267 00268 #include "read_clover.h" 00269 #include "io_spinor.h" 00270 00271 int sid = blockIdx.x*blockDim.x + threadIdx.x; 00272 if (sid >= param.threads) return; 00273 00274 // read spinor from device memory 00275 READ_SPINOR(SPINORTEX, sp_stride, sid, sid); 00276 { 00277 00278 // change to chiral basis 00279 { 00280 spinorFloat a00_re = -i10_re - i30_re; 00281 spinorFloat a00_im = -i10_im - i30_im; 00282 spinorFloat a10_re = i00_re + i20_re; 00283 spinorFloat a10_im = i00_im + i20_im; 00284 spinorFloat a20_re = -i10_re + i30_re; 00285 spinorFloat a20_im = -i10_im + i30_im; 00286 spinorFloat a30_re = i00_re - i20_re; 00287 spinorFloat a30_im = i00_im - i20_im; 00288 00289 o00_re = a00_re; o00_im = a00_im; 00290 o10_re = a10_re; o10_im = a10_im; 00291 o20_re = a20_re; o20_im = a20_im; 00292 o30_re = a30_re; o30_im = a30_im; 00293 } 00294 00295 { 00296 spinorFloat a01_re = -i11_re - i31_re; 00297 spinorFloat a01_im = -i11_im - i31_im; 00298 spinorFloat a11_re = i01_re + i21_re; 00299 spinorFloat a11_im = i01_im + i21_im; 00300 spinorFloat a21_re = -i11_re + i31_re; 00301 spinorFloat a21_im = -i11_im + i31_im; 00302 spinorFloat a31_re = i01_re - i21_re; 00303 spinorFloat a31_im = i01_im - i21_im; 00304 00305 o01_re = a01_re; o01_im = a01_im; 00306 o11_re = a11_re; o11_im = a11_im; 00307 o21_re = a21_re; o21_im = a21_im; 00308 o31_re = a31_re; o31_im = a31_im; 00309 } 00310 00311 { 00312 spinorFloat a02_re = -i12_re - i32_re; 00313 spinorFloat a02_im = -i12_im - i32_im; 00314 spinorFloat a12_re = i02_re + i22_re; 00315 spinorFloat a12_im = i02_im + i22_im; 00316 spinorFloat a22_re = -i12_re + i32_re; 00317 spinorFloat a22_im = -i12_im + i32_im; 00318 spinorFloat a32_re = i02_re - i22_re; 00319 spinorFloat a32_im = i02_im - i22_im; 00320 00321 o02_re = a02_re; o02_im = a02_im; 00322 o12_re = a12_re; o12_im = a12_im; 00323 o22_re = a22_re; o22_im = a22_im; 00324 o32_re = a32_re; o32_im = a32_im; 00325 } 00326 00327 // apply first chiral block 00328 { 00329 READ_CLOVER(CLOVERTEX, 0) 00330 00331 spinorFloat a00_re = 0; spinorFloat a00_im = 0; 00332 spinorFloat a01_re = 0; spinorFloat a01_im = 0; 00333 spinorFloat a02_re = 0; spinorFloat a02_im = 0; 00334 spinorFloat a10_re = 0; spinorFloat a10_im = 0; 00335 spinorFloat a11_re = 0; spinorFloat a11_im = 0; 00336 spinorFloat a12_re = 0; spinorFloat a12_im = 0; 00337 00338 a00_re += c00_00_re * o00_re; 00339 a00_im += c00_00_re * o00_im; 00340 a00_re += c00_01_re * o01_re; 00341 a00_re -= c00_01_im * o01_im; 00342 a00_im += c00_01_re * o01_im; 00343 a00_im += c00_01_im * o01_re; 00344 a00_re += c00_02_re * o02_re; 00345 a00_re -= c00_02_im * o02_im; 00346 a00_im += c00_02_re * o02_im; 00347 a00_im += c00_02_im * o02_re; 00348 a00_re += c00_10_re * o10_re; 00349 a00_re -= c00_10_im * o10_im; 00350 a00_im += c00_10_re * o10_im; 00351 a00_im += c00_10_im * o10_re; 00352 a00_re += c00_11_re * o11_re; 00353 a00_re -= c00_11_im * o11_im; 00354 a00_im += c00_11_re * o11_im; 00355 a00_im += c00_11_im * o11_re; 00356 a00_re += c00_12_re * o12_re; 00357 a00_re -= c00_12_im * o12_im; 00358 a00_im += c00_12_re * o12_im; 00359 a00_im += c00_12_im * o12_re; 00360 00361 a01_re += c01_00_re * o00_re; 00362 a01_re -= c01_00_im * o00_im; 00363 a01_im += c01_00_re * o00_im; 00364 a01_im += c01_00_im * o00_re; 00365 a01_re += c01_01_re * o01_re; 00366 a01_im += c01_01_re * o01_im; 00367 a01_re += c01_02_re * o02_re; 00368 a01_re -= c01_02_im * o02_im; 00369 a01_im += c01_02_re * o02_im; 00370 a01_im += c01_02_im * o02_re; 00371 a01_re += c01_10_re * o10_re; 00372 a01_re -= c01_10_im * o10_im; 00373 a01_im += c01_10_re * o10_im; 00374 a01_im += c01_10_im * o10_re; 00375 a01_re += c01_11_re * o11_re; 00376 a01_re -= c01_11_im * o11_im; 00377 a01_im += c01_11_re * o11_im; 00378 a01_im += c01_11_im * o11_re; 00379 a01_re += c01_12_re * o12_re; 00380 a01_re -= c01_12_im * o12_im; 00381 a01_im += c01_12_re * o12_im; 00382 a01_im += c01_12_im * o12_re; 00383 00384 a02_re += c02_00_re * o00_re; 00385 a02_re -= c02_00_im * o00_im; 00386 a02_im += c02_00_re * o00_im; 00387 a02_im += c02_00_im * o00_re; 00388 a02_re += c02_01_re * o01_re; 00389 a02_re -= c02_01_im * o01_im; 00390 a02_im += c02_01_re * o01_im; 00391 a02_im += c02_01_im * o01_re; 00392 a02_re += c02_02_re * o02_re; 00393 a02_im += c02_02_re * o02_im; 00394 a02_re += c02_10_re * o10_re; 00395 a02_re -= c02_10_im * o10_im; 00396 a02_im += c02_10_re * o10_im; 00397 a02_im += c02_10_im * o10_re; 00398 a02_re += c02_11_re * o11_re; 00399 a02_re -= c02_11_im * o11_im; 00400 a02_im += c02_11_re * o11_im; 00401 a02_im += c02_11_im * o11_re; 00402 a02_re += c02_12_re * o12_re; 00403 a02_re -= c02_12_im * o12_im; 00404 a02_im += c02_12_re * o12_im; 00405 a02_im += c02_12_im * o12_re; 00406 00407 a10_re += c10_00_re * o00_re; 00408 a10_re -= c10_00_im * o00_im; 00409 a10_im += c10_00_re * o00_im; 00410 a10_im += c10_00_im * o00_re; 00411 a10_re += c10_01_re * o01_re; 00412 a10_re -= c10_01_im * o01_im; 00413 a10_im += c10_01_re * o01_im; 00414 a10_im += c10_01_im * o01_re; 00415 a10_re += c10_02_re * o02_re; 00416 a10_re -= c10_02_im * o02_im; 00417 a10_im += c10_02_re * o02_im; 00418 a10_im += c10_02_im * o02_re; 00419 a10_re += c10_10_re * o10_re; 00420 a10_im += c10_10_re * o10_im; 00421 a10_re += c10_11_re * o11_re; 00422 a10_re -= c10_11_im * o11_im; 00423 a10_im += c10_11_re * o11_im; 00424 a10_im += c10_11_im * o11_re; 00425 a10_re += c10_12_re * o12_re; 00426 a10_re -= c10_12_im * o12_im; 00427 a10_im += c10_12_re * o12_im; 00428 a10_im += c10_12_im * o12_re; 00429 00430 a11_re += c11_00_re * o00_re; 00431 a11_re -= c11_00_im * o00_im; 00432 a11_im += c11_00_re * o00_im; 00433 a11_im += c11_00_im * o00_re; 00434 a11_re += c11_01_re * o01_re; 00435 a11_re -= c11_01_im * o01_im; 00436 a11_im += c11_01_re * o01_im; 00437 a11_im += c11_01_im * o01_re; 00438 a11_re += c11_02_re * o02_re; 00439 a11_re -= c11_02_im * o02_im; 00440 a11_im += c11_02_re * o02_im; 00441 a11_im += c11_02_im * o02_re; 00442 a11_re += c11_10_re * o10_re; 00443 a11_re -= c11_10_im * o10_im; 00444 a11_im += c11_10_re * o10_im; 00445 a11_im += c11_10_im * o10_re; 00446 a11_re += c11_11_re * o11_re; 00447 a11_im += c11_11_re * o11_im; 00448 a11_re += c11_12_re * o12_re; 00449 a11_re -= c11_12_im * o12_im; 00450 a11_im += c11_12_re * o12_im; 00451 a11_im += c11_12_im * o12_re; 00452 00453 a12_re += c12_00_re * o00_re; 00454 a12_re -= c12_00_im * o00_im; 00455 a12_im += c12_00_re * o00_im; 00456 a12_im += c12_00_im * o00_re; 00457 a12_re += c12_01_re * o01_re; 00458 a12_re -= c12_01_im * o01_im; 00459 a12_im += c12_01_re * o01_im; 00460 a12_im += c12_01_im * o01_re; 00461 a12_re += c12_02_re * o02_re; 00462 a12_re -= c12_02_im * o02_im; 00463 a12_im += c12_02_re * o02_im; 00464 a12_im += c12_02_im * o02_re; 00465 a12_re += c12_10_re * o10_re; 00466 a12_re -= c12_10_im * o10_im; 00467 a12_im += c12_10_re * o10_im; 00468 a12_im += c12_10_im * o10_re; 00469 a12_re += c12_11_re * o11_re; 00470 a12_re -= c12_11_im * o11_im; 00471 a12_im += c12_11_re * o11_im; 00472 a12_im += c12_11_im * o11_re; 00473 a12_re += c12_12_re * o12_re; 00474 a12_im += c12_12_re * o12_im; 00475 00476 o00_re = a00_re; o00_im = a00_im; 00477 o01_re = a01_re; o01_im = a01_im; 00478 o02_re = a02_re; o02_im = a02_im; 00479 o10_re = a10_re; o10_im = a10_im; 00480 o11_re = a11_re; o11_im = a11_im; 00481 o12_re = a12_re; o12_im = a12_im; 00482 00483 } 00484 00485 // apply second chiral block 00486 { 00487 READ_CLOVER(CLOVERTEX, 1) 00488 00489 spinorFloat a20_re = 0; spinorFloat a20_im = 0; 00490 spinorFloat a21_re = 0; spinorFloat a21_im = 0; 00491 spinorFloat a22_re = 0; spinorFloat a22_im = 0; 00492 spinorFloat a30_re = 0; spinorFloat a30_im = 0; 00493 spinorFloat a31_re = 0; spinorFloat a31_im = 0; 00494 spinorFloat a32_re = 0; spinorFloat a32_im = 0; 00495 00496 a20_re += c20_20_re * o20_re; 00497 a20_im += c20_20_re * o20_im; 00498 a20_re += c20_21_re * o21_re; 00499 a20_re -= c20_21_im * o21_im; 00500 a20_im += c20_21_re * o21_im; 00501 a20_im += c20_21_im * o21_re; 00502 a20_re += c20_22_re * o22_re; 00503 a20_re -= c20_22_im * o22_im; 00504 a20_im += c20_22_re * o22_im; 00505 a20_im += c20_22_im * o22_re; 00506 a20_re += c20_30_re * o30_re; 00507 a20_re -= c20_30_im * o30_im; 00508 a20_im += c20_30_re * o30_im; 00509 a20_im += c20_30_im * o30_re; 00510 a20_re += c20_31_re * o31_re; 00511 a20_re -= c20_31_im * o31_im; 00512 a20_im += c20_31_re * o31_im; 00513 a20_im += c20_31_im * o31_re; 00514 a20_re += c20_32_re * o32_re; 00515 a20_re -= c20_32_im * o32_im; 00516 a20_im += c20_32_re * o32_im; 00517 a20_im += c20_32_im * o32_re; 00518 00519 a21_re += c21_20_re * o20_re; 00520 a21_re -= c21_20_im * o20_im; 00521 a21_im += c21_20_re * o20_im; 00522 a21_im += c21_20_im * o20_re; 00523 a21_re += c21_21_re * o21_re; 00524 a21_im += c21_21_re * o21_im; 00525 a21_re += c21_22_re * o22_re; 00526 a21_re -= c21_22_im * o22_im; 00527 a21_im += c21_22_re * o22_im; 00528 a21_im += c21_22_im * o22_re; 00529 a21_re += c21_30_re * o30_re; 00530 a21_re -= c21_30_im * o30_im; 00531 a21_im += c21_30_re * o30_im; 00532 a21_im += c21_30_im * o30_re; 00533 a21_re += c21_31_re * o31_re; 00534 a21_re -= c21_31_im * o31_im; 00535 a21_im += c21_31_re * o31_im; 00536 a21_im += c21_31_im * o31_re; 00537 a21_re += c21_32_re * o32_re; 00538 a21_re -= c21_32_im * o32_im; 00539 a21_im += c21_32_re * o32_im; 00540 a21_im += c21_32_im * o32_re; 00541 00542 a22_re += c22_20_re * o20_re; 00543 a22_re -= c22_20_im * o20_im; 00544 a22_im += c22_20_re * o20_im; 00545 a22_im += c22_20_im * o20_re; 00546 a22_re += c22_21_re * o21_re; 00547 a22_re -= c22_21_im * o21_im; 00548 a22_im += c22_21_re * o21_im; 00549 a22_im += c22_21_im * o21_re; 00550 a22_re += c22_22_re * o22_re; 00551 a22_im += c22_22_re * o22_im; 00552 a22_re += c22_30_re * o30_re; 00553 a22_re -= c22_30_im * o30_im; 00554 a22_im += c22_30_re * o30_im; 00555 a22_im += c22_30_im * o30_re; 00556 a22_re += c22_31_re * o31_re; 00557 a22_re -= c22_31_im * o31_im; 00558 a22_im += c22_31_re * o31_im; 00559 a22_im += c22_31_im * o31_re; 00560 a22_re += c22_32_re * o32_re; 00561 a22_re -= c22_32_im * o32_im; 00562 a22_im += c22_32_re * o32_im; 00563 a22_im += c22_32_im * o32_re; 00564 00565 a30_re += c30_20_re * o20_re; 00566 a30_re -= c30_20_im * o20_im; 00567 a30_im += c30_20_re * o20_im; 00568 a30_im += c30_20_im * o20_re; 00569 a30_re += c30_21_re * o21_re; 00570 a30_re -= c30_21_im * o21_im; 00571 a30_im += c30_21_re * o21_im; 00572 a30_im += c30_21_im * o21_re; 00573 a30_re += c30_22_re * o22_re; 00574 a30_re -= c30_22_im * o22_im; 00575 a30_im += c30_22_re * o22_im; 00576 a30_im += c30_22_im * o22_re; 00577 a30_re += c30_30_re * o30_re; 00578 a30_im += c30_30_re * o30_im; 00579 a30_re += c30_31_re * o31_re; 00580 a30_re -= c30_31_im * o31_im; 00581 a30_im += c30_31_re * o31_im; 00582 a30_im += c30_31_im * o31_re; 00583 a30_re += c30_32_re * o32_re; 00584 a30_re -= c30_32_im * o32_im; 00585 a30_im += c30_32_re * o32_im; 00586 a30_im += c30_32_im * o32_re; 00587 00588 a31_re += c31_20_re * o20_re; 00589 a31_re -= c31_20_im * o20_im; 00590 a31_im += c31_20_re * o20_im; 00591 a31_im += c31_20_im * o20_re; 00592 a31_re += c31_21_re * o21_re; 00593 a31_re -= c31_21_im * o21_im; 00594 a31_im += c31_21_re * o21_im; 00595 a31_im += c31_21_im * o21_re; 00596 a31_re += c31_22_re * o22_re; 00597 a31_re -= c31_22_im * o22_im; 00598 a31_im += c31_22_re * o22_im; 00599 a31_im += c31_22_im * o22_re; 00600 a31_re += c31_30_re * o30_re; 00601 a31_re -= c31_30_im * o30_im; 00602 a31_im += c31_30_re * o30_im; 00603 a31_im += c31_30_im * o30_re; 00604 a31_re += c31_31_re * o31_re; 00605 a31_im += c31_31_re * o31_im; 00606 a31_re += c31_32_re * o32_re; 00607 a31_re -= c31_32_im * o32_im; 00608 a31_im += c31_32_re * o32_im; 00609 a31_im += c31_32_im * o32_re; 00610 00611 a32_re += c32_20_re * o20_re; 00612 a32_re -= c32_20_im * o20_im; 00613 a32_im += c32_20_re * o20_im; 00614 a32_im += c32_20_im * o20_re; 00615 a32_re += c32_21_re * o21_re; 00616 a32_re -= c32_21_im * o21_im; 00617 a32_im += c32_21_re * o21_im; 00618 a32_im += c32_21_im * o21_re; 00619 a32_re += c32_22_re * o22_re; 00620 a32_re -= c32_22_im * o22_im; 00621 a32_im += c32_22_re * o22_im; 00622 a32_im += c32_22_im * o22_re; 00623 a32_re += c32_30_re * o30_re; 00624 a32_re -= c32_30_im * o30_im; 00625 a32_im += c32_30_re * o30_im; 00626 a32_im += c32_30_im * o30_re; 00627 a32_re += c32_31_re * o31_re; 00628 a32_re -= c32_31_im * o31_im; 00629 a32_im += c32_31_re * o31_im; 00630 a32_im += c32_31_im * o31_re; 00631 a32_re += c32_32_re * o32_re; 00632 a32_im += c32_32_re * o32_im; 00633 00634 o20_re = a20_re; o20_im = a20_im; 00635 o21_re = a21_re; o21_im = a21_im; 00636 o22_re = a22_re; o22_im = a22_im; 00637 o30_re = a30_re; o30_im = a30_im; 00638 o31_re = a31_re; o31_im = a31_im; 00639 o32_re = a32_re; o32_im = a32_im; 00640 00641 } 00642 00643 // change back from chiral basis 00644 // (note: required factor of 1/2 is included in clover term normalization) 00645 { 00646 spinorFloat a00_re = o10_re + o30_re; 00647 spinorFloat a00_im = o10_im + o30_im; 00648 spinorFloat a10_re = -o00_re - o20_re; 00649 spinorFloat a10_im = -o00_im - o20_im; 00650 spinorFloat a20_re = o10_re - o30_re; 00651 spinorFloat a20_im = o10_im - o30_im; 00652 spinorFloat a30_re = -o00_re + o20_re; 00653 spinorFloat a30_im = -o00_im + o20_im; 00654 00655 o00_re = a00_re; o00_im = a00_im; 00656 o10_re = a10_re; o10_im = a10_im; 00657 o20_re = a20_re; o20_im = a20_im; 00658 o30_re = a30_re; o30_im = a30_im; 00659 } 00660 00661 { 00662 spinorFloat a01_re = o11_re + o31_re; 00663 spinorFloat a01_im = o11_im + o31_im; 00664 spinorFloat a11_re = -o01_re - o21_re; 00665 spinorFloat a11_im = -o01_im - o21_im; 00666 spinorFloat a21_re = o11_re - o31_re; 00667 spinorFloat a21_im = o11_im - o31_im; 00668 spinorFloat a31_re = -o01_re + o21_re; 00669 spinorFloat a31_im = -o01_im + o21_im; 00670 00671 o01_re = a01_re; o01_im = a01_im; 00672 o11_re = a11_re; o11_im = a11_im; 00673 o21_re = a21_re; o21_im = a21_im; 00674 o31_re = a31_re; o31_im = a31_im; 00675 } 00676 00677 { 00678 spinorFloat a02_re = o12_re + o32_re; 00679 spinorFloat a02_im = o12_im + o32_im; 00680 spinorFloat a12_re = -o02_re - o22_re; 00681 spinorFloat a12_im = -o02_im - o22_im; 00682 spinorFloat a22_re = o12_re - o32_re; 00683 spinorFloat a22_im = o12_im - o32_im; 00684 spinorFloat a32_re = -o02_re + o22_re; 00685 spinorFloat a32_im = -o02_im + o22_im; 00686 00687 o02_re = a02_re; o02_im = a02_im; 00688 o12_re = a12_re; o12_im = a12_im; 00689 o22_re = a22_re; o22_im = a22_im; 00690 o32_re = a32_re; o32_im = a32_im; 00691 } 00692 00693 #ifdef DSLASH_XPAY 00694 00695 READ_ACCUM(ACCUMTEX, sp_stride) 00696 00697 #ifdef SPINOR_DOUBLE 00698 o00_re = a*o00_re + accum0.x; 00699 o00_im = a*o00_im + accum0.y; 00700 o01_re = a*o01_re + accum1.x; 00701 o01_im = a*o01_im + accum1.y; 00702 o02_re = a*o02_re + accum2.x; 00703 o02_im = a*o02_im + accum2.y; 00704 o10_re = a*o10_re + accum3.x; 00705 o10_im = a*o10_im + accum3.y; 00706 o11_re = a*o11_re + accum4.x; 00707 o11_im = a*o11_im + accum4.y; 00708 o12_re = a*o12_re + accum5.x; 00709 o12_im = a*o12_im + accum5.y; 00710 o20_re = a*o20_re + accum6.x; 00711 o20_im = a*o20_im + accum6.y; 00712 o21_re = a*o21_re + accum7.x; 00713 o21_im = a*o21_im + accum7.y; 00714 o22_re = a*o22_re + accum8.x; 00715 o22_im = a*o22_im + accum8.y; 00716 o30_re = a*o30_re + accum9.x; 00717 o30_im = a*o30_im + accum9.y; 00718 o31_re = a*o31_re + accum10.x; 00719 o31_im = a*o31_im + accum10.y; 00720 o32_re = a*o32_re + accum11.x; 00721 o32_im = a*o32_im + accum11.y; 00722 #else 00723 o00_re = a*o00_re + accum0.x; 00724 o00_im = a*o00_im + accum0.y; 00725 o01_re = a*o01_re + accum0.z; 00726 o01_im = a*o01_im + accum0.w; 00727 o02_re = a*o02_re + accum1.x; 00728 o02_im = a*o02_im + accum1.y; 00729 o10_re = a*o10_re + accum1.z; 00730 o10_im = a*o10_im + accum1.w; 00731 o11_re = a*o11_re + accum2.x; 00732 o11_im = a*o11_im + accum2.y; 00733 o12_re = a*o12_re + accum2.z; 00734 o12_im = a*o12_im + accum2.w; 00735 o20_re = a*o20_re + accum3.x; 00736 o20_im = a*o20_im + accum3.y; 00737 o21_re = a*o21_re + accum3.z; 00738 o21_im = a*o21_im + accum3.w; 00739 o22_re = a*o22_re + accum4.x; 00740 o22_im = a*o22_im + accum4.y; 00741 o30_re = a*o30_re + accum4.z; 00742 o30_im = a*o30_im + accum4.w; 00743 o31_re = a*o31_re + accum5.x; 00744 o31_im = a*o31_im + accum5.y; 00745 o32_re = a*o32_re + accum5.z; 00746 o32_im = a*o32_im + accum5.w; 00747 #endif // SPINOR_DOUBLE 00748 00749 #endif // DSLASH_XPAY 00750 } 00751 00752 // write spinor field back to device memory 00753 WRITE_SPINOR(sp_stride); 00754 00755 // undefine to prevent warning when precision is changed 00756 #undef spinorFloat 00757 #undef i00_re 00758 #undef i00_im 00759 #undef i01_re 00760 #undef i01_im 00761 #undef i02_re 00762 #undef i02_im 00763 #undef i10_re 00764 #undef i10_im 00765 #undef i11_re 00766 #undef i11_im 00767 #undef i12_re 00768 #undef i12_im 00769 #undef i20_re 00770 #undef i20_im 00771 #undef i21_re 00772 #undef i21_im 00773 #undef i22_re 00774 #undef i22_im 00775 #undef i30_re 00776 #undef i30_im 00777 #undef i31_re 00778 #undef i31_im 00779 #undef i32_re 00780 #undef i32_im 00781 00782 #undef c00_00_re 00783 #undef c01_01_re 00784 #undef c02_02_re 00785 #undef c10_10_re 00786 #undef c11_11_re 00787 #undef c12_12_re 00788 #undef c01_00_re 00789 #undef c01_00_im 00790 #undef c02_00_re 00791 #undef c02_00_im 00792 #undef c10_00_re 00793 #undef c10_00_im 00794 #undef c11_00_re 00795 #undef c11_00_im 00796 #undef c12_00_re 00797 #undef c12_00_im 00798 #undef c02_01_re 00799 #undef c02_01_im 00800 #undef c10_01_re 00801 #undef c10_01_im 00802 #undef c11_01_re 00803 #undef c11_01_im 00804 #undef c12_01_re 00805 #undef c12_01_im 00806 #undef c10_02_re 00807 #undef c10_02_im 00808 #undef c11_02_re 00809 #undef c11_02_im 00810 #undef c12_02_re 00811 #undef c12_02_im 00812 #undef c11_10_re 00813 #undef c11_10_im 00814 #undef c12_10_re 00815 #undef c12_10_im 00816 #undef c12_11_re 00817 #undef c12_11_im 00818 00819 00820 #undef VOLATILE