QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tmc_gamma_core.h
Go to the documentation of this file.
1 #ifndef _TMC_GAMMA_CORE_H
2 #define _TMC_GAMMA_CORE_H
3 
4 //action of the operator b*(1 + i*a*gamma5)
5 //also uses macros from io_spinor.h
6 /*
7 __device__ float4 operator*(const float &x, const float4 &y)
8 {
9  float4 res;
10 
11  res.x = x * y.x;
12  res.y = x * y.y;
13  res.z = x * y.z;
14  res.w = x * y.w;
15 
16  return res;
17 }
18 
19 __device__ double2 operator*(const double &x, const double2 &y)
20 {
21  double2 res;
22 
23  res.x = x * y.x;
24  res.y = x * y.y;
25 
26  return res;
27 }
28 */
29 
// SPINORTEX is the first argument handed to fetch_double2() by the
// double-precision kernels below when they read the input spinor.
// With USE_TEXTURE_OBJECTS it is the inTex member of the kernel's DslashParam
// argument; otherwise it is the name spinorTexDouble (declared elsewhere,
// presumably a texture reference -- confirm).
30 #ifdef USE_TEXTURE_OBJECTS
31 #define SPINORTEX param.inTex
32 #else
33 #define SPINORTEX spinorTexDouble
34 #endif
35 
36 
// Element names for the clover term, double precision.
// Each name cdAB_CD_{re,im} is an alias for one component of the double2
// locals C0..C17 declared inside the kernels; in this block A,C are 0 or 1 and
// B,D are 0..2 (presumably spin and colour of row/column -- confirm against
// the APPLY_CLOVER_TWIST definition).
// Layout: the six diagonal entries are real only and occupy C0..C2; the
// fifteen entries below the diagonal are complex and occupy C3..C17
// (.x = real part, .y = imaginary part).
37 // first chiral block of clover term
38 //Double precision
39 #define cd00_00_re C0.x
40 #define cd01_01_re C0.y
41 #define cd02_02_re C1.x
42 #define cd10_10_re C1.y
43 #define cd11_11_re C2.x
44 #define cd12_12_re C2.y
45 #define cd01_00_re C3.x
46 #define cd01_00_im C3.y
47 #define cd02_00_re C4.x
48 #define cd02_00_im C4.y
49 #define cd10_00_re C5.x
50 #define cd10_00_im C5.y
51 #define cd11_00_re C6.x
52 #define cd11_00_im C6.y
53 #define cd12_00_re C7.x
54 #define cd12_00_im C7.y
55 #define cd02_01_re C8.x
56 #define cd02_01_im C8.y
57 #define cd10_01_re C9.x
58 #define cd10_01_im C9.y
59 #define cd11_01_re C10.x
60 #define cd11_01_im C10.y
61 #define cd12_01_re C11.x
62 #define cd12_01_im C11.y
63 #define cd10_02_re C12.x
64 #define cd10_02_im C12.y
65 #define cd11_02_re C13.x
66 #define cd11_02_im C13.y
67 #define cd12_02_re C14.x
68 #define cd12_02_im C14.y
69 #define cd11_10_re C15.x
70 #define cd11_10_im C15.y
71 #define cd12_10_re C16.x
72 #define cd12_10_im C16.y
73 #define cd12_11_re C17.x
74 #define cd12_11_im C17.y
75 
// Entries above the diagonal are not stored: each is defined as the complex
// conjugate of its transposed entry (same real part, negated imaginary part).
76 #define cd00_01_re (+cd01_00_re)
77 #define cd00_01_im (-cd01_00_im)
78 #define cd00_02_re (+cd02_00_re)
79 #define cd00_02_im (-cd02_00_im)
80 #define cd01_02_re (+cd02_01_re)
81 #define cd01_02_im (-cd02_01_im)
82 #define cd00_10_re (+cd10_00_re)
83 #define cd00_10_im (-cd10_00_im)
84 #define cd01_10_re (+cd10_01_re)
85 #define cd01_10_im (-cd10_01_im)
86 #define cd02_10_re (+cd10_02_re)
87 #define cd02_10_im (-cd10_02_im)
88 #define cd00_11_re (+cd11_00_re)
89 #define cd00_11_im (-cd11_00_im)
90 #define cd01_11_re (+cd11_01_re)
91 #define cd01_11_im (-cd11_01_im)
92 #define cd02_11_re (+cd11_02_re)
93 #define cd02_11_im (-cd11_02_im)
94 #define cd10_11_re (+cd11_10_re)
95 #define cd10_11_im (-cd11_10_im)
96 #define cd00_12_re (+cd12_00_re)
97 #define cd00_12_im (-cd12_00_im)
98 #define cd01_12_re (+cd12_01_re)
99 #define cd01_12_im (-cd12_01_im)
100 #define cd02_12_re (+cd12_02_re)
101 #define cd02_12_im (-cd12_02_im)
102 #define cd10_12_re (+cd12_10_re)
103 #define cd10_12_im (-cd12_10_im)
104 #define cd11_12_re (+cd12_11_re)
105 #define cd11_12_im (-cd12_11_im)
106 
// Every name with leading indices 2/3 is an alias of the first-block name with
// leading indices 0/1, so both blocks resolve to the same locals.
107 // second chiral block of clover term (aliases the first-block names, i.e. reuses C0,...,C17)
108 #define cd20_20_re cd00_00_re
109 #define cd21_20_re cd01_00_re
110 #define cd21_20_im cd01_00_im
111 #define cd22_20_re cd02_00_re
112 #define cd22_20_im cd02_00_im
113 #define cd30_20_re cd10_00_re
114 #define cd30_20_im cd10_00_im
115 #define cd31_20_re cd11_00_re
116 #define cd31_20_im cd11_00_im
117 #define cd32_20_re cd12_00_re
118 #define cd32_20_im cd12_00_im
119 #define cd20_21_re cd00_01_re
120 #define cd20_21_im cd00_01_im
121 #define cd21_21_re cd01_01_re
122 #define cd22_21_re cd02_01_re
123 #define cd22_21_im cd02_01_im
124 #define cd30_21_re cd10_01_re
125 #define cd30_21_im cd10_01_im
126 #define cd31_21_re cd11_01_re
127 #define cd31_21_im cd11_01_im
128 #define cd32_21_re cd12_01_re
129 #define cd32_21_im cd12_01_im
130 #define cd20_22_re cd00_02_re
131 #define cd20_22_im cd00_02_im
132 #define cd21_22_re cd01_02_re
133 #define cd21_22_im cd01_02_im
134 #define cd22_22_re cd02_02_re
135 #define cd30_22_re cd10_02_re
136 #define cd30_22_im cd10_02_im
137 #define cd31_22_re cd11_02_re
138 #define cd31_22_im cd11_02_im
139 #define cd32_22_re cd12_02_re
140 #define cd32_22_im cd12_02_im
141 #define cd20_30_re cd00_10_re
142 #define cd20_30_im cd00_10_im
143 #define cd21_30_re cd01_10_re
144 #define cd21_30_im cd01_10_im
145 #define cd22_30_re cd02_10_re
146 #define cd22_30_im cd02_10_im
147 #define cd30_30_re cd10_10_re
148 #define cd31_30_re cd11_10_re
149 #define cd31_30_im cd11_10_im
150 #define cd32_30_re cd12_10_re
151 #define cd32_30_im cd12_10_im
152 #define cd20_31_re cd00_11_re
153 #define cd20_31_im cd00_11_im
154 #define cd21_31_re cd01_11_re
155 #define cd21_31_im cd01_11_im
156 #define cd22_31_re cd02_11_re
157 #define cd22_31_im cd02_11_im
158 #define cd30_31_re cd10_11_re
159 #define cd30_31_im cd10_11_im
160 #define cd31_31_re cd11_11_re
161 #define cd32_31_re cd12_11_re
162 #define cd32_31_im cd12_11_im
163 #define cd20_32_re cd00_12_re
164 #define cd20_32_im cd00_12_im
165 #define cd21_32_re cd01_12_re
166 #define cd21_32_im cd01_12_im
167 #define cd22_32_re cd02_12_re
168 #define cd22_32_im cd02_12_im
169 #define cd30_32_re cd10_12_re
170 #define cd30_32_im cd10_12_im
171 #define cd31_32_re cd11_12_re
172 #define cd31_32_im cd11_12_im
173 #define cd32_32_re cd12_12_re
174 
// Element names for the first chiral block of the clover term as used by the
// float4-based (single and half precision) kernels. The names alias components
// of the float4 locals C0..C8: the six real diagonal entries fill C0.x..C1.y,
// and the fifteen complex below-diagonal entries follow as consecutive
// (re, im) pairs from C1.z to C8.w.
175 //Single-half precision
176 #define c00_00_re C0.x
177 #define c01_01_re C0.y
178 #define c02_02_re C0.z
179 #define c10_10_re C0.w
180 #define c11_11_re C1.x
181 #define c12_12_re C1.y
182 #define c01_00_re C1.z
183 #define c01_00_im C1.w
184 #define c02_00_re C2.x
185 #define c02_00_im C2.y
186 #define c10_00_re C2.z
187 #define c10_00_im C2.w
188 #define c11_00_re C3.x
189 #define c11_00_im C3.y
190 #define c12_00_re C3.z
191 #define c12_00_im C3.w
192 #define c02_01_re C4.x
193 #define c02_01_im C4.y
194 #define c10_01_re C4.z
195 #define c10_01_im C4.w
196 #define c11_01_re C5.x
197 #define c11_01_im C5.y
198 #define c12_01_re C5.z
199 #define c12_01_im C5.w
200 #define c10_02_re C6.x
201 #define c10_02_im C6.y
202 #define c11_02_re C6.z
203 #define c11_02_im C6.w
204 #define c12_02_re C7.x
205 #define c12_02_im C7.y
206 #define c11_10_re C7.z
207 #define c11_10_im C7.w
208 #define c12_10_re C8.x
209 #define c12_10_im C8.y
210 #define c12_11_re C8.z
211 #define c12_11_im C8.w
212 
// Above-diagonal entries: complex conjugate of the transposed stored entry.
213 #define c00_01_re (+c01_00_re)
214 #define c00_01_im (-c01_00_im)
215 #define c00_02_re (+c02_00_re)
216 #define c00_02_im (-c02_00_im)
217 #define c01_02_re (+c02_01_re)
218 #define c01_02_im (-c02_01_im)
219 #define c00_10_re (+c10_00_re)
220 #define c00_10_im (-c10_00_im)
221 #define c01_10_re (+c10_01_re)
222 #define c01_10_im (-c10_01_im)
223 #define c02_10_re (+c10_02_re)
224 #define c02_10_im (-c10_02_im)
225 #define c00_11_re (+c11_00_re)
226 #define c00_11_im (-c11_00_im)
227 #define c01_11_re (+c11_01_re)
228 #define c01_11_im (-c11_01_im)
229 #define c02_11_re (+c11_02_re)
230 #define c02_11_im (-c11_02_im)
231 #define c10_11_re (+c11_10_re)
232 #define c10_11_im (-c11_10_im)
233 #define c00_12_re (+c12_00_re)
234 #define c00_12_im (-c12_00_im)
235 #define c01_12_re (+c12_01_re)
236 #define c01_12_im (-c12_01_im)
237 #define c02_12_re (+c12_02_re)
238 #define c02_12_im (-c12_02_im)
239 #define c10_12_re (+c12_10_re)
240 #define c10_12_im (-c12_10_im)
241 #define c11_12_re (+c12_11_re)
242 #define c11_12_im (-c12_11_im)
243 
// Every name with leading indices 2/3 is an alias of the first-block name with
// leading indices 0/1, so both blocks resolve to the same locals.
244 // second chiral block of clover term (aliases the first-block names, i.e. reuses C0,...,C8)
245 #define c20_20_re c00_00_re
246 #define c21_20_re c01_00_re
247 #define c21_20_im c01_00_im
248 #define c22_20_re c02_00_re
249 #define c22_20_im c02_00_im
250 #define c30_20_re c10_00_re
251 #define c30_20_im c10_00_im
252 #define c31_20_re c11_00_re
253 #define c31_20_im c11_00_im
254 #define c32_20_re c12_00_re
255 #define c32_20_im c12_00_im
256 #define c20_21_re c00_01_re
257 #define c20_21_im c00_01_im
258 #define c21_21_re c01_01_re
259 #define c22_21_re c02_01_re
260 #define c22_21_im c02_01_im
261 #define c30_21_re c10_01_re
262 #define c30_21_im c10_01_im
263 #define c31_21_re c11_01_re
264 #define c31_21_im c11_01_im
265 #define c32_21_re c12_01_re
266 #define c32_21_im c12_01_im
267 #define c20_22_re c00_02_re
268 #define c20_22_im c00_02_im
269 #define c21_22_re c01_02_re
270 #define c21_22_im c01_02_im
271 #define c22_22_re c02_02_re
272 #define c30_22_re c10_02_re
273 #define c30_22_im c10_02_im
274 #define c31_22_re c11_02_re
275 #define c31_22_im c11_02_im
276 #define c32_22_re c12_02_re
277 #define c32_22_im c12_02_im
278 #define c20_30_re c00_10_re
279 #define c20_30_im c00_10_im
280 #define c21_30_re c01_10_re
281 #define c21_30_im c01_10_im
282 #define c22_30_re c02_10_re
283 #define c22_30_im c02_10_im
284 #define c30_30_re c10_10_re
285 #define c31_30_re c11_10_re
286 #define c31_30_im c11_10_im
287 #define c32_30_re c12_10_re
288 #define c32_30_im c12_10_im
289 #define c20_31_re c00_11_re
290 #define c20_31_im c00_11_im
291 #define c21_31_re c01_11_re
292 #define c21_31_im c01_11_im
293 #define c22_31_re c02_11_re
294 #define c22_31_im c02_11_im
295 #define c30_31_re c10_11_re
296 #define c30_31_im c10_11_im
297 #define c31_31_re c11_11_re
298 #define c32_31_re c12_11_re
299 #define c32_31_im c12_11_im
300 #define c20_32_re c00_12_re
301 #define c20_32_im c00_12_im
302 #define c21_32_re c01_12_re
303 #define c21_32_im c01_12_im
304 #define c22_32_re c02_12_re
305 #define c22_32_im c02_12_im
306 #define c30_32_re c10_12_re
307 #define c30_32_im c10_12_im
308 #define c31_32_re c11_12_re
309 #define c31_32_im c11_12_im
310 #define c32_32_re c12_12_re
311 
// Element names for the inverted clover term, double precision.
// The cdinv* names map onto exactly the same double2 components (C0..C17) as
// the cd* names, so the clover term and its inverse cannot both be held in
// these locals at the same time.
312 // first chiral block of inverted clover term (reuses C0,...,C17)
313 //Double-precision
314 #define cdinv00_00_re C0.x
315 #define cdinv01_01_re C0.y
316 #define cdinv02_02_re C1.x
317 #define cdinv10_10_re C1.y
318 #define cdinv11_11_re C2.x
319 #define cdinv12_12_re C2.y
320 #define cdinv01_00_re C3.x
321 #define cdinv01_00_im C3.y
322 #define cdinv02_00_re C4.x
323 #define cdinv02_00_im C4.y
324 #define cdinv10_00_re C5.x
325 #define cdinv10_00_im C5.y
326 #define cdinv11_00_re C6.x
327 #define cdinv11_00_im C6.y
328 #define cdinv12_00_re C7.x
329 #define cdinv12_00_im C7.y
330 #define cdinv02_01_re C8.x
331 #define cdinv02_01_im C8.y
332 #define cdinv10_01_re C9.x
333 #define cdinv10_01_im C9.y
334 #define cdinv11_01_re C10.x
335 #define cdinv11_01_im C10.y
336 #define cdinv12_01_re C11.x
337 #define cdinv12_01_im C11.y
338 #define cdinv10_02_re C12.x
339 #define cdinv10_02_im C12.y
340 #define cdinv11_02_re C13.x
341 #define cdinv11_02_im C13.y
342 #define cdinv12_02_re C14.x
343 #define cdinv12_02_im C14.y
344 #define cdinv11_10_re C15.x
345 #define cdinv11_10_im C15.y
346 #define cdinv12_10_re C16.x
347 #define cdinv12_10_im C16.y
348 #define cdinv12_11_re C17.x
349 #define cdinv12_11_im C17.y
350 
// Above-diagonal entries: complex conjugate of the transposed stored entry.
351 #define cdinv00_01_re (+cdinv01_00_re)
352 #define cdinv00_01_im (-cdinv01_00_im)
353 #define cdinv00_02_re (+cdinv02_00_re)
354 #define cdinv00_02_im (-cdinv02_00_im)
355 #define cdinv01_02_re (+cdinv02_01_re)
356 #define cdinv01_02_im (-cdinv02_01_im)
357 #define cdinv00_10_re (+cdinv10_00_re)
358 #define cdinv00_10_im (-cdinv10_00_im)
359 #define cdinv01_10_re (+cdinv10_01_re)
360 #define cdinv01_10_im (-cdinv10_01_im)
361 #define cdinv02_10_re (+cdinv10_02_re)
362 #define cdinv02_10_im (-cdinv10_02_im)
363 #define cdinv00_11_re (+cdinv11_00_re)
364 #define cdinv00_11_im (-cdinv11_00_im)
365 #define cdinv01_11_re (+cdinv11_01_re)
366 #define cdinv01_11_im (-cdinv11_01_im)
367 #define cdinv02_11_re (+cdinv11_02_re)
368 #define cdinv02_11_im (-cdinv11_02_im)
369 #define cdinv10_11_re (+cdinv11_10_re)
370 #define cdinv10_11_im (-cdinv11_10_im)
371 #define cdinv00_12_re (+cdinv12_00_re)
372 #define cdinv00_12_im (-cdinv12_00_im)
373 #define cdinv01_12_re (+cdinv12_01_re)
374 #define cdinv01_12_im (-cdinv12_01_im)
375 #define cdinv02_12_re (+cdinv12_02_re)
376 #define cdinv02_12_im (-cdinv12_02_im)
377 #define cdinv10_12_re (+cdinv12_10_re)
378 #define cdinv10_12_im (-cdinv12_10_im)
379 #define cdinv11_12_re (+cdinv12_11_re)
380 #define cdinv11_12_im (-cdinv12_11_im)
381 
// Every name with leading indices 2/3 is an alias of the first-block name with
// leading indices 0/1, so both blocks resolve to the same locals.
382 // second chiral block of inverted clover term (aliases the first-block names, i.e. reuses C0,...,C17)
383 #define cdinv20_20_re cdinv00_00_re
384 #define cdinv21_20_re cdinv01_00_re
385 #define cdinv21_20_im cdinv01_00_im
386 #define cdinv22_20_re cdinv02_00_re
387 #define cdinv22_20_im cdinv02_00_im
388 #define cdinv30_20_re cdinv10_00_re
389 #define cdinv30_20_im cdinv10_00_im
390 #define cdinv31_20_re cdinv11_00_re
391 #define cdinv31_20_im cdinv11_00_im
392 #define cdinv32_20_re cdinv12_00_re
393 #define cdinv32_20_im cdinv12_00_im
394 #define cdinv20_21_re cdinv00_01_re
395 #define cdinv20_21_im cdinv00_01_im
396 #define cdinv21_21_re cdinv01_01_re
397 #define cdinv22_21_re cdinv02_01_re
398 #define cdinv22_21_im cdinv02_01_im
399 #define cdinv30_21_re cdinv10_01_re
400 #define cdinv30_21_im cdinv10_01_im
401 #define cdinv31_21_re cdinv11_01_re
402 #define cdinv31_21_im cdinv11_01_im
403 #define cdinv32_21_re cdinv12_01_re
404 #define cdinv32_21_im cdinv12_01_im
405 #define cdinv20_22_re cdinv00_02_re
406 #define cdinv20_22_im cdinv00_02_im
407 #define cdinv21_22_re cdinv01_02_re
408 #define cdinv21_22_im cdinv01_02_im
409 #define cdinv22_22_re cdinv02_02_re
410 #define cdinv30_22_re cdinv10_02_re
411 #define cdinv30_22_im cdinv10_02_im
412 #define cdinv31_22_re cdinv11_02_re
413 #define cdinv31_22_im cdinv11_02_im
414 #define cdinv32_22_re cdinv12_02_re
415 #define cdinv32_22_im cdinv12_02_im
416 #define cdinv20_30_re cdinv00_10_re
417 #define cdinv20_30_im cdinv00_10_im
418 #define cdinv21_30_re cdinv01_10_re
419 #define cdinv21_30_im cdinv01_10_im
420 #define cdinv22_30_re cdinv02_10_re
421 #define cdinv22_30_im cdinv02_10_im
422 #define cdinv30_30_re cdinv10_10_re
423 #define cdinv31_30_re cdinv11_10_re
424 #define cdinv31_30_im cdinv11_10_im
425 #define cdinv32_30_re cdinv12_10_re
426 #define cdinv32_30_im cdinv12_10_im
427 #define cdinv20_31_re cdinv00_11_re
428 #define cdinv20_31_im cdinv00_11_im
429 #define cdinv21_31_re cdinv01_11_re
430 #define cdinv21_31_im cdinv01_11_im
431 #define cdinv22_31_re cdinv02_11_re
432 #define cdinv22_31_im cdinv02_11_im
433 #define cdinv30_31_re cdinv10_11_re
434 #define cdinv30_31_im cdinv10_11_im
435 #define cdinv31_31_re cdinv11_11_re
436 #define cdinv32_31_re cdinv12_11_re
437 #define cdinv32_31_im cdinv12_11_im
438 #define cdinv20_32_re cdinv00_12_re
439 #define cdinv20_32_im cdinv00_12_im
440 #define cdinv21_32_re cdinv01_12_re
441 #define cdinv21_32_im cdinv01_12_im
442 #define cdinv22_32_re cdinv02_12_re
443 #define cdinv22_32_im cdinv02_12_im
444 #define cdinv30_32_re cdinv10_12_re
445 #define cdinv30_32_im cdinv10_12_im
446 #define cdinv31_32_re cdinv11_12_re
447 #define cdinv31_32_im cdinv11_12_im
448 #define cdinv32_32_re cdinv12_12_re
449 
// Element names for the first chiral block of the inverted clover term as used
// by the float4-based (single and half precision) kernels. The cinv* names map
// onto exactly the same float4 components (C0..C8) as the c* names.
450 //Single-half precision
451 #define cinv00_00_re C0.x
452 #define cinv01_01_re C0.y
453 #define cinv02_02_re C0.z
454 #define cinv10_10_re C0.w
455 #define cinv11_11_re C1.x
456 #define cinv12_12_re C1.y
457 #define cinv01_00_re C1.z
458 #define cinv01_00_im C1.w
459 #define cinv02_00_re C2.x
460 #define cinv02_00_im C2.y
461 #define cinv10_00_re C2.z
462 #define cinv10_00_im C2.w
463 #define cinv11_00_re C3.x
464 #define cinv11_00_im C3.y
465 #define cinv12_00_re C3.z
466 #define cinv12_00_im C3.w
467 #define cinv02_01_re C4.x
468 #define cinv02_01_im C4.y
469 #define cinv10_01_re C4.z
470 #define cinv10_01_im C4.w
471 #define cinv11_01_re C5.x
472 #define cinv11_01_im C5.y
473 #define cinv12_01_re C5.z
474 #define cinv12_01_im C5.w
475 #define cinv10_02_re C6.x
476 #define cinv10_02_im C6.y
477 #define cinv11_02_re C6.z
478 #define cinv11_02_im C6.w
479 #define cinv12_02_re C7.x
480 #define cinv12_02_im C7.y
481 #define cinv11_10_re C7.z
482 #define cinv11_10_im C7.w
483 #define cinv12_10_re C8.x
484 #define cinv12_10_im C8.y
485 #define cinv12_11_re C8.z
486 #define cinv12_11_im C8.w
487 
// Above-diagonal entries: complex conjugate of the transposed stored entry.
488 #define cinv00_01_re (+cinv01_00_re)
489 #define cinv00_01_im (-cinv01_00_im)
490 #define cinv00_02_re (+cinv02_00_re)
491 #define cinv00_02_im (-cinv02_00_im)
492 #define cinv01_02_re (+cinv02_01_re)
493 #define cinv01_02_im (-cinv02_01_im)
494 #define cinv00_10_re (+cinv10_00_re)
495 #define cinv00_10_im (-cinv10_00_im)
496 #define cinv01_10_re (+cinv10_01_re)
497 #define cinv01_10_im (-cinv10_01_im)
498 #define cinv02_10_re (+cinv10_02_re)
499 #define cinv02_10_im (-cinv10_02_im)
500 #define cinv00_11_re (+cinv11_00_re)
501 #define cinv00_11_im (-cinv11_00_im)
502 #define cinv01_11_re (+cinv11_01_re)
503 #define cinv01_11_im (-cinv11_01_im)
504 #define cinv02_11_re (+cinv11_02_re)
505 #define cinv02_11_im (-cinv11_02_im)
506 #define cinv10_11_re (+cinv11_10_re)
507 #define cinv10_11_im (-cinv11_10_im)
508 #define cinv00_12_re (+cinv12_00_re)
509 #define cinv00_12_im (-cinv12_00_im)
510 #define cinv01_12_re (+cinv12_01_re)
511 #define cinv01_12_im (-cinv12_01_im)
512 #define cinv02_12_re (+cinv12_02_re)
513 #define cinv02_12_im (-cinv12_02_im)
514 #define cinv10_12_re (+cinv12_10_re)
515 #define cinv10_12_im (-cinv12_10_im)
516 #define cinv11_12_re (+cinv12_11_re)
517 #define cinv11_12_im (-cinv12_11_im)
518 
// Every name with leading indices 2/3 is an alias of the first-block name with
// leading indices 0/1, so both blocks resolve to the same locals.
519 // second chiral block of inverted clover term (aliases the first-block names, i.e. reuses C0,...,C8)
520 #define cinv20_20_re cinv00_00_re
521 #define cinv21_20_re cinv01_00_re
522 #define cinv21_20_im cinv01_00_im
523 #define cinv22_20_re cinv02_00_re
524 #define cinv22_20_im cinv02_00_im
525 #define cinv30_20_re cinv10_00_re
526 #define cinv30_20_im cinv10_00_im
527 #define cinv31_20_re cinv11_00_re
528 #define cinv31_20_im cinv11_00_im
529 #define cinv32_20_re cinv12_00_re
530 #define cinv32_20_im cinv12_00_im
531 #define cinv20_21_re cinv00_01_re
532 #define cinv20_21_im cinv00_01_im
533 #define cinv21_21_re cinv01_01_re
534 #define cinv22_21_re cinv02_01_re
535 #define cinv22_21_im cinv02_01_im
536 #define cinv30_21_re cinv10_01_re
537 #define cinv30_21_im cinv10_01_im
538 #define cinv31_21_re cinv11_01_re
539 #define cinv31_21_im cinv11_01_im
540 #define cinv32_21_re cinv12_01_re
541 #define cinv32_21_im cinv12_01_im
542 #define cinv20_22_re cinv00_02_re
543 #define cinv20_22_im cinv00_02_im
544 #define cinv21_22_re cinv01_02_re
545 #define cinv21_22_im cinv01_02_im
546 #define cinv22_22_re cinv02_02_re
547 #define cinv30_22_re cinv10_02_re
548 #define cinv30_22_im cinv10_02_im
549 #define cinv31_22_re cinv11_02_re
550 #define cinv31_22_im cinv11_02_im
551 #define cinv32_22_re cinv12_02_re
552 #define cinv32_22_im cinv12_02_im
553 #define cinv20_30_re cinv00_10_re
554 #define cinv20_30_im cinv00_10_im
555 #define cinv21_30_re cinv01_10_re
556 #define cinv21_30_im cinv01_10_im
557 #define cinv22_30_re cinv02_10_re
558 #define cinv22_30_im cinv02_10_im
559 #define cinv30_30_re cinv10_10_re
560 #define cinv31_30_re cinv11_10_re
561 #define cinv31_30_im cinv11_10_im
562 #define cinv32_30_re cinv12_10_re
563 #define cinv32_30_im cinv12_10_im
564 #define cinv20_31_re cinv00_11_re
565 #define cinv20_31_im cinv00_11_im
566 #define cinv21_31_re cinv01_11_re
567 #define cinv21_31_im cinv01_11_im
568 #define cinv22_31_re cinv02_11_re
569 #define cinv22_31_im cinv02_11_im
570 #define cinv30_31_re cinv10_11_re
571 #define cinv30_31_im cinv10_11_im
572 #define cinv31_31_re cinv11_11_re
573 #define cinv32_31_re cinv12_11_re
574 #define cinv32_31_im cinv12_11_im
575 #define cinv20_32_re cinv00_12_re
576 #define cinv20_32_im cinv00_12_im
577 #define cinv21_32_re cinv01_12_re
578 #define cinv21_32_im cinv01_12_im
579 #define cinv22_32_re cinv02_12_re
580 #define cinv22_32_im cinv02_12_im
581 #define cinv30_32_re cinv10_12_re
582 #define cinv30_32_im cinv10_12_im
583 #define cinv31_32_re cinv11_12_re
584 #define cinv31_32_im cinv11_12_im
585 #define cinv32_32_re cinv12_12_re
586 
587 #if (__COMPUTE_CAPABILITY__ >= 130)
588 
// Double-precision spinor aliases: Sab_re / Sab_im name the .x / .y members of
// the double2 locals I0..I11 that the double-precision kernels load from the
// input spinor (one double2 per complex component, twelve in total).
// The first index a runs 0..3 and the second b runs 0..2 (presumably spin and
// colour -- confirm against the macros that consume these names).
589 #define S00_re I0.x
590 #define S00_im I0.y
591 #define S01_re I1.x
592 #define S01_im I1.y
593 #define S02_re I2.x
594 #define S02_im I2.y
595 #define S10_re I3.x
596 #define S10_im I3.y
597 #define S11_re I4.x
598 #define S11_im I4.y
599 #define S12_re I5.x
600 #define S12_im I5.y
601 #define S20_re I6.x
602 #define S20_im I6.y
603 #define S21_re I7.x
604 #define S21_im I7.y
605 #define S22_re I8.x
606 #define S22_im I8.y
607 #define S30_re I9.x
608 #define S30_im I9.y
609 #define S31_re I10.x
610 #define S31_im I10.y
611 #define S32_re I11.x
612 #define S32_im I11.y
// spinorFloat is set to double for this section (it is #undef'd after the
// double-precision kernels and redefined as float further down).
613 #define spinorFloat double
614 
// Double-precision clover access configuration.
// TMCLOVERTEX / TM_INV_CLOVERTEX name the source of the clover term and of its
// inverse, and READ_CLOVER / ASSN_CLOVER select the matching read/assign
// macros (all defined elsewhere; presumably consumed by APPLY_CLOVER_TWIST and
// APPLY_CLOVER_TWIST_INV -- confirm):
//  - DIRECT_ACCESS_CLOVER or FERMI_NO_DBLE_TEX: the kernel pointer arguments
//    clover / cloverInv with the *_DOUBLE_STR macros;
//  - otherwise: param.cloverTex / param.cloverInvTex under USE_TEXTURE_OBJECTS,
//    else cloverTexDouble / cloverInvTexDouble, with the *_DOUBLE_TEX macros.
615 #if (defined DIRECT_ACCESS_CLOVER) || (defined FERMI_NO_DBLE_TEX)
616  #define TMCLOVERTEX clover
617  #define TM_INV_CLOVERTEX cloverInv
618  #define READ_CLOVER READ_CLOVER_DOUBLE_STR
619  #define ASSN_CLOVER ASSN_CLOVER_DOUBLE_STR
620 #else
621  #ifdef USE_TEXTURE_OBJECTS
622  #define TMCLOVERTEX (param.cloverTex)
623  #define TM_INV_CLOVERTEX (param.cloverInvTex)
624  #else
625  #define TMCLOVERTEX cloverTexDouble
626  #define TM_INV_CLOVERTEX cloverInvTexDouble
627  #endif
628  #define READ_CLOVER READ_CLOVER_DOUBLE_TEX
629  #define ASSN_CLOVER ASSN_CLOVER_DOUBLE_TEX
630 #endif
631 
// Flag defined only while the double-precision kernels are compiled; it is
// #undef'd again after them.
632 #define CLOVER_DOUBLE
633 
// Double-precision kernel applying the twisted-clover operator to a spinor.
//
// Each thread handles the single index sid = blockIdx.x*blockDim.x + threadIdx.x
// and returns immediately when sid >= param.threads.
//
//   spinor    output; twelve double2 values are written at sid + k*param.sp_stride
//   a         scalar forwarded to APPLY_CLOVER_TWIST
//   in        input; read directly only when FERMI_NO_DBLE_TEX is defined,
//             otherwise the input is fetched through SPINORTEX
//   param     supplies threads, sp_stride and (with texture objects) the textures
//   clover, cloverInv
//             referenced only through TMCLOVERTEX / TM_INV_CLOVERTEX when those
//             macros expand to these names
//   null, null2, cNorm, cNrm2
//             not referenced by name in this body
//
// The whole body is compiled out unless GPU_TWISTED_CLOVER_DIRAC is defined.
634 __global__ void twistCloverGamma5Kernel(double2 *spinor, float *null, double a, const double2 *in, const float *null2, DslashParam param,
635  const double2 *clover, const float *cNorm, const double2 *cloverInv, const float *cNrm2)
636 {
637 #ifdef GPU_TWISTED_CLOVER_DIRAC
638 
639  int sid = blockIdx.x*blockDim.x + threadIdx.x;
640  if (sid >= param.threads) return;
641 
 // Load the twelve complex components of the input spinor into I0..I11.
 // The S**_re/_im aliases refer to these locals by name, so the names must
 // not change.
642 #ifndef FERMI_NO_DBLE_TEX
643  double2 I0 = fetch_double2(SPINORTEX, sid + 0 * param.sp_stride);
644  double2 I1 = fetch_double2(SPINORTEX, sid + 1 * param.sp_stride);
645  double2 I2 = fetch_double2(SPINORTEX, sid + 2 * param.sp_stride);
646  double2 I3 = fetch_double2(SPINORTEX, sid + 3 * param.sp_stride);
647  double2 I4 = fetch_double2(SPINORTEX, sid + 4 * param.sp_stride);
648  double2 I5 = fetch_double2(SPINORTEX, sid + 5 * param.sp_stride);
649  double2 I6 = fetch_double2(SPINORTEX, sid + 6 * param.sp_stride);
650  double2 I7 = fetch_double2(SPINORTEX, sid + 7 * param.sp_stride);
651  double2 I8 = fetch_double2(SPINORTEX, sid + 8 * param.sp_stride);
652  double2 I9 = fetch_double2(SPINORTEX, sid + 9 * param.sp_stride);
653  double2 I10 = fetch_double2(SPINORTEX, sid + 10 * param.sp_stride);
654  double2 I11 = fetch_double2(SPINORTEX, sid + 11 * param.sp_stride);
655 #else
656  double2 I0 = in[sid + 0 * param.sp_stride];
657  double2 I1 = in[sid + 1 * param.sp_stride];
658  double2 I2 = in[sid + 2 * param.sp_stride];
659  double2 I3 = in[sid + 3 * param.sp_stride];
660  double2 I4 = in[sid + 4 * param.sp_stride];
661  double2 I5 = in[sid + 5 * param.sp_stride];
662  double2 I6 = in[sid + 6 * param.sp_stride];
663  double2 I7 = in[sid + 7 * param.sp_stride];
664  double2 I8 = in[sid + 8 * param.sp_stride];
665  double2 I9 = in[sid + 9 * param.sp_stride];
666  double2 I10 = in[sid + 10 * param.sp_stride];
667  double2 I11 = in[sid + 11 * param.sp_stride];
668 #endif
669 
 // Storage behind the cd**_re/_im aliases; declared uninitialized here
 // (presumably filled by the clover read inside the macro below -- confirm).
670  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17;
671 
672  //apply (Clover + i*a*gamma_5) to the input spinor
 // Macro defined elsewhere; presumably leaves its result in I0..I11 through the
 // S** aliases, since those locals are what gets stored below -- confirm.
673  APPLY_CLOVER_TWIST(cd, a, S);
674 
 // Store I0..I11 to the output, using the same indexing as the loads.
675  spinor[sid + 0 * param.sp_stride] = I0;
676  spinor[sid + 1 * param.sp_stride] = I1;
677  spinor[sid + 2 * param.sp_stride] = I2;
678  spinor[sid + 3 * param.sp_stride] = I3;
679  spinor[sid + 4 * param.sp_stride] = I4;
680  spinor[sid + 5 * param.sp_stride] = I5;
681  spinor[sid + 6 * param.sp_stride] = I6;
682  spinor[sid + 7 * param.sp_stride] = I7;
683  spinor[sid + 8 * param.sp_stride] = I8;
684  spinor[sid + 9 * param.sp_stride] = I9;
685  spinor[sid + 10 * param.sp_stride] = I10;
686  spinor[sid + 11 * param.sp_stride] = I11;
687 
688 #endif
689 }
690 
// Double-precision kernel applying the inverse twisted-clover operator to a
// spinor. The structure matches the double-precision twistCloverGamma5Kernel;
// the only difference is that APPLY_CLOVER_TWIST_INV is invoked with both the
// cd and cdinv alias prefixes.
//
// Each thread handles the single index sid = blockIdx.x*blockDim.x + threadIdx.x
// and returns immediately when sid >= param.threads.
//
//   spinor    output; twelve double2 values are written at sid + k*param.sp_stride
//   a         scalar forwarded to APPLY_CLOVER_TWIST_INV
//   in        input; read directly only when FERMI_NO_DBLE_TEX is defined,
//             otherwise the input is fetched through SPINORTEX
//   param     supplies threads, sp_stride and (with texture objects) the textures
//   clover, cloverInv
//             referenced only through TMCLOVERTEX / TM_INV_CLOVERTEX when those
//             macros expand to these names
//   null, null2, cNorm, cNrm2
//             not referenced by name in this body
//
// The whole body is compiled out unless GPU_TWISTED_CLOVER_DIRAC is defined.
691 __global__ void twistCloverGamma5InvKernel(double2 *spinor, float *null, double a, const double2 *in, const float *null2, DslashParam param,
692  const double2 *clover, const float *cNorm, const double2 *cloverInv, const float *cNrm2)
693 {
694 #ifdef GPU_TWISTED_CLOVER_DIRAC
695 
696  int sid = blockIdx.x*blockDim.x + threadIdx.x;
697  if (sid >= param.threads) return;
698 
 // Load the twelve complex components of the input spinor into I0..I11.
 // The S**_re/_im aliases refer to these locals by name, so the names must
 // not change.
699 #ifndef FERMI_NO_DBLE_TEX
700  double2 I0 = fetch_double2(SPINORTEX, sid + 0 * param.sp_stride);
701  double2 I1 = fetch_double2(SPINORTEX, sid + 1 * param.sp_stride);
702  double2 I2 = fetch_double2(SPINORTEX, sid + 2 * param.sp_stride);
703  double2 I3 = fetch_double2(SPINORTEX, sid + 3 * param.sp_stride);
704  double2 I4 = fetch_double2(SPINORTEX, sid + 4 * param.sp_stride);
705  double2 I5 = fetch_double2(SPINORTEX, sid + 5 * param.sp_stride);
706  double2 I6 = fetch_double2(SPINORTEX, sid + 6 * param.sp_stride);
707  double2 I7 = fetch_double2(SPINORTEX, sid + 7 * param.sp_stride);
708  double2 I8 = fetch_double2(SPINORTEX, sid + 8 * param.sp_stride);
709  double2 I9 = fetch_double2(SPINORTEX, sid + 9 * param.sp_stride);
710  double2 I10 = fetch_double2(SPINORTEX, sid + 10 * param.sp_stride);
711  double2 I11 = fetch_double2(SPINORTEX, sid + 11 * param.sp_stride);
712 #else
713  double2 I0 = in[sid + 0 * param.sp_stride];
714  double2 I1 = in[sid + 1 * param.sp_stride];
715  double2 I2 = in[sid + 2 * param.sp_stride];
716  double2 I3 = in[sid + 3 * param.sp_stride];
717  double2 I4 = in[sid + 4 * param.sp_stride];
718  double2 I5 = in[sid + 5 * param.sp_stride];
719  double2 I6 = in[sid + 6 * param.sp_stride];
720  double2 I7 = in[sid + 7 * param.sp_stride];
721  double2 I8 = in[sid + 8 * param.sp_stride];
722  double2 I9 = in[sid + 9 * param.sp_stride];
723  double2 I10 = in[sid + 10 * param.sp_stride];
724  double2 I11 = in[sid + 11 * param.sp_stride];
725 #endif
726 
 // Storage shared by the cd** and cdinv** aliases (both map onto C0..C17);
 // declared uninitialized here (presumably filled inside the macro below -- confirm).
727  double2 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17;
728 
729  //apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
 // Macro defined elsewhere; presumably leaves its result in I0..I11 through the
 // S** aliases, since those locals are what gets stored below -- confirm.
730  APPLY_CLOVER_TWIST_INV(cd, cdinv, a, S);
731 
 // Store I0..I11 to the output, using the same indexing as the loads.
732  spinor[sid + 0 * param.sp_stride] = I0;
733  spinor[sid + 1 * param.sp_stride] = I1;
734  spinor[sid + 2 * param.sp_stride] = I2;
735  spinor[sid + 3 * param.sp_stride] = I3;
736  spinor[sid + 4 * param.sp_stride] = I4;
737  spinor[sid + 5 * param.sp_stride] = I5;
738  spinor[sid + 6 * param.sp_stride] = I6;
739  spinor[sid + 7 * param.sp_stride] = I7;
740  spinor[sid + 8 * param.sp_stride] = I8;
741  spinor[sid + 9 * param.sp_stride] = I9;
742  spinor[sid + 10 * param.sp_stride] = I10;
743  spinor[sid + 11 * param.sp_stride] = I11;
744 
745 #endif
746 }
747 
// Drop the double-precision configuration and spinor aliases so that the
// single-precision section that follows can define the same names afresh.
748 #undef TMCLOVERTEX
749 #undef TM_INV_CLOVERTEX
750 #undef READ_CLOVER
751 #undef ASSN_CLOVER
752 #undef CLOVER_DOUBLE
753 
754 #undef S00_re
755 #undef S00_im
756 #undef S01_re
757 #undef S01_im
758 #undef S02_re
759 #undef S02_im
760 #undef S10_re
761 #undef S10_im
762 #undef S11_re
763 #undef S11_im
764 #undef S12_re
765 #undef S12_im
766 #undef S20_re
767 #undef S20_im
768 #undef S21_re
769 #undef S21_im
770 #undef S22_re
771 #undef S22_im
772 #undef S30_re
773 #undef S30_im
774 #undef S31_re
775 #undef S31_im
776 #undef S32_re
777 #undef S32_im
778 #undef spinorFloat
779 
780 #endif // (__COMPUTE_CAPABILITY__ >= 130)
781 
// Single-precision section: rebind SPINORTEX (param.inTex with texture
// objects, otherwise the name spinorTexSingle declared elsewhere).
782 #undef SPINORTEX
783 #ifdef USE_TEXTURE_OBJECTS
784 #define SPINORTEX param.inTex
785 #else
786 #define SPINORTEX spinorTexSingle
787 #endif
788 
// Spinor aliases for the float4-based kernels: the twelve complex components
// are packed two per float4 into the locals I0..I5
// (.x/.y = re/im of the first component, .z/.w = re/im of the second).
789 #define S00_re I0.x
790 #define S00_im I0.y
791 #define S01_re I0.z
792 #define S01_im I0.w
793 #define S02_re I1.x
794 #define S02_im I1.y
795 #define S10_re I1.z
796 #define S10_im I1.w
797 #define S11_re I2.x
798 #define S11_im I2.y
799 #define S12_re I2.z
800 #define S12_im I2.w
801 #define S20_re I3.x
802 #define S20_im I3.y
803 #define S21_re I3.z
804 #define S21_im I3.w
805 #define S22_re I4.x
806 #define S22_im I4.y
807 #define S30_re I4.z
808 #define S30_im I4.w
809 #define S31_re I5.x
810 #define S31_im I5.y
811 #define S32_re I5.z
812 #define S32_im I5.w
813 
// spinorFloat is set to float from here on.
814 #define spinorFloat float
815 
// Single-precision clover access configuration.
// With DIRECT_ACCESS_CLOVER the kernel pointer arguments clover / cloverInv
// are used together with READ_CLOVER_SINGLE / ASSN_CLOVER_SINGLE; otherwise the
// sources are param.cloverTex / param.cloverInvTex (texture objects) or
// cloverTexSingle / cloverInvTexSingle, together with the *_SINGLE_TEX macros.
// All of the right-hand-side names are defined elsewhere.
816 #ifdef DIRECT_ACCESS_CLOVER
817  #define TMCLOVERTEX clover
818  #define TM_INV_CLOVERTEX cloverInv
819  #define READ_CLOVER READ_CLOVER_SINGLE
820  #define ASSN_CLOVER ASSN_CLOVER_SINGLE
821 #else
822  #ifdef USE_TEXTURE_OBJECTS
823  #define TMCLOVERTEX (param.cloverTex)
824  #define TM_INV_CLOVERTEX (param.cloverInvTex)
825  #else
826  #define TMCLOVERTEX cloverTexSingle
827  #define TM_INV_CLOVERTEX cloverInvTexSingle
828  #endif
829  #define READ_CLOVER READ_CLOVER_SINGLE_TEX
830  #define ASSN_CLOVER ASSN_CLOVER_SINGLE_TEX
831 #endif
832 
// Single-precision kernel applying the twisted-clover operator to a spinor.
//
// Each thread handles the single index sid = blockIdx.x*blockDim.x + threadIdx.x
// and returns immediately when sid >= param.threads.
//
//   spinor    output; six float4 values are written at sid + k*param.sp_stride
//   a         scalar forwarded to APPLY_CLOVER_TWIST
//   param     supplies threads, sp_stride and (with texture objects) the textures
//   clover, cloverInv
//             referenced only through TMCLOVERTEX / TM_INV_CLOVERTEX when
//             DIRECT_ACCESS_CLOVER is defined
//   in, null, null2, cNorm, cNrm2
//             not referenced by name in this body (the input is always fetched
//             through SPINORTEX)
//
// The whole body is compiled out unless GPU_TWISTED_CLOVER_DIRAC is defined.
833 __global__ void twistCloverGamma5Kernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param,
834  const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
835 {
836 #ifdef GPU_TWISTED_CLOVER_DIRAC
837  int sid = blockIdx.x*blockDim.x + threadIdx.x;
838  if (sid >= param.threads) return;
839 
 // Load the input spinor as six float4 values; the S**_re/_im aliases refer to
 // I0..I5 by name, so the names must not change.
840  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
841  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
842  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
843  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
844  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
845  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);
846 
 // Storage behind the c**_re/_im aliases; declared uninitialized here
 // (presumably filled by the clover read inside the macro below -- confirm).
847  float4 C0, C1, C2, C3, C4, C5, C6, C7, C8;
848 
849  //apply (Clover + i*a*gamma_5) to the input spinor
 // Macro defined elsewhere; presumably leaves its result in I0..I5 -- confirm.
850  APPLY_CLOVER_TWIST(c, a, S);
851 
 // Store I0..I5 to the output, using the same indexing as the loads.
852  spinor[sid + 0 * param.sp_stride] = I0;
853  spinor[sid + 1 * param.sp_stride] = I1;
854  spinor[sid + 2 * param.sp_stride] = I2;
855  spinor[sid + 3 * param.sp_stride] = I3;
856  spinor[sid + 4 * param.sp_stride] = I4;
857  spinor[sid + 5 * param.sp_stride] = I5;
858 
859 #endif
860 }
861 
// Single-precision kernel applying the inverse twisted-clover operator to a
// spinor. The structure matches the single-precision twistCloverGamma5Kernel;
// the only difference is that APPLY_CLOVER_TWIST_INV is invoked with both the
// c and cinv alias prefixes.
//
// Each thread handles the single index sid = blockIdx.x*blockDim.x + threadIdx.x
// and returns immediately when sid >= param.threads.
//
//   spinor    output; six float4 values are written at sid + k*param.sp_stride
//   a         scalar forwarded to APPLY_CLOVER_TWIST_INV
//   param     supplies threads, sp_stride and (with texture objects) the textures
//   clover, cloverInv
//             referenced only through TMCLOVERTEX / TM_INV_CLOVERTEX when
//             DIRECT_ACCESS_CLOVER is defined
//   in, null, null2, cNorm, cNrm2
//             not referenced by name in this body (the input is always fetched
//             through SPINORTEX)
//
// The whole body is compiled out unless GPU_TWISTED_CLOVER_DIRAC is defined.
862 __global__ void twistCloverGamma5InvKernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param,
863  const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
864 {
865 #ifdef GPU_TWISTED_CLOVER_DIRAC
866  int sid = blockIdx.x*blockDim.x + threadIdx.x;
867  if (sid >= param.threads) return;
868 
 // Load the input spinor as six float4 values; the S**_re/_im aliases refer to
 // I0..I5 by name, so the names must not change.
869  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
870  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
871  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
872  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
873  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
874  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);
875 
 // Storage shared by the c** and cinv** aliases (both map onto C0..C8);
 // declared uninitialized here (presumably filled inside the macro below -- confirm).
876  float4 C0, C1, C2, C3, C4, C5, C6, C7, C8;
877 
878  //apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
 // Macro defined elsewhere; presumably leaves its result in I0..I5 -- confirm.
879  APPLY_CLOVER_TWIST_INV(c, cinv, a, S);
880 
 // Store I0..I5 to the output, using the same indexing as the loads.
881  spinor[sid + 0 * param.sp_stride] = I0;
882  spinor[sid + 1 * param.sp_stride] = I1;
883  spinor[sid + 2 * param.sp_stride] = I2;
884  spinor[sid + 3 * param.sp_stride] = I3;
885  spinor[sid + 4 * param.sp_stride] = I4;
886  spinor[sid + 5 * param.sp_stride] = I5;
887 
888 #endif
889 }
890 
// Drop the single-precision clover access configuration before the
// half-precision section redefines it.
891 #undef TMCLOVERTEX
892 #undef TM_INV_CLOVERTEX
893 #undef READ_CLOVER
894 #undef ASSN_CLOVER
895 
896 
// Half-precision section: rebind SPINORTEX and add SPINORTEXNORM, which the
// half-precision kernel fetches one float from per index sid and multiplies
// into the loaded spinor values.
897 #undef SPINORTEX
898 #ifdef USE_TEXTURE_OBJECTS
899 #define SPINORTEX param.inTex
900 #define SPINORTEXNORM param.inTexNorm
901 #else
902 #define SPINORTEX spinorTexHalf
903 #define SPINORTEXNORM spinorTexHalfNorm
904 #endif
905 
// Half-precision clover access configuration.
// Both branches must define TMCLOVERTEX and TM_INV_CLOVERTEX: they were
// #undef'd after the single-precision kernels, and the double- and
// single-precision DIRECT_ACCESS_CLOVER branches define them as
// clover / cloverInv. The original direct-access branch defined only
// CLOVERTEX, leaving the TM* names undefined in that configuration.
// CLOVERTEX is kept for backward compatibility.
906 #ifdef DIRECT_ACCESS_CLOVER
907  #define CLOVERTEX clover
 #define TMCLOVERTEX clover
 #define TM_INV_CLOVERTEX cloverInv
 // NOTE(review): the norm mappings below mirror the four names defined by the
 // texture branch and assume the kernel arguments cNorm / cNrm2 carry the
 // clover and inverse-clover norms -- confirm against READ_CLOVER_HALF.
 #define TMCLOVERTEXNORM cNorm
 #define TM_INV_CLOVERTEXNORM cNrm2
908  #define READ_CLOVER READ_CLOVER_HALF
909  #define ASSN_CLOVER ASSN_CLOVER_HALF
910 #else
911  #ifdef USE_TEXTURE_OBJECTS
912  #define TMCLOVERTEX (param.cloverTex)
913  #define TMCLOVERTEXNORM (param.cloverNormTex)
914  #define TM_INV_CLOVERTEX (param.cloverInvTex)
915  #define TM_INV_CLOVERTEXNORM (param.cloverInvNormTex)
916  #else
917  #define TMCLOVERTEX cloverTexHalf
918  #define TMCLOVERTEXNORM cloverTexNorm
919  #define TM_INV_CLOVERTEX cloverInvTexHalf
920  #define TM_INV_CLOVERTEXNORM cloverInvTexNorm
921  #endif
922  #define READ_CLOVER READ_CLOVER_HALF_TEX
923  #define ASSN_CLOVER ASSN_CLOVER_HALF_TEX
924 #endif
925 
/**
 * short4 (half-precision) variant: applies APPLY_CLOVER_TWIST to one site per thread.
 *
 * Each thread handles one site index `sid`; threads with sid >= param.threads exit at once.
 *   1. Fetch six float4 values through SPINORTEX at sid + i*param.sp_stride (i = 0..5) and
 *      one float through SPINORTEXNORM at sid, then multiply the former by the latter.
 *   2. Expand APPLY_CLOVER_TWIST(c, a, S).
 *   3. Store the largest absolute component of I0..I5 in spinorNorm[sid].
 *   4. Scale I0..I5 by MAX_SHORT / that maximum, cast each component to short and
 *      store the six short4 values in spinor at sid + i*param.sp_stride.
 *
 * The body is compiled only when GPU_TWISTED_CLOVER_DIRAC is defined; otherwise the
 * kernel does nothing.
 *
 * in, inNorm, clover, cNorm, cloverInv and cNrm2 are never named in the visible body;
 * presumably the fetch/clover macros reach them in some build configurations -- TODO confirm.
 */
__global__ void twistCloverGamma5Kernel(short4* spinor, float *spinorNorm, float a,
                                        const short4 *in, const float *inNorm, DslashParam param,
                                        const short4 *clover, const float *cNorm,
                                        const short4 *cloverInv, const float *cNrm2)
{
#ifdef GPU_TWISTED_CLOVER_DIRAC
  int sid = threadIdx.x + blockDim.x*blockIdx.x;
  if (param.threads <= sid) return;

  // NOTE(review): sid, I0..I5, C0..C8 and K keep these exact names because the macro
  // expanded below is defined elsewhere (tmc_core.h) and presumably refers to them.
  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);

  // Multiply every fetched component by the site's norm.
  float KC = TEX1DFETCH(float, SPINORTEXNORM, sid);
  I0 = KC * I0;
  I1 = KC * I1;
  I2 = KC * I2;
  I3 = KC * I3;
  I4 = KC * I4;
  I5 = KC * I5;

  float4 C0, C1, C2;
  float4 C3, C4, C5;
  float4 C6, C7, C8;
  float K;

  //apply (Clover + i*a*gamma_5) to the input spinor
  APPLY_CLOVER_TWIST(c, a, S);

  // Largest magnitude inside each float4 ...
  float peak0 = fmaxf(fmaxf(fabsf(I0.x), fabsf(I0.y)), fmaxf(fabsf(I0.z), fabsf(I0.w)));
  float peak1 = fmaxf(fmaxf(fabsf(I1.x), fabsf(I1.y)), fmaxf(fabsf(I1.z), fabsf(I1.w)));
  float peak2 = fmaxf(fmaxf(fabsf(I2.x), fabsf(I2.y)), fmaxf(fabsf(I2.z), fabsf(I2.w)));
  float peak3 = fmaxf(fmaxf(fabsf(I3.x), fabsf(I3.y)), fmaxf(fabsf(I3.z), fabsf(I3.w)));
  float peak4 = fmaxf(fmaxf(fabsf(I4.x), fabsf(I4.y)), fmaxf(fabsf(I4.z), fabsf(I4.w)));
  float peak5 = fmaxf(fmaxf(fabsf(I5.x), fabsf(I5.y)), fmaxf(fabsf(I5.z), fabsf(I5.w)));

  // ... then across all six, pairing (0,1), (2,3), (4,5).
  float sitePeak = fmaxf(fmaxf(fmaxf(peak0, peak1), fmaxf(peak2, peak3)),
                         fmaxf(peak4, peak5));

  spinorNorm[sid] = sitePeak;

  // NOTE(review): no guard for sitePeak == 0 here -- confirm that an all-zero result
  // is either impossible or handled acceptably by the conversion below.
  float quant = __fdividef(MAX_SHORT, sitePeak);

  I0 = quant * I0;
  I1 = quant * I1;
  I2 = quant * I2;
  I3 = quant * I3;
  I4 = quant * I4;
  I5 = quant * I5;

  // Plain (short) casts: the fractional part is discarded, not rounded.
  spinor[sid + 0 * (param.sp_stride)] =
      make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w);
  spinor[sid + 1 * (param.sp_stride)] =
      make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w);
  spinor[sid + 2 * (param.sp_stride)] =
      make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w);
  spinor[sid + 3 * (param.sp_stride)] =
      make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w);
  spinor[sid + 4 * (param.sp_stride)] =
      make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w);
  spinor[sid + 5 * (param.sp_stride)] =
      make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w);

#endif
}
997 
/**
 * short4 (half-precision) variant: applies APPLY_CLOVER_TWIST_INV to one site per thread.
 *
 * Each thread handles one site index `sid`; threads with sid >= param.threads exit at once.
 *   1. Fetch six float4 values through SPINORTEX at sid + i*param.sp_stride (i = 0..5) and
 *      one float through SPINORTEXNORM at sid, then multiply the former by the latter.
 *   2. Expand APPLY_CLOVER_TWIST_INV(c, cinv, a, S).
 *   3. Store the largest absolute component of I0..I5 in spinorNorm[sid].
 *   4. Scale I0..I5 by MAX_SHORT / that maximum, cast each component to short and
 *      store the six short4 values in spinor at sid + i*param.sp_stride.
 *
 * The body is compiled only when GPU_TWISTED_CLOVER_DIRAC is defined; otherwise the
 * kernel does nothing.
 *
 * in, inNorm, clover, cNorm, cloverInv and cNrm2 are never named in the visible body;
 * presumably the fetch/clover macros reach them in some build configurations -- TODO confirm.
 */
__global__ void twistCloverGamma5InvKernel(short4* spinor, float *spinorNorm, float a,
                                           const short4 *in, const float *inNorm, DslashParam param,
                                           const short4 *clover, const float *cNorm,
                                           const short4 *cloverInv, const float *cNrm2)
{
#ifdef GPU_TWISTED_CLOVER_DIRAC
  int sid = threadIdx.x + blockDim.x*blockIdx.x;
  if (param.threads <= sid) return;

  // NOTE(review): sid, I0..I5, C0..C8 and K keep these exact names because the macro
  // expanded below is defined elsewhere (tmc_core.h) and presumably refers to them.
  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);

  // Multiply every fetched component by the site's norm.
  float KC = TEX1DFETCH(float, SPINORTEXNORM, sid);
  I0 = KC * I0;
  I1 = KC * I1;
  I2 = KC * I2;
  I3 = KC * I3;
  I4 = KC * I4;
  I5 = KC * I5;

  float4 C0, C1, C2;
  float4 C3, C4, C5;
  float4 C6, C7, C8;
  float K;

  //apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
  APPLY_CLOVER_TWIST_INV(c, cinv, a, S);

  // Largest magnitude inside each float4 ...
  float peak0 = fmaxf(fmaxf(fabsf(I0.x), fabsf(I0.y)), fmaxf(fabsf(I0.z), fabsf(I0.w)));
  float peak1 = fmaxf(fmaxf(fabsf(I1.x), fabsf(I1.y)), fmaxf(fabsf(I1.z), fabsf(I1.w)));
  float peak2 = fmaxf(fmaxf(fabsf(I2.x), fabsf(I2.y)), fmaxf(fabsf(I2.z), fabsf(I2.w)));
  float peak3 = fmaxf(fmaxf(fabsf(I3.x), fabsf(I3.y)), fmaxf(fabsf(I3.z), fabsf(I3.w)));
  float peak4 = fmaxf(fmaxf(fabsf(I4.x), fabsf(I4.y)), fmaxf(fabsf(I4.z), fabsf(I4.w)));
  float peak5 = fmaxf(fmaxf(fabsf(I5.x), fabsf(I5.y)), fmaxf(fabsf(I5.z), fabsf(I5.w)));

  // ... then across all six, pairing (0,1), (2,3), (4,5).
  float sitePeak = fmaxf(fmaxf(fmaxf(peak0, peak1), fmaxf(peak2, peak3)),
                         fmaxf(peak4, peak5));

  spinorNorm[sid] = sitePeak;

  // NOTE(review): no guard for sitePeak == 0 here -- confirm that an all-zero result
  // is either impossible or handled acceptably by the conversion below.
  float quant = __fdividef(MAX_SHORT, sitePeak);

  I0 = quant * I0;
  I1 = quant * I1;
  I2 = quant * I2;
  I3 = quant * I3;
  I4 = quant * I4;
  I5 = quant * I5;

  // Plain (short) casts: the fractional part is discarded, not rounded.
  spinor[sid + 0 * (param.sp_stride)] =
      make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w);
  spinor[sid + 1 * (param.sp_stride)] =
      make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w);
  spinor[sid + 2 * (param.sp_stride)] =
      make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w);
  spinor[sid + 3 * (param.sp_stride)] =
      make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w);
  spinor[sid + 4 * (param.sp_stride)] =
      make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w);
  spinor[sid + 5 * (param.sp_stride)] =
      make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w);

#endif
}
1069 
// ---------------------------------------------------------------------------
// End-of-header cleanup: remove every helper macro set up for the short4
// kernels so that none of them leaks into the including translation unit.
// #undef on a name that is not currently defined is a no-op, so all names are
// listed regardless of which DIRECT_ACCESS_CLOVER / USE_TEXTURE_OBJECTS branch
// defined them.
// ---------------------------------------------------------------------------
#undef CLOVERTEX
#undef READ_CLOVER
// ASSN_CLOVER is defined next to READ_CLOVER in both clover-access branches and
// must be dropped with it (the cleanup after the float4 kernels does the same).
#undef ASSN_CLOVER
#undef TMCLOVERTEX
#undef TMCLOVERTEXNORM
#undef TM_INV_CLOVERTEX
#undef TM_INV_CLOVERTEXNORM


#undef SPINORTEX
#undef SPINORTEXNORM

// Spinor component aliases (S<spin><colour>_re / _im) and the working float type.
#undef S00_re
#undef S00_im
#undef S01_re
#undef S01_im
#undef S02_re
#undef S02_im
#undef S10_re
#undef S10_im
#undef S11_re
#undef S11_im
#undef S12_re
#undef S12_im
#undef S20_re
#undef S20_im
#undef S21_re
#undef S21_im
#undef S22_re
#undef S22_im
#undef S30_re
#undef S30_im
#undef S31_re
#undef S31_im
#undef S32_re
#undef S32_im
#undef spinorFloat

#endif //_TMC_GAMMA_CORE_H
1108 
1109 
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:1
__global__ void twistCloverGamma5InvKernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
#define SPINORTEXNORM
cpuColorSpinorField * spinor
Definition: dslash_test.cpp:40
QudaGaugeParam param
Definition: pack_test.cpp:17
__global__ void twistCloverGamma5Kernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
Definition: tmc_core.h:432
#define SPINORTEX
cpuColorSpinorField * in
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Definition: texture.h:90
#define MAX_SHORT
Definition: quda_internal.h:30
#define TEX1DFETCH(type, tex, idx)