QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_core.h
Go to the documentation of this file.
1 #ifndef _TM_CORE_H
2 #define _TM_CORE_H
3 
4 //action of the operator b*(1 + i*a*gamma5)
5 //also uses macros from io_spinor.h
6 
//Component-wise scaling of a float4 by a scalar: returns x * y.
__device__ float4 operator*(const float &x, const float4 &y)
{
  return make_float4(x * y.x, x * y.y, x * y.z, x * y.w);
}
18 
//Component-wise scaling of a double2 (complex number) by a real scalar.
__device__ double2 operator*(const double &x, const double2 &y)
{
  return make_double2(x * y.x, x * y.y);
}
28 
29 
//Readability aliases mapping the real/imaginary parts of the temporary
//double2 registers tmp0..tmp3 onto their .x/.y components.
#define tmp0_re tmp0.x
#define tmp0_im tmp0.y
#define tmp1_re tmp1.x
#define tmp1_im tmp1.y
#define tmp2_re tmp2.x
#define tmp2_im tmp2.y
#define tmp3_re tmp3.x
#define tmp3_im tmp3.y

//Select the input-spinor texture: the per-call texture object when texture
//objects are enabled, otherwise the global double-precision texture reference
//(presumably declared in io_spinor.h -- TODO confirm).
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
44 
45 #if (__COMPUTE_CAPABILITY__ >= 130)
//Degenerate twisted-mass rotation, double precision:
//  spinor = b * (1 + i*a*gamma_5) * in
//One thread per lattice site; I0..I11 are the site's 12 complex components
//(the pairs (I0,I6), (I3,I9) etc. are the spin components coupled by gamma_5
//in this basis).  The null/null2 arguments are unused placeholders that keep
//the signature uniform with the half-precision overload.
//Compiles to an empty kernel unless GPU_TWISTED_MASS_DIRAC is defined.
__global__ void twistGamma5Kernel(double2 *spinor, float *null, double a, double b,
  const double2 *in, const float *null2, DslashParam param)
{
#ifdef GPU_TWISTED_MASS_DIRAC

  //global site index, with guard against the grid tail
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

#ifndef FERMI_NO_DBLE_TEX
  //load the 12 complex components through the texture path
  double2 I0 = fetch_double2(SPINORTEX, sid + 0 * param.sp_stride);
  double2 I1 = fetch_double2(SPINORTEX, sid + 1 * param.sp_stride);
  double2 I2 = fetch_double2(SPINORTEX, sid + 2 * param.sp_stride);
  double2 I3 = fetch_double2(SPINORTEX, sid + 3 * param.sp_stride);
  double2 I4 = fetch_double2(SPINORTEX, sid + 4 * param.sp_stride);
  double2 I5 = fetch_double2(SPINORTEX, sid + 5 * param.sp_stride);
  double2 I6 = fetch_double2(SPINORTEX, sid + 6 * param.sp_stride);
  double2 I7 = fetch_double2(SPINORTEX, sid + 7 * param.sp_stride);
  double2 I8 = fetch_double2(SPINORTEX, sid + 8 * param.sp_stride);
  double2 I9 = fetch_double2(SPINORTEX, sid + 9 * param.sp_stride);
  double2 I10 = fetch_double2(SPINORTEX, sid + 10 * param.sp_stride);
  double2 I11 = fetch_double2(SPINORTEX, sid + 11 * param.sp_stride);
#else
  //plain global-memory loads when double-precision texture fetches are disabled
  double2 I0 = in[sid + 0 * param.sp_stride];
  double2 I1 = in[sid + 1 * param.sp_stride];
  double2 I2 = in[sid + 2 * param.sp_stride];
  double2 I3 = in[sid + 3 * param.sp_stride];
  double2 I4 = in[sid + 4 * param.sp_stride];
  double2 I5 = in[sid + 5 * param.sp_stride];
  double2 I6 = in[sid + 6 * param.sp_stride];
  double2 I7 = in[sid + 7 * param.sp_stride];
  double2 I8 = in[sid + 8 * param.sp_stride];
  double2 I9 = in[sid + 9 * param.sp_stride];
  double2 I10 = in[sid + 10 * param.sp_stride];
  double2 I11 = in[sid + 11 * param.sp_stride];
#endif

  //NOTE(review): volatile presumably constrains register allocation or code
  //motion through the tmp*_re/_im macro aliases -- confirm before removing
  volatile double2 tmp0, tmp1, tmp2, tmp3;

  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)

  //get the 1st color component:

  //tmpN = IN + i*a*(partner component); (re,im) of i*a*z is (-a*z.im, +a*z.re)
  tmp0_re = I0.x - a * I6.y;
  tmp0_im = I0.y + a * I6.x;

  tmp2_re = I6.x - a * I0.y;
  tmp2_im = I6.y + a * I0.x;

  tmp1_re = I3.x - a * I9.y;
  tmp1_im = I3.y + a * I9.x;

  tmp3_re = I9.x - a * I3.y;
  tmp3_im = I9.y + a * I3.x;

  //overall scale by b
  I0.x = b * tmp0_re;
  I0.y = b * tmp0_im;
  I3.x = b * tmp1_re;
  I3.y = b * tmp1_im;
  I6.x = b * tmp2_re;
  I6.y = b * tmp2_im;
  I9.x = b * tmp3_re;
  I9.y = b * tmp3_im;

  //get the 2nd color component:

  tmp0_re = I1.x - a * I7.y;
  tmp0_im = I1.y + a * I7.x;

  tmp2_re = I7.x - a * I1.y;
  tmp2_im = I7.y + a * I1.x;

  tmp1_re = I4.x - a * I10.y;
  tmp1_im = I4.y + a * I10.x;

  tmp3_re = I10.x - a * I4.y;
  tmp3_im = I10.y + a * I4.x;

  I1.x = b * tmp0_re;
  I1.y = b * tmp0_im;
  I4.x = b * tmp1_re;
  I4.y = b * tmp1_im;
  I7.x = b * tmp2_re;
  I7.y = b * tmp2_im;
  I10.x = b * tmp3_re;
  I10.y = b * tmp3_im;

  //get the 3rd color component:

  tmp0_re = I2.x - a* I8.y;
  tmp0_im = I2.y + a* I8.x;

  tmp2_re = I8.x - a* I2.y;
  tmp2_im = I8.y + a* I2.x;

  tmp1_re = I5.x - a* I11.y;
  tmp1_im = I5.y + a* I11.x;

  tmp3_re = I11.x - a* I5.y;
  tmp3_im = I11.y + a* I5.x;

  I2.x = b * tmp0_re;
  I2.y = b * tmp0_im;
  I5.x = b * tmp1_re;
  I5.y = b * tmp1_im;
  I8.x = b * tmp2_re;
  I8.y = b * tmp2_im;
  I11.x = b * tmp3_re;
  I11.y = b * tmp3_im;

  //write the rotated spinor back to global memory
  spinor[sid + 0 * param.sp_stride] = I0;
  spinor[sid + 1 * param.sp_stride] = I1;
  spinor[sid + 2 * param.sp_stride] = I2;
  spinor[sid + 3 * param.sp_stride] = I3;
  spinor[sid + 4 * param.sp_stride] = I4;
  spinor[sid + 5 * param.sp_stride] = I5;
  spinor[sid + 6 * param.sp_stride] = I6;
  spinor[sid + 7 * param.sp_stride] = I7;
  spinor[sid + 8 * param.sp_stride] = I8;
  spinor[sid + 9 * param.sp_stride] = I9;
  spinor[sid + 10 * param.sp_stride] = I10;
  spinor[sid + 11 * param.sp_stride] = I11;
#endif
}
169 
170 
//Non-degenerate (two-flavor) twisted-mass rotation, double precision:
//  spinor = c * (1 - i*a*gamma_5*tau_3 + b*tau_1) * in
//Flavor 1 lives at offset sid, flavor 2 at sid + param.fl_stride.  Each of
//the six sections below processes one gamma_5-coupled pair of components
//for both flavors: accum1_* accumulates the flavor-1 output (twist term from
//flavor 1 plus b times flavor 2 via tau_1), accum2_* the flavor-2 output
//(opposite twist sign from tau_3).  Results are stored as soon as a pair is
//complete to limit register pressure.
//Compiles to an empty kernel unless GPU_NDEG_TWISTED_MASS_DIRAC is defined.
__global__ void twistGamma5Kernel(double2 *spinor, float *null, const double a, const double b, const double c, const double2 *in, const float *null2, DslashParam param)
{
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
  int sid = blockIdx.x * blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

  //temporal regs:
  double2 accum1_0, accum1_1;
  double2 accum2_0, accum2_1;
  double2 tmp0, tmp1;

  int flv1_idx = sid;
  int flv2_idx = sid + param.fl_stride; //or simply +flavor_length (volume incl. pad)

  //apply (1 - i*a*gamma_5 * tau_3 + b * tau_1)

  //get the first color component for each flavor:

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 0 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 6 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 0 * param.sp_stride];
  tmp1 = in[flv1_idx + 6 * param.sp_stride];
#endif

  //flavor-1 twist: (1 - i*a*gamma_5) acting on the (tmp0, tmp1) pair
  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  //tau_1 contribution of flavor 1 to the flavor-2 output
  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;


#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 0 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 6 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 0 * param.sp_stride];
  tmp1 = in[flv2_idx + 6 * param.sp_stride];
#endif

  //flavor-2 twist (opposite sign of a, from tau_3)
  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  //tau_1 contribution of flavor 2 to the flavor-1 output
  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 0 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 6 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 0 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 6 * param.sp_stride] = c * accum2_1;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 3 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 9 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 3 * param.sp_stride];
  tmp1 = in[flv1_idx + 9 * param.sp_stride];
#endif

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 3 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 9 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 3 * param.sp_stride];
  tmp1 = in[flv2_idx + 9 * param.sp_stride];
#endif

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 3 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 9 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 3 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 9 * param.sp_stride] = c * accum2_1;
  //get the second color component for each flavor:

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 1 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 7 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 1 * param.sp_stride];
  tmp1 = in[flv1_idx + 7 * param.sp_stride];
#endif

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 1 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 7 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 1 * param.sp_stride];
  tmp1 = in[flv2_idx + 7 * param.sp_stride];
#endif

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 1 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 7 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 1 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 7 * param.sp_stride] = c * accum2_1;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 4 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 10 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 4 * param.sp_stride];
  tmp1 = in[flv1_idx + 10 * param.sp_stride];
#endif

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 4 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 10 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 4 * param.sp_stride];
  tmp1 = in[flv2_idx + 10 * param.sp_stride];
#endif

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 4 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 10 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 4 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 10 * param.sp_stride] = c * accum2_1;
  //get the third color component for each flavor:

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 2 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 8 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 2 * param.sp_stride];
  tmp1 = in[flv1_idx + 8 * param.sp_stride];
#endif

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 2 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 8 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 2 * param.sp_stride];
  tmp1 = in[flv2_idx + 8 * param.sp_stride];
#endif

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 2 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 8 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 2 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 8 * param.sp_stride] = c * accum2_1;

#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv1_idx + 5 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv1_idx + 11 * param.sp_stride);
#else
  tmp0 = in[flv1_idx + 5 * param.sp_stride];
  tmp1 = in[flv1_idx + 11 * param.sp_stride];
#endif

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;


#ifndef FERMI_NO_DBLE_TEX
  tmp0 = fetch_double2(SPINORTEX, flv2_idx + 5 * param.sp_stride);
  tmp1 = fetch_double2(SPINORTEX, flv2_idx + 11 * param.sp_stride);
#else
  tmp0 = in[flv2_idx + 5 * param.sp_stride];
  tmp1 = in[flv2_idx + 11 * param.sp_stride];
#endif

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;

  //store results back to memory:

  spinor[flv1_idx + 5 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 11 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 5 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 11 * param.sp_stride] = c * accum2_1;

#endif
}
477 
478 #endif // (__COMPUTE_CAPABILITY__ >= 130)
479 
//Retire the double2-based aliases and remap tmp*_re/_im onto the components
//of two float4 registers for the single-precision kernels below
//(tmp0 holds the tmp0/tmp1 pairs, tmp1 holds the tmp2/tmp3 pairs).
#undef tmp0_re
#undef tmp0_im
#undef tmp1_re
#undef tmp1_im
#undef tmp2_re
#undef tmp2_im
#undef tmp3_re
#undef tmp3_im

#define tmp0_re tmp0.x
#define tmp0_im tmp0.y
#define tmp1_re tmp0.z
#define tmp1_im tmp0.w
#define tmp2_re tmp1.x
#define tmp2_im tmp1.y
#define tmp3_re tmp1.z
#define tmp3_im tmp1.w

//Rebind the input texture to the single-precision spinor texture.
#undef SPINORTEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
504 
505 
//Degenerate twisted-mass rotation, single precision:
//  spinor = b * (1 + i*a*gamma_5) * in
//The 12 complex components are packed into six float4 loads I0..I5
//(two complex numbers per float4), always read through the texture path.
//Compiles to an empty kernel unless GPU_TWISTED_MASS_DIRAC is defined.
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b,
  const float4 *in, const float *null2, DslashParam param)
{
#ifdef GPU_TWISTED_MASS_DIRAC
  //global site index, with guard against the grid tail
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);

  //NOTE(review): volatile presumably constrains code motion through the
  //tmp*_re/_im macro aliases -- confirm before removing
  volatile float4 tmp0, tmp1;

  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)

  //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw)

  //(re,im) of i*a*z is (-a*z.im, +a*z.re)
  tmp0_re = I0.x - a * I3.y;
  tmp0_im = I0.y + a * I3.x;

  tmp1_re = I1.z - a * I4.w;
  tmp1_im = I1.w + a * I4.z;

  tmp2_re = I3.x - a * I0.y;
  tmp2_im = I3.y + a * I0.x;

  tmp3_re = I4.z - a * I1.w;
  tmp3_im = I4.w + a * I1.z;

  //overall scale by b
  I0.x = b * tmp0_re;
  I0.y = b * tmp0_im;
  I1.z = b * tmp1_re;
  I1.w = b * tmp1_im;
  I3.x = b * tmp2_re;
  I3.y = b * tmp2_im;
  I4.z = b * tmp3_re;
  I4.w = b * tmp3_im;

  //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy)

  tmp0_re = I0.z - a * I3.w;
  tmp0_im = I0.w + a * I3.z;

  tmp1_re = I2.x - a * I5.y;
  tmp1_im = I2.y + a * I5.x;

  tmp2_re = I3.z - a * I0.w;
  tmp2_im = I3.w + a * I0.z;

  tmp3_re = I5.x - a * I2.y;
  tmp3_im = I5.y + a * I2.x;

  I0.z = b * tmp0_re;
  I0.w = b * tmp0_im;
  I2.x = b * tmp1_re;
  I2.y = b * tmp1_im;
  I3.z = b * tmp2_re;
  I3.w = b * tmp2_im;
  I5.x = b * tmp3_re;
  I5.y = b * tmp3_im;

  //get the 3rd color component:(I1.xy, I2.zw, I4.xy, I5.zw)

  tmp0_re = I1.x - a * I4.y;
  tmp0_im = I1.y + a * I4.x;

  tmp1_re = I2.z - a * I5.w;
  tmp1_im = I2.w + a * I5.z;

  tmp2_re = I4.x - a * I1.y;
  tmp2_im = I4.y + a * I1.x;

  tmp3_re = I5.z - a * I2.w;
  tmp3_im = I5.w + a * I2.z;

  I1.x = b * tmp0_re;
  I1.y = b * tmp0_im;
  I2.z = b * tmp1_re;
  I2.w = b * tmp1_im;
  I4.x = b * tmp2_re;
  I4.y = b * tmp2_im;
  I5.z = b * tmp3_re;
  I5.w = b * tmp3_im;

  //write the rotated spinor back to global memory
  spinor[sid + 0 * param.sp_stride] = I0;
  spinor[sid + 1 * param.sp_stride] = I1;
  spinor[sid + 2 * param.sp_stride] = I2;
  spinor[sid + 3 * param.sp_stride] = I3;
  spinor[sid + 4 * param.sp_stride] = I4;
  spinor[sid + 5 * param.sp_stride] = I5;
#endif
}
601 
//Non-degenerate (two-flavor) twisted-mass rotation, single precision:
//  spinor = c * (1 - i*a*gamma_5*tau_3 + b*tau_1) * in
//Flavor 1 lives at offset sid, flavor 2 at sid + param.fl_stride.  Each
//section processes one float4 pair (two complex components) per flavor:
//accum1_* accumulates the flavor-1 output, accum2_* the flavor-2 output
//(opposite twist sign from tau_3; b*tau_1 mixes the flavors).
//Compiles to an empty kernel unless GPU_NDEG_TWISTED_MASS_DIRAC is defined.
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b, float c, const float4 *in, const float *null2, DslashParam param)
{
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
  int sid = blockIdx.x * blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

  float4 accum1_0, accum1_1;
  float4 accum2_0, accum2_1;
  float4 tmp0, tmp1;

  int flv1_idx = sid;
  int flv2_idx = sid + param.fl_stride;

  //apply (1 - i*a*gamma_5 * tau_3 + b * tau_1)

  //get the first color component for each flavor:

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 0 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 3 * param.sp_stride);

  //flavor-1 twist: (1 - i*a*gamma_5) on the (tmp0, tmp1) pair
  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;
  accum1_0.z = tmp0.z + a * tmp1.w;
  accum1_0.w = tmp0.w - a * tmp1.z;

  //tau_1 contribution of flavor 1 to the flavor-2 output
  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;
  accum2_0.z = b * tmp0.z;
  accum2_0.w = b * tmp0.w;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;
  accum1_1.z = tmp1.z + a * tmp0.w;
  accum1_1.w = tmp1.w - a * tmp0.z;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;
  accum2_1.z = b * tmp1.z;
  accum2_1.w = b * tmp1.w;

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 0 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 3 * param.sp_stride);

  //flavor-2 twist (opposite sign of a) plus tau_1 into flavor 1
  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;
  accum2_0.z += tmp0.z - a * tmp1.w;
  accum2_0.w += tmp0.w + a * tmp1.z;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;
  accum1_0.z += b * tmp0.z;
  accum1_0.w += b * tmp0.w;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;
  accum2_1.z += tmp1.z - a * tmp0.w;
  accum2_1.w += tmp1.w + a * tmp0.z;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;
  accum1_1.z += b * tmp1.z;
  accum1_1.w += b * tmp1.w;

  spinor[flv1_idx + 0 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 3 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 0 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 3 * param.sp_stride] = c * accum2_1;

  //get the second color component for each flavor:

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 1 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 4 * param.sp_stride);

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;
  accum1_0.z = tmp0.z + a * tmp1.w;
  accum1_0.w = tmp0.w - a * tmp1.z;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;
  accum2_0.z = b * tmp0.z;
  accum2_0.w = b * tmp0.w;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;
  accum1_1.z = tmp1.z + a * tmp0.w;
  accum1_1.w = tmp1.w - a * tmp0.z;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;
  accum2_1.z = b * tmp1.z;
  accum2_1.w = b * tmp1.w;

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 1 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 4 * param.sp_stride);

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;
  accum2_0.z += tmp0.z - a * tmp1.w;
  accum2_0.w += tmp0.w + a * tmp1.z;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;
  accum1_0.z += b * tmp0.z;
  accum1_0.w += b * tmp0.w;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;
  accum2_1.z += tmp1.z - a * tmp0.w;
  accum2_1.w += tmp1.w + a * tmp0.z;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;
  accum1_1.z += b * tmp1.z;
  accum1_1.w += b * tmp1.w;

  spinor[flv1_idx + 1 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 4 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 1 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 4 * param.sp_stride] = c * accum2_1;

  //get the third color component for each flavor:

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 2 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 5 * param.sp_stride);

  accum1_0.x = tmp0.x + a * tmp1.y;
  accum1_0.y = tmp0.y - a * tmp1.x;
  accum1_0.z = tmp0.z + a * tmp1.w;
  accum1_0.w = tmp0.w - a * tmp1.z;

  accum2_0.x = b * tmp0.x;
  accum2_0.y = b * tmp0.y;
  accum2_0.z = b * tmp0.z;
  accum2_0.w = b * tmp0.w;

  accum1_1.x = tmp1.x + a * tmp0.y;
  accum1_1.y = tmp1.y - a * tmp0.x;
  accum1_1.z = tmp1.z + a * tmp0.w;
  accum1_1.w = tmp1.w - a * tmp0.z;

  accum2_1.x = b * tmp1.x;
  accum2_1.y = b * tmp1.y;
  accum2_1.z = b * tmp1.z;
  accum2_1.w = b * tmp1.w;

  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 2 * param.sp_stride);
  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 5 * param.sp_stride);

  accum2_0.x += tmp0.x - a * tmp1.y;
  accum2_0.y += tmp0.y + a * tmp1.x;
  accum2_0.z += tmp0.z - a * tmp1.w;
  accum2_0.w += tmp0.w + a * tmp1.z;

  accum1_0.x += b * tmp0.x;
  accum1_0.y += b * tmp0.y;
  accum1_0.z += b * tmp0.z;
  accum1_0.w += b * tmp0.w;

  accum2_1.x += tmp1.x - a * tmp0.y;
  accum2_1.y += tmp1.y + a * tmp0.x;
  accum2_1.z += tmp1.z - a * tmp0.w;
  accum2_1.w += tmp1.w + a * tmp0.z;

  accum1_1.x += b * tmp1.x;
  accum1_1.y += b * tmp1.y;
  accum1_1.z += b * tmp1.z;
  accum1_1.w += b * tmp1.w;

  spinor[flv1_idx + 2 * param.sp_stride] = c * accum1_0;
  spinor[flv1_idx + 5 * param.sp_stride] = c * accum1_1;
  spinor[flv2_idx + 2 * param.sp_stride] = c * accum2_0;
  spinor[flv2_idx + 5 * param.sp_stride] = c * accum2_1;

#endif
}
778 
//Rebind to the half-precision spinor texture; half storage also needs the
//per-site norm texture (SPINORTEXNORM) to reconstruct float values.
#undef SPINORTEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#define SPINORTEXNORM param.inTexNorm
#else
#define SPINORTEX spinorTexHalf
#define SPINORTEXNORM spinorTexHalfNorm
#endif
787 
788 
//Degenerate twisted-mass rotation, half precision:
//  spinor = b * (1 + i*a*gamma_5) * in
//Input is read through the half texture (which yields floats), rescaled by
//the per-site norm C, rotated exactly as in the single-precision kernel,
//then renormalized: the new per-site max-abs value is written to spinorNorm
//and the components are rescaled to the short range before conversion.
//Compiles to an empty kernel unless GPU_TWISTED_MASS_DIRAC is defined.
__global__ void twistGamma5Kernel(short4* spinor, float *spinorNorm, float a, float b,
  const short4 *in, const float *inNorm, DslashParam param)
{
#ifdef GPU_TWISTED_MASS_DIRAC
  //global site index, with guard against the grid tail
  int sid = blockIdx.x*blockDim.x + threadIdx.x;
  if (sid >= param.threads) return;

  float4 I0 = TEX1DFETCH(float4, SPINORTEX, sid + 0 * param.sp_stride);
  float4 I1 = TEX1DFETCH(float4, SPINORTEX, sid + 1 * param.sp_stride);
  float4 I2 = TEX1DFETCH(float4, SPINORTEX, sid + 2 * param.sp_stride);
  float4 I3 = TEX1DFETCH(float4, SPINORTEX, sid + 3 * param.sp_stride);
  float4 I4 = TEX1DFETCH(float4, SPINORTEX, sid + 4 * param.sp_stride);
  float4 I5 = TEX1DFETCH(float4, SPINORTEX, sid + 5 * param.sp_stride);

  //per-site norm: restores the true magnitude of the half-stored components
  float C = TEX1DFETCH(float, SPINORTEXNORM, sid);

  I0 = C * I0;
  I1 = C * I1;
  I2 = C * I2;
  I3 = C * I3;
  I4 = C * I4;
  I5 = C * I5;

  //NOTE(review): volatile presumably constrains code motion through the
  //tmp*_re/_im macro aliases -- confirm before removing
  volatile float4 tmp0, tmp1;

  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)

  //get the 1st color component:(I0.xy, I1.zw, I3.xy, I4.zw)

  tmp0_re = I0.x - a * I3.y;
  tmp0_im = I0.y + a * I3.x;

  tmp1_re = I1.z - a * I4.w;
  tmp1_im = I1.w + a * I4.z;

  tmp2_re = I3.x - a * I0.y;
  tmp2_im = I3.y + a * I0.x;

  tmp3_re = I4.z - a * I1.w;
  tmp3_im = I4.w + a * I1.z;

  I0.x = b * tmp0_re;
  I0.y = b * tmp0_im;
  I1.z = b * tmp1_re;
  I1.w = b * tmp1_im;
  I3.x = b * tmp2_re;
  I3.y = b * tmp2_im;
  I4.z = b * tmp3_re;
  I4.w = b * tmp3_im;

  //get the 2nd color component:(I0.zw, I2.xy, I3.zw, I5.xy)

  tmp0_re = I0.z - a * I3.w;
  tmp0_im = I0.w + a * I3.z;

  tmp1_re = I2.x - a * I5.y;
  tmp1_im = I2.y + a * I5.x;

  tmp2_re = I3.z - a * I0.w;
  tmp2_im = I3.w + a * I0.z;

  tmp3_re = I5.x - a * I2.y;
  tmp3_im = I5.y + a * I2.x;

  I0.z = b * tmp0_re;
  I0.w = b * tmp0_im;
  I2.x = b * tmp1_re;
  I2.y = b * tmp1_im;
  I3.z = b * tmp2_re;
  I3.w = b * tmp2_im;
  I5.x = b * tmp3_re;
  I5.y = b * tmp3_im;

  //get the 3rd color component:(I1.xy, I2.zw, I4.xy, I5.zw)

  tmp0_re = I1.x - a * I4.y;
  tmp0_im = I1.y + a * I4.x;

  tmp1_re = I2.z - a * I5.w;
  tmp1_im = I2.w + a * I5.z;

  tmp2_re = I4.x - a * I1.y;
  tmp2_im = I4.y + a * I1.x;

  tmp3_re = I5.z - a * I2.w;
  tmp3_im = I5.w + a * I2.z;

  I1.x = b * tmp0_re;
  I1.y = b * tmp0_im;
  I2.z = b * tmp1_re;
  I2.w = b * tmp1_im;
  I4.x = b * tmp2_re;
  I4.y = b * tmp2_im;
  I5.z = b * tmp3_re;
  I5.w = b * tmp3_im;


  //tree reduction of the 24 |component| values down to the site maximum c0
  float c0 = fmaxf(fabsf(I0.x), fabsf(I0.y));
  float c1 = fmaxf(fabsf(I0.z), fabsf(I0.w));
  float c2 = fmaxf(fabsf(I1.x), fabsf(I1.y));
  float c3 = fmaxf(fabsf(I1.z), fabsf(I1.w));
  float c4 = fmaxf(fabsf(I2.x), fabsf(I2.y));
  float c5 = fmaxf(fabsf(I2.z), fabsf(I2.w));
  float c6 = fmaxf(fabsf(I3.x), fabsf(I3.y));
  float c7 = fmaxf(fabsf(I3.z), fabsf(I3.w));
  float c8 = fmaxf(fabsf(I4.x), fabsf(I4.y));
  float c9 = fmaxf(fabsf(I4.z), fabsf(I4.w));
  float c10 = fmaxf(fabsf(I5.x), fabsf(I5.y));
  float c11 = fmaxf(fabsf(I5.z), fabsf(I5.w));
  c0 = fmaxf(c0, c1);
  c1 = fmaxf(c2, c3);
  c2 = fmaxf(c4, c5);
  c3 = fmaxf(c6, c7);
  c4 = fmaxf(c8, c9);
  c5 = fmaxf(c10, c11);
  c0 = fmaxf(c0, c1);
  c1 = fmaxf(c2, c3);
  c2 = fmaxf(c4, c5);
  c0 = fmaxf(c0, c1);
  c0 = fmaxf(c0, c2);
  //store the new per-site norm, then rescale so the largest magnitude maps
  //to MAX_SHORT (fast approximate division is sufficient here)
  spinorNorm[sid] = c0;
  float scale = __fdividef(MAX_SHORT, c0);

  I0 = scale * I0;
  I1 = scale * I1;
  I2 = scale * I2;
  I3 = scale * I3;
  I4 = scale * I4;
  I5 = scale * I5;

  //truncate to short and store the half-precision result
  spinor[sid+0*(param.sp_stride)] = make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w);
  spinor[sid+1*(param.sp_stride)] = make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w);
  spinor[sid+2*(param.sp_stride)] = make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w);
  spinor[sid+3*(param.sp_stride)] = make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w);
  spinor[sid+4*(param.sp_stride)] = make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w);
  spinor[sid+5*(param.sp_stride)] = make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w);

#endif
}
928 
929 
930 __global__ void twistGamma5Kernel(short4* spinor, float *spinorNorm, float a, float b, float c, const short4 *in, const float *inNorm, DslashParam param)
931 {
932 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
933  int sid = blockIdx.x * blockDim.x + threadIdx.x;
934  if (sid >= param.threads) return;
935 
936  int flv1_idx = sid;
937  int flv2_idx = sid + param.fl_stride;
938 
939  float C1 = TEX1DFETCH(float, SPINORTEXNORM, flv1_idx);
940  float C2 = TEX1DFETCH(float, SPINORTEXNORM, flv2_idx);
941 
942  float4 accum1_0, accum1_1, accum1_2, accum1_3, accum1_4, accum1_5;
943  float4 accum2_0, accum2_1, accum2_2, accum2_3, accum2_4, accum2_5;
944 
945  float4 tmp0, tmp1;
946 
947  C1 *= c, C2 *= c;
948 
949  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 0 * param.sp_stride);
950  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 3 * param.sp_stride);
951 
952  tmp0 = C1 * tmp0;
953  tmp1 = C1 * tmp1;
954 
955  accum1_0.x = tmp0.x + a * tmp1.y;
956  accum1_0.y = tmp0.y - a * tmp1.x;
957  accum1_0.z = tmp0.z + a * tmp1.w;
958  accum1_0.w = tmp0.w - a * tmp1.z;
959 
960  accum2_0.x = b * tmp0.x;
961  accum2_0.y = b * tmp0.y;
962  accum2_0.z = b * tmp0.z;
963  accum2_0.w = b * tmp0.w;
964 
965  accum1_3.x = tmp1.x + a * tmp0.y;
966  accum1_3.y = tmp1.y - a * tmp0.x;
967  accum1_3.z = tmp1.z + a * tmp0.w;
968  accum1_3.w = tmp1.w - a * tmp0.z;
969 
970  accum2_3.x = b * tmp1.x;
971  accum2_3.y = b * tmp1.y;
972  accum2_3.z = b * tmp1.z;
973  accum2_3.w = b * tmp1.w;
974 
975  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 0 * param.sp_stride);
976  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 3 * param.sp_stride);
977 
978  tmp0 = C2 * tmp0;
979  tmp1 = C2 * tmp1;
980 
981  accum2_0.x += tmp0.x - a * tmp1.y;
982  accum2_0.y += tmp0.y + a * tmp1.x;
983  accum2_0.z += tmp0.z - a * tmp1.w;
984  accum2_0.w += tmp0.w + a * tmp1.z;
985 
986  accum1_0.x += b * tmp0.x;
987  accum1_0.y += b * tmp0.y;
988  accum1_0.z += b * tmp0.z;
989  accum1_0.w += b * tmp0.w;
990 
991  accum2_3.x += tmp1.x - a * tmp0.y;
992  accum2_3.y += tmp1.y + a * tmp0.x;
993  accum2_3.z += tmp1.z - a * tmp0.w;
994  accum2_3.w += tmp1.w + a * tmp0.z;
995 
996  accum1_3.x += b * tmp1.x;
997  accum1_3.y += b * tmp1.y;
998  accum1_3.z += b * tmp1.z;
999  accum1_3.w += b * tmp1.w;
1000 
1001  float c1_0 = fmaxf(fabsf(accum1_0.x), fabsf(accum1_0.y));
1002  float c1_1 = fmaxf(fabsf(accum1_0.z), fabsf(accum1_0.w));
1003  float c1_6 = fmaxf(fabsf(accum1_3.x), fabsf(accum1_3.y));
1004  float c1_7 = fmaxf(fabsf(accum1_3.z), fabsf(accum1_3.w));
1005 
1006  float c2_0 = fmaxf(fabsf(accum2_0.x), fabsf(accum2_0.y));
1007  float c2_1 = fmaxf(fabsf(accum2_0.z), fabsf(accum2_0.w));
1008  float c2_6 = fmaxf(fabsf(accum2_3.x), fabsf(accum2_3.y));
1009  float c2_7 = fmaxf(fabsf(accum2_3.z), fabsf(accum2_3.w));
1010 
1011  //get the second color component for each flavor:
1012 
1013  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 1 * param.sp_stride);
1014  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 4 * param.sp_stride);
1015 
1016  tmp0 = C1 * tmp0;
1017  tmp1 = C1 * tmp1;
1018 
1019  accum1_1.x = tmp0.x + a * tmp1.y;
1020  accum1_1.y = tmp0.y - a * tmp1.x;
1021  accum1_1.z = tmp0.z + a * tmp1.w;
1022  accum1_1.w = tmp0.w - a * tmp1.z;
1023 
1024  accum2_1.x = b * tmp0.x;
1025  accum2_1.y = b * tmp0.y;
1026  accum2_1.z = b * tmp0.z;
1027  accum2_1.w = b * tmp0.w;
1028 
1029  accum1_4.x = tmp1.x + a * tmp0.y;
1030  accum1_4.y = tmp1.y - a * tmp0.x;
1031  accum1_4.z = tmp1.z + a * tmp0.w;
1032  accum1_4.w = tmp1.w - a * tmp0.z;
1033 
1034  accum2_4.x = b * tmp1.x;
1035  accum2_4.y = b * tmp1.y;
1036  accum2_4.z = b * tmp1.z;
1037  accum2_4.w = b * tmp1.w;
1038 
1039  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 1 * param.sp_stride);
1040  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 4 * param.sp_stride);
1041 
1042  tmp0 = C2 * tmp0;
1043  tmp1 = C2 * tmp1;
1044 
1045  accum2_1.x += tmp0.x - a * tmp1.y;
1046  accum2_1.y += tmp0.y + a * tmp1.x;
1047  accum2_1.z += tmp0.z - a * tmp1.w;
1048  accum2_1.w += tmp0.w + a * tmp1.z;
1049 
1050  accum1_1.x += b * tmp0.x;
1051  accum1_1.y += b * tmp0.y;
1052  accum1_1.z += b * tmp0.z;
1053  accum1_1.w += b * tmp0.w;
1054 
1055  accum2_4.x += tmp1.x - a * tmp0.y;
1056  accum2_4.y += tmp1.y + a * tmp0.x;
1057  accum2_4.z += tmp1.z - a * tmp0.w;
1058  accum2_4.w += tmp1.w + a * tmp0.z;
1059 
1060  accum1_4.x += b * tmp1.x;
1061  accum1_4.y += b * tmp1.y;
1062  accum1_4.z += b * tmp1.z;
1063  accum1_4.w += b * tmp1.w;
1064 
1065  float c1_2 = fmaxf(fabsf(accum1_1.x), fabsf(accum1_1.y));
1066  float c1_3 = fmaxf(fabsf(accum1_1.z), fabsf(accum1_1.w));
1067  float c1_8 = fmaxf(fabsf(accum1_4.x), fabsf(accum1_4.y));
1068  float c1_9 = fmaxf(fabsf(accum1_4.z), fabsf(accum1_4.w));
1069 
1070  float c2_2 = fmaxf(fabsf(accum2_1.x), fabsf(accum2_1.y));
1071  float c2_3 = fmaxf(fabsf(accum2_1.z), fabsf(accum2_1.w));
1072  float c2_8 = fmaxf(fabsf(accum2_4.x), fabsf(accum2_4.y));
1073  float c2_9 = fmaxf(fabsf(accum2_4.z), fabsf(accum2_4.w));
1074 
1075  //get the third color component for each flavor:
1076 
1077  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 2 * param.sp_stride);
1078  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv1_idx + 5 * param.sp_stride);
1079 
1080  tmp0 = C1 * tmp0;
1081  tmp1 = C1 * tmp1;
1082 
1083  accum1_2.x = tmp0.x + a * tmp1.y;
1084  accum1_2.y = tmp0.y - a * tmp1.x;
1085  accum1_2.z = tmp0.z + a * tmp1.w;
1086  accum1_2.w = tmp0.w - a * tmp1.z;
1087 
1088  accum2_2.x = b * tmp0.x;
1089  accum2_2.y = b * tmp0.y;
1090  accum2_2.z = b * tmp0.z;
1091  accum2_2.w = b * tmp0.w;
1092 
1093  accum1_5.x = tmp1.x + a * tmp0.y;
1094  accum1_5.y = tmp1.y - a * tmp0.x;
1095  accum1_5.z = tmp1.z + a * tmp0.w;
1096  accum1_5.w = tmp1.w - a * tmp0.z;
1097 
1098  accum2_5.x = b * tmp1.x;
1099  accum2_5.y = b * tmp1.y;
1100  accum2_5.z = b * tmp1.z;
1101  accum2_5.w = b * tmp1.w;
1102 
1103  tmp0 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 2 * param.sp_stride);
1104  tmp1 = TEX1DFETCH(float4, SPINORTEX, flv2_idx + 5 * param.sp_stride);
1105 
1106  tmp0 = C2 * tmp0;
1107  tmp1 = C2 * tmp1;
1108 
1109  accum2_2.x += tmp0.x - a * tmp1.y;
1110  accum2_2.y += tmp0.y + a * tmp1.x;
1111  accum2_2.z += tmp0.z - a * tmp1.w;
1112  accum2_2.w += tmp0.w + a * tmp1.z;
1113 
1114  accum1_2.x += b * tmp0.x;
1115  accum1_2.y += b * tmp0.y;
1116  accum1_2.z += b * tmp0.z;
1117  accum1_2.w += b * tmp0.w;
1118 
1119  accum2_5.x += tmp1.x - a * tmp0.y;
1120  accum2_5.y += tmp1.y + a * tmp0.x;
1121  accum2_5.z += tmp1.z - a * tmp0.w;
1122  accum2_5.w += tmp1.w + a * tmp0.z;
1123 
1124  accum1_5.x += b * tmp1.x;
1125  accum1_5.y += b * tmp1.y;
1126  accum1_5.z += b * tmp1.z;
1127  accum1_5.w += b * tmp1.w;
1128 
1129  float c1_4 = fmaxf(fabsf(accum1_2.x), fabsf(accum1_2.y));
1130  float c1_5 = fmaxf(fabsf(accum1_2.z), fabsf(accum1_2.w));
1131  float c1_10 = fmaxf(fabsf(accum1_5.x), fabsf(accum1_5.y));
1132  float c1_11 = fmaxf(fabsf(accum1_5.z), fabsf(accum1_5.w));
1133 
1134  float c2_4 = fmaxf(fabsf(accum2_2.x), fabsf(accum2_2.y));
1135  float c2_5 = fmaxf(fabsf(accum2_2.z), fabsf(accum2_2.w));
1136  float c2_10 = fmaxf(fabsf(accum2_5.x), fabsf(accum2_5.y));
1137  float c2_11 = fmaxf(fabsf(accum2_5.z), fabsf(accum2_5.w));
1138 
1139 
1140  c1_0 = fmaxf(c1_0, c1_1);
1141  c1_1 = fmaxf(c1_2, c1_3);
1142  c1_2 = fmaxf(c1_4, c1_5);
1143  c1_3 = fmaxf(c1_6, c1_7);
1144  c1_4 = fmaxf(c1_8, c1_9);
1145  c1_5 = fmaxf(c1_10, c1_11);
1146  c1_0 = fmaxf(c1_0, c1_1);
1147  c1_1 = fmaxf(c1_2, c1_3);
1148  c1_2 = fmaxf(c1_4, c1_5);
1149  c1_0 = fmaxf(c1_0, c1_1);
1150  c1_0 = fmaxf(c1_0, c1_2);
1151  spinorNorm[flv1_idx] = c1_0;
1152  float scale = __fdividef(MAX_SHORT, c1_0);
1153 
1154  accum1_0 = scale * accum1_0;
1155  accum1_1 = scale * accum1_1;
1156  accum1_2 = scale * accum1_2;
1157  accum1_3 = scale * accum1_3;
1158  accum1_4 = scale * accum1_4;
1159  accum1_5 = scale * accum1_5;
1160 
1161  c2_0 = fmaxf(c2_0, c2_1);
1162  c2_1 = fmaxf(c2_2, c2_3);
1163  c2_2 = fmaxf(c2_4, c2_5);
1164  c2_3 = fmaxf(c2_6, c2_7);
1165  c2_4 = fmaxf(c2_8, c2_9);
1166  c2_5 = fmaxf(c2_10, c2_11);
1167  c2_0 = fmaxf(c2_0, c2_1);
1168  c2_1 = fmaxf(c2_2, c2_3);
1169  c2_2 = fmaxf(c2_4, c2_5);
1170  c2_0 = fmaxf(c2_0, c2_1);
1171  c2_0 = fmaxf(c2_0, c2_2);
1172  spinorNorm[flv2_idx] = c2_0;
1173  scale = __fdividef(MAX_SHORT, c2_0);
1174 
1175  accum2_0 = scale * accum2_0;
1176  accum2_1 = scale * accum2_1;
1177  accum2_2 = scale * accum2_2;
1178  accum2_3 = scale * accum2_3;
1179  accum2_4 = scale * accum2_4;
1180  accum2_5 = scale * accum2_5;
1181 
1182 
1183  spinor[flv1_idx+0*(param.sp_stride)] = make_short4((short)accum1_0.x, (short)accum1_0.y, (short)accum1_0.z, (short)accum1_0.w);
1184  spinor[flv1_idx+1*(param.sp_stride)] = make_short4((short)accum1_1.x, (short)accum1_1.y, (short)accum1_1.z, (short)accum1_1.w);
1185  spinor[flv1_idx+2*(param.sp_stride)] = make_short4((short)accum1_2.x, (short)accum1_2.y, (short)accum1_2.z, (short)accum1_2.w);
1186  spinor[flv1_idx+3*(param.sp_stride)] = make_short4((short)accum1_3.x, (short)accum1_3.y, (short)accum1_3.z, (short)accum1_3.w);
1187  spinor[flv1_idx+4*(param.sp_stride)] = make_short4((short)accum1_4.x, (short)accum1_4.y, (short)accum1_4.z, (short)accum1_4.w);
1188  spinor[flv1_idx+5*(param.sp_stride)] = make_short4((short)accum1_5.x, (short)accum1_5.y, (short)accum1_5.z, (short)accum1_5.w);
1189 
1190  spinor[flv2_idx+0*(param.sp_stride)] = make_short4((short)accum2_0.x, (short)accum2_0.y, (short)accum2_0.z, (short)accum2_0.w);
1191  spinor[flv2_idx+1*(param.sp_stride)] = make_short4((short)accum2_1.x, (short)accum2_1.y, (short)accum2_1.z, (short)accum2_1.w);
1192  spinor[flv2_idx+2*(param.sp_stride)] = make_short4((short)accum2_2.x, (short)accum2_2.y, (short)accum2_2.z, (short)accum2_2.w);
1193  spinor[flv2_idx+3*(param.sp_stride)] = make_short4((short)accum2_3.x, (short)accum2_3.y, (short)accum2_3.z, (short)accum2_3.w);
1194  spinor[flv2_idx+4*(param.sp_stride)] = make_short4((short)accum2_4.x, (short)accum2_4.y, (short)accum2_4.z, (short)accum2_4.w);
1195  spinor[flv2_idx+5*(param.sp_stride)] = make_short4((short)accum2_5.x, (short)accum2_5.y, (short)accum2_5.z, (short)accum2_5.w);
1196 
1197 #endif
1198 }
1199 
1200 #undef SPINORTEX
1201 #undef SPINORTEXNORM
1202 
1203 
1204 #undef tmp0_re
1205 #undef tmp0_im
1206 #undef tmp1_re
1207 #undef tmp1_im
1208 #undef tmp2_re
1209 #undef tmp2_im
1210 #undef tmp3_re
1211 #undef tmp3_im
1212 
1213 #endif //_TM_CORE_H
1214 
1215 
#define tmp2_im
Definition: tm_core.h:494
#define tmp3_im
Definition: tm_core.h:496
#define tmp1_re
Definition: tm_core.h:491
int y[4]
cudaColorSpinorField * tmp1
Definition: dslash_test.cpp:41
#define SPINORTEXNORM
Definition: tm_core.h:785
cpuColorSpinorField * spinor
Definition: dslash_test.cpp:40
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b, const float4 *in, const float *null2, DslashParam param)
Definition: tm_core.h:506
#define tmp1_im
Definition: tm_core.h:492
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaColorSpinorField * tmp2
Definition: dslash_test.cpp:41
#define SPINORTEX
Definition: tm_core.h:784
cpuColorSpinorField * in
#define tmp0_re
Definition: tm_core.h:489
#define tmp0_im
Definition: tm_core.h:490
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Definition: texture.h:90
int x[4]
#define tmp2_re
Definition: tm_core.h:493
__device__ float4 operator*(const float &x, const float4 &y)
Definition: tm_core.h:7
#define MAX_SHORT
Definition: quda_internal.h:30
#define TEX1DFETCH(type, tex, idx)
#define tmp3_re
Definition: tm_core.h:495