QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
gamma5.h
Go to the documentation of this file.
1 #ifndef _TWIST_QUDA_G5
2 #define _TWIST_QUDA_G5
3 
4 //action of the operator b*(1 + i*a*gamma5)
5 //used also macros from io_spinor.h
6 
7 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
8 #define VOLATILE
9 #else // Open64 compiler
10 #define VOLATILE volatile
11 #endif
12 // input spinor
13 
14 #define tmp0_re tmp0.x
15 #define tmp0_im tmp0.y
16 #define tmp1_re tmp1.x
17 #define tmp1_im tmp1.y
18 #define tmp2_re tmp2.x
19 #define tmp2_im tmp2.y
20 #define tmp3_re tmp3.x
21 #define tmp3_im tmp3.y
22 
23 #if (__COMPUTE_CAPABILITY__ >= 130)
24 #ifdef DIRECT_ACCESS_WILSON_SPINOR
25  #define READ_SPINOR READ_SPINOR_DOUBLE
26  #define SPINORTEX in
27 #else
28  #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
29 
30  #ifdef USE_TEXTURE_OBJECTS
31  #define SPINORTEX param.inTex
32  #else
33  #define SPINORTEX spinorTexDouble
34  #endif // USE_TEXTURE_OBJECTS
35 #endif
36 
37 #define SPINOR_HOP 12
38 #define WRITE_SPINOR WRITE_SPINOR_DOUBLE2_STR
39 
40 #define o00_re I0.x
41 #define o00_im I0.y
42 #define o01_re I1.x
43 #define o01_im I1.y
44 #define o02_re I2.x
45 #define o02_im I2.y
46 #define o10_re I3.x
47 #define o10_im I3.y
48 #define o11_re I4.x
49 #define o11_im I4.y
50 #define o12_re I5.x
51 #define o12_im I5.y
52 #define o20_re I6.x
53 #define o20_im I6.y
54 #define o21_re I7.x
55 #define o21_im I7.y
56 #define o22_re I8.x
57 #define o22_im I8.y
58 #define o30_re I9.x
59 #define o30_im I9.y
60 #define o31_re I10.x
61 #define o31_im I10.y
62 #define o32_re I11.x
63 #define o32_im I11.y
64 
65 __global__ void gamma5Kernel(double2 *out, float *outNorm, double2 *in, float *inNorm, DslashParam param, int myStride)
66 {
67  int sid = blockIdx.x*blockDim.x + threadIdx.x;
68  if (sid >= param.threads) return;
69 /*
70  // output spinor
71  VOLATILE double2 I0;
72  VOLATILE double2 I1;
73  VOLATILE double2 I2;
74  VOLATILE double2 I3;
75  VOLATILE double2 I4;
76  VOLATILE double2 I5;
77  VOLATILE double2 I6;
78  VOLATILE double2 I7;
79  VOLATILE double2 I8;
80  VOLATILE double2 I9;
81  VOLATILE double2 I10;
82  VOLATILE double2 I11;
83 */
84  READ_SPINOR (SPINORTEX, myStride, sid, sid);
85 /*
86 #if defined(FERMI_NO_DBLE_TEX) || defined (USE_TEXTURE_OBJECTS)
87  double2 I0 = spinor[sid + 0 * sp_stride];
88  double2 I1 = spinor[sid + 1 * sp_stride];
89  double2 I2 = spinor[sid + 2 * sp_stride];
90  double2 I3 = spinor[sid + 3 * sp_stride];
91  double2 I4 = spinor[sid + 4 * sp_stride];
92  double2 I5 = spinor[sid + 5 * sp_stride];
93  double2 I6 = spinor[sid + 6 * sp_stride];
94  double2 I7 = spinor[sid + 7 * sp_stride];
95  double2 I8 = spinor[sid + 8 * sp_stride];
96  double2 I9 = spinor[sid + 9 * sp_stride];
97  double2 I10 = spinor[sid + 10 * sp_stride];
98  double2 I11 = spinor[sid + 11 * sp_stride];
99 #else
100  double2 I0 = fetch_double2(spinorTexDouble, sid + 0 * sp_stride);
101  double2 I1 = fetch_double2(spinorTexDouble, sid + 1 * sp_stride);
102  double2 I2 = fetch_double2(spinorTexDouble, sid + 2 * sp_stride);
103  double2 I3 = fetch_double2(spinorTexDouble, sid + 3 * sp_stride);
104  double2 I4 = fetch_double2(spinorTexDouble, sid + 4 * sp_stride);
105  double2 I5 = fetch_double2(spinorTexDouble, sid + 5 * sp_stride);
106  double2 I6 = fetch_double2(spinorTexDouble, sid + 6 * sp_stride);
107  double2 I7 = fetch_double2(spinorTexDouble, sid + 7 * sp_stride);
108  double2 I8 = fetch_double2(spinorTexDouble, sid + 8 * sp_stride);
109  double2 I9 = fetch_double2(spinorTexDouble, sid + 9 * sp_stride);
110  double2 I10 = fetch_double2(spinorTexDouble, sid + 10 * sp_stride);
111  double2 I11 = fetch_double2(spinorTexDouble, sid + 11 * sp_stride);
112 #endif
113 */
114 
115  volatile double2 tmp0, tmp1, tmp2, tmp3;
116 
117  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)
118 
119  //get the 1st color component:
120 
121  tmp0_re = o20_re;
122  tmp0_im = o20_im;
123 
124  tmp2_re = o00_re;
125  tmp2_im = o00_im;
126 
127  tmp1_re = o30_re;
128  tmp1_im = o30_im;
129 
130  tmp3_re = o10_re;
131  tmp3_im = o10_im;
132 
133  o00_re = tmp0_re;
134  o00_im = tmp0_im;
135  o10_re = tmp1_re;
136  o10_im = tmp1_im;
137  o20_re = tmp2_re;
138  o20_im = tmp2_im;
139  o30_re = tmp3_re;
140  o30_im = tmp3_im;
141 
142  //get the 2nd color component:
143 
144  tmp0_re = o21_re;
145  tmp0_im = o21_im;
146 
147  tmp2_re = o01_re;
148  tmp2_im = o01_im;
149 
150  tmp1_re = o31_re;
151  tmp1_im = o31_im;
152 
153  tmp3_re = o11_re;
154  tmp3_im = o11_im;
155 
156  o01_re = tmp0_re;
157  o01_im = tmp0_im;
158  o11_re = tmp1_re;
159  o11_im = tmp1_im;
160  o21_re = tmp2_re;
161  o21_im = tmp2_im;
162  o31_re = tmp3_re;
163  o31_im = tmp3_im;
164 
165  //get the 3d color component:
166 
167  tmp0_re = o22_re;
168  tmp0_im = o22_im;
169 
170  tmp2_re = o02_re;
171  tmp2_im = o02_im;
172 
173  tmp1_re = o32_re;
174  tmp1_im = o32_im;
175 
176  tmp3_re = o12_re;
177  tmp3_im = o12_im;
178 
179  o02_re = tmp0_re;
180  o02_im = tmp0_im;
181  o12_re = tmp1_re;
182  o12_im = tmp1_im;
183  o22_re = tmp2_re;
184  o22_im = tmp2_im;
185  o32_re = tmp3_re;
186  o32_im = tmp3_im;
187 /*
188  spinor[sid + 0 * myStride] = I0;
189  spinor[sid + 1 * myStride] = I1;
190  spinor[sid + 2 * myStride] = I2;
191  spinor[sid + 3 * myStride] = I3;
192  spinor[sid + 4 * myStride] = I4;
193  spinor[sid + 5 * myStride] = I5;
194  spinor[sid + 6 * myStride] = I6;
195  spinor[sid + 7 * myStride] = I7;
196  spinor[sid + 8 * myStride] = I8;
197  spinor[sid + 9 * myStride] = I9;
198  spinor[sid + 10 * myStride] = I10;
199  spinor[sid + 11 * myStride] = I11;
200 */
201 
202  WRITE_SPINOR(myStride);
203 
204  return;
205 }
206 #endif // (__CUDA_ARCH__ >= 130)
207 
208 #undef tmp0_re
209 #undef tmp0_im
210 #undef tmp1_re
211 #undef tmp1_im
212 #undef tmp2_re
213 #undef tmp2_im
214 #undef tmp3_re
215 #undef tmp3_im
216 
217 #undef SPINOR_HOP
218 #undef READ_SPINOR
219 #undef SPINORTEX
220 #undef WRITE_SPINOR
221 
222 #undef o00_re
223 #undef o00_im
224 #undef o01_re
225 #undef o01_im
226 #undef o02_re
227 #undef o02_im
228 #undef o10_re
229 #undef o10_im
230 #undef o11_re
231 #undef o11_im
232 #undef o12_re
233 #undef o12_im
234 #undef o20_re
235 #undef o20_im
236 #undef o21_re
237 #undef o21_im
238 #undef o22_re
239 #undef o22_im
240 #undef o30_re
241 #undef o30_im
242 #undef o31_re
243 #undef o31_im
244 #undef o32_re
245 #undef o32_im
246 
247 #define tmp0_re tmp0.x
248 #define tmp0_im tmp0.y
249 #define tmp1_re tmp0.z
250 #define tmp1_im tmp0.w
251 #define tmp2_re tmp1.x
252 #define tmp2_im tmp1.y
253 #define tmp3_re tmp1.z
254 #define tmp3_im tmp1.w
255 
256 #ifdef DIRECT_ACCESS_WILSON_SPINOR
257  #define READ_SPINOR READ_SPINOR_SINGLE
258  #define SPINORTEX in
259 #else
260  #define READ_SPINOR READ_SPINOR_SINGLE_TEX
261 
262  #ifdef USE_TEXTURE_OBJECTS
263  #define SPINORTEX param.inTex
264  #else
265  #define SPINORTEX spinorTexSingle
266  #endif // USE_TEXTURE_OBJECTS
267 #endif
268 
269 #define SPINOR_HOP 6
270 #define WRITE_SPINOR WRITE_SPINOR_FLOAT4_STR
271 
272 #define o00_re I0.x
273 #define o00_im I0.y
274 #define o01_re I0.z
275 #define o01_im I0.w
276 #define o02_re I1.x
277 #define o02_im I1.y
278 #define o10_re I1.z
279 #define o10_im I1.w
280 #define o11_re I2.x
281 #define o11_im I2.y
282 #define o12_re I2.z
283 #define o12_im I2.w
284 #define o20_re I3.x
285 #define o20_im I3.y
286 #define o21_re I3.z
287 #define o21_im I3.w
288 #define o22_re I4.x
289 #define o22_im I4.y
290 #define o30_re I4.z
291 #define o30_im I4.w
292 #define o31_re I5.x
293 #define o31_im I5.y
294 #define o32_re I5.z
295 #define o32_im I5.w
296 
297 __global__ void gamma5Kernel(float4 *out, float *outNorm, float4 *in, float *inNorm, DslashParam param, int myStride)
298 {
299  int sid = blockIdx.x*blockDim.x + threadIdx.x;
300  if (sid >= param.threads) return;
301 /*
302  // output spinor
303  VOLATILE float4 I0;
304  VOLATILE float4 I1;
305  VOLATILE float4 I2;
306  VOLATILE float4 I3;
307  VOLATILE float4 I4;
308  VOLATILE float4 I5;
309 /*
310 #if defined(FERMI_NO_DBLE_TEX) || defined (USE_TEXTURE_OBJECTS)
311  float4 I0 = spinor[sid + 0 * sp_stride];
312  float4 I1 = spinor[sid + 1 * sp_stride];
313  float4 I2 = spinor[sid + 2 * sp_stride];
314  float4 I3 = spinor[sid + 3 * sp_stride];
315  float4 I4 = spinor[sid + 4 * sp_stride];
316  float4 I5 = spinor[sid + 5 * sp_stride];
317 #else
318  float4 I0 = tex1Dfetch(spinorTexSingle, sid + 0 * myStride);
319  float4 I1 = tex1Dfetch(spinorTexSingle, sid + 1 * myStride);
320  float4 I2 = tex1Dfetch(spinorTexSingle, sid + 2 * myStride);
321  float4 I3 = tex1Dfetch(spinorTexSingle, sid + 3 * myStride);
322  float4 I4 = tex1Dfetch(spinorTexSingle, sid + 4 * myStride);
323  float4 I5 = tex1Dfetch(spinorTexSingle, sid + 5 * myStride);
324 #endif
325 */
326  READ_SPINOR (SPINORTEX, myStride, sid, sid);
327 
328  volatile float4 tmp0, tmp1;
329 
330  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)
331 
332  //get the 1st color component:(o00_rey, o10_rew, o20_rey, o30_rew)
333 
334  tmp0_re = o20_re;
335  tmp0_im = o20_im;
336 
337  tmp1_re = o30_re;
338  tmp1_im = o30_im;
339 
340  tmp2_re = o00_re;
341  tmp2_im = o00_im;
342 
343  tmp3_re = o10_re;
344  tmp3_im = o10_im;
345 
346  o00_re = tmp0_re;
347  o00_im = tmp0_im;
348  o10_re = tmp1_re;
349  o10_im = tmp1_im;
350  o20_re = tmp2_re;
351  o20_im = tmp2_im;
352  o30_re = tmp3_re;
353  o30_im = tmp3_im;
354 
355  //get the 2nd color component:(o01_rew, o11_rey, o21_rew, o31_rey)
356 
357  tmp0_re = o21_re;
358  tmp0_im = o21_im;
359 
360  tmp1_re = o31_re;
361  tmp1_im = o31_im;
362 
363  tmp2_re = o01_re;
364  tmp2_im = o01_im;
365 
366  tmp3_re = o11_re;
367  tmp3_im = o11_im;
368 
369  o01_re = tmp0_re;
370  o01_im = tmp0_im;
371  o11_re = tmp1_re;
372  o11_im = tmp1_im;
373  o21_re = tmp2_re;
374  o21_im = tmp2_im;
375  o31_re = tmp3_re;
376  o31_im = tmp3_im;
377 
378  //get the 3d color component:(o02_rey, o12_rew, o22_rey, o32_rew)
379 
380  tmp0_re = o22_re;
381  tmp0_im = o22_im;
382 
383  tmp1_re = o32_re;
384  tmp1_im = o32_im;
385 
386  tmp2_re = o02_re;
387  tmp2_im = o02_im;
388 
389  tmp3_re = o12_re;
390  tmp3_im = o12_im;
391 
392  o02_re = tmp0_re;
393  o02_im = tmp0_im;
394  o12_re = tmp1_re;
395  o12_im = tmp1_im;
396  o22_re = tmp2_re;
397  o22_im = tmp2_im;
398  o32_re = tmp3_re;
399  o32_im = tmp3_im;
400  /*
401  spinor[sid + 0 * myStride] = I0;
402  spinor[sid + 1 * myStride] = I1;
403  spinor[sid + 2 * myStride] = I2;
404  spinor[sid + 3 * myStride] = I3;
405  spinor[sid + 4 * myStride] = I4;
406  spinor[sid + 5 * myStride] = I5;
407 */
408 
409  WRITE_SPINOR(myStride);
410 
411  return;
412 }
413 
414 /*
415 __global__ void gamma5Kernel(short4* spinor, float *spinorNorm, DslashParam param, int myStride)
416 {
417  int sid = blockIdx.x*blockDim.x + threadIdx.x;
418  if (sid >= param.threads) return;
419 
420 #if defined(FERMI_NO_DBLE_TEX) || defined (USE_TEXTURE_OBJECTS)
421  float4 I0 = spinor[sid + 0 * sp_stride];
422  float4 I1 = spinor[sid + 1 * sp_stride];
423  float4 I2 = spinor[sid + 2 * sp_stride];
424  float4 I3 = spinor[sid + 3 * sp_stride];
425  float4 I4 = spinor[sid + 4 * sp_stride];
426  float4 I5 = spinor[sid + 5 * sp_stride];
427 #else
428  float4 I0 = tex1Dfetch(spinorTexHalf, sid + 0 * myStride);
429  float4 I1 = tex1Dfetch(spinorTexHalf, sid + 1 * myStride);
430  float4 I2 = tex1Dfetch(spinorTexHalf, sid + 2 * myStride);
431  float4 I3 = tex1Dfetch(spinorTexHalf, sid + 3 * myStride);
432  float4 I4 = tex1Dfetch(spinorTexHalf, sid + 4 * myStride);
433  float4 I5 = tex1Dfetch(spinorTexHalf, sid + 5 * myStride);
434 
435  float C = tex1Dfetch(spinorTexHalfNorm, sid);
436 #endif
437 
438  I0 = C * I0;
439  I1 = C * I1;
440  I2 = C * I2;
441  I3 = C * I3;
442  I4 = C * I4;
443  I5 = C * I5;
444 
445  volatile float4 tmp0, tmp1;
446 
447  //apply (1 + i*a*gamma_5) to the input spinor and then add to (b * output spinor)
448 
449  //get the 1st color component:(o00_rey, o10_rew, o20_rey, o30_rew)
450 
451  tmp0_re = o20_re;
452  tmp0_im = o20_im;
453 
454  tmp1_re = o30_re;
455  tmp1_im = o30_im;
456 
457  tmp2_re = o00_re;
458  tmp2_im = o00_im;
459 
460  tmp3_re = o10_re;
461  tmp3_im = o10_im;
462 
463  o00_re = tmp0_re;
464  o00_im = tmp0_im;
465  o10_re = tmp1_re;
466  o10_im = tmp1_im;
467  o20_re = tmp2_re;
468  o20_im = tmp2_im;
469  o30_re = tmp3_re;
470  o30_im = tmp3_im;
471 
472  //get the 2nd color component:(o01_rew, o11_rey, o21_rew, o31_rey)
473 
474  tmp0_re = o21_re;
475  tmp0_im = o21_im;
476 
477  tmp1_re = o31_re;
478  tmp1_im = o31_im;
479 
480  tmp2_re = o01_re;
481  tmp2_im = o01_im;
482 
483  tmp3_re = o11_re;
484  tmp3_im = o11_im;
485 
486  o01_re = tmp0_re;
487  o01_im = tmp0_im;
488  o11_re = tmp1_re;
489  o11_im = tmp1_im;
490  o21_re = tmp2_re;
491  o21_im = tmp2_im;
492  o31_re = tmp3_re;
493  o31_im = tmp3_im;
494 
495  //get the 3d color component:(o02_rey, o12_rew, o22_rey, o32_rew)
496 
497  tmp0_re = o22_re;
498  tmp0_im = o22_im;
499 
500  tmp1_re = o32_re;
501  tmp1_im = o32_im;
502 
503  tmp2_re = o02_re;
504  tmp2_im = o02_im;
505 
506  tmp3_re = o12_re;
507  tmp3_im = o12_im;
508 
509  o02_re = tmp0_re;
510  o02_im = tmp0_im;
511  o12_re = tmp1_re;
512  o12_im = tmp1_im;
513  o22_re = tmp2_re;
514  o22_im = tmp2_im;
515  o32_re = tmp3_re;
516  o32_im = tmp3_im;
517 
518 
519  float c0 = fmaxf(fabsf(o00_re), fabsf(o00_im));
520  float c1 = fmaxf(fabsf(o01_re), fabsf(o01_im));
521  float c2 = fmaxf(fabsf(o02_re), fabsf(o02_im));
522  float c3 = fmaxf(fabsf(o10_re), fabsf(o10_im));
523  float c4 = fmaxf(fabsf(o11_re), fabsf(o11_im));
524  float c5 = fmaxf(fabsf(o12_re), fabsf(o12_im));
525  float c6 = fmaxf(fabsf(o20_re), fabsf(o20_im));
526  float c7 = fmaxf(fabsf(o21_re), fabsf(o21_im));
527  float c8 = fmaxf(fabsf(o22_re), fabsf(o22_im));
528  float c9 = fmaxf(fabsf(o30_re), fabsf(o30_im));
529  float c10 = fmaxf(fabsf(o31_re), fabsf(o31_im));
530  float c11 = fmaxf(fabsf(o32_re), fabsf(o32_im));
531  c0 = fmaxf(c0, c1);
532  c1 = fmaxf(c2, c3);
533  c2 = fmaxf(c4, c5);
534  c3 = fmaxf(c6, c7);
535  c4 = fmaxf(c8, c9);
536  c5 = fmaxf(c10, c11);
537  c0 = fmaxf(c0, c1);
538  c1 = fmaxf(c2, c3);
539  c2 = fmaxf(c4, c5);
540  c0 = fmaxf(c0, c1);
541  c0 = fmaxf(c0, c2);
542  spinorNorm[sid] = c0;
543  float scale = __fdividef(MAX_SHORT, c0);
544 
545  I0 = scale * I0;
546  I1 = scale * I1;
547  I2 = scale * I2;
548  I3 = scale * I3;
549  I4 = scale * I4;
550  I5 = scale * I5;
551 
552  spinor[sid+0*(myStride)] = make_short4((short)o00_re, (short)o00_im, (short)o01_re, (short)o01_im);
553  spinor[sid+1*(myStride)] = make_short4((short)o02_re, (short)o02_im, (short)o10_re, (short)o10_im);
554  spinor[sid+2*(myStride)] = make_short4((short)o11_re, (short)o11_im, (short)o12_re, (short)o12_im);
555  spinor[sid+3*(myStride)] = make_short4((short)o20_re, (short)o20_im, (short)o21_re, (short)o21_im);
556  spinor[sid+4*(myStride)] = make_short4((short)o22_re, (short)o22_im, (short)o30_re, (short)o30_im);
557  spinor[sid+5*(myStride)] = make_short4((short)o31_re, (short)o31_im, (short)o32_re, (short)o32_im);
558 
559  return;
560 }
561 */
562 #undef tmp0_re
563 #undef tmp0_im
564 #undef tmp1_re
565 #undef tmp1_im
566 #undef tmp2_re
567 #undef tmp2_im
568 #undef tmp3_re
569 #undef tmp3_im
570 
571 #endif //_TWIST_QUDA_G5
572 
#define tmp1_im
Definition: gamma5.h:250
#define o20_im
Definition: gamma5.h:285
#define tmp1_re
Definition: gamma5.h:249
#define o02_re
Definition: gamma5.h:276
#define o32_im
Definition: gamma5.h:295
#define o22_im
Definition: gamma5.h:289
#define o11_re
Definition: gamma5.h:280
#define o01_im
Definition: gamma5.h:275
#define o21_re
Definition: gamma5.h:286
cudaColorSpinorField * tmp1
Definition: dslash_test.cpp:41
#define o20_re
Definition: gamma5.h:284
#define tmp0_im
Definition: gamma5.h:248
#define tmp3_re
Definition: gamma5.h:253
#define o10_re
Definition: gamma5.h:278
#define WRITE_SPINOR
Definition: gamma5.h:270
#define o31_im
Definition: gamma5.h:293
#define tmp3_im
Definition: gamma5.h:254
QudaGaugeParam param
Definition: pack_test.cpp:17
#define o12_re
Definition: gamma5.h:282
cudaColorSpinorField * tmp2
Definition: dslash_test.cpp:41
#define o30_re
Definition: gamma5.h:290
#define o12_im
Definition: gamma5.h:283
#define tmp2_im
Definition: gamma5.h:252
cpuColorSpinorField * in
#define o30_im
Definition: gamma5.h:291
#define o10_im
Definition: gamma5.h:279
#define SPINORTEX
Definition: gamma5.h:265
#define o22_re
Definition: gamma5.h:288
#define o32_re
Definition: gamma5.h:294
#define o21_im
Definition: gamma5.h:287
#define o00_im
Definition: gamma5.h:273
#define o31_re
Definition: gamma5.h:292
__global__ void gamma5Kernel(float4 *out, float *outNorm, float4 *in, float *inNorm, DslashParam param, int myStride)
Definition: gamma5.h:297
cpuColorSpinorField * out
#define READ_SPINOR
Definition: gamma5.h:260
#define tmp0_re
Definition: gamma5.h:247
#define o01_re
Definition: gamma5.h:274
#define o11_im
Definition: gamma5.h:281
#define tmp2_re
Definition: gamma5.h:251
#define o02_im
Definition: gamma5.h:277
#define o00_re
Definition: gamma5.h:272