QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_dslash5_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #define m5 m5_d
40 #define mdwf_b5 mdwf_b5_d
41 #define mdwf_c5 mdwf_c5_d
42 #else
43 #define spinorFloat float
44 #define i00_re I0.x
45 #define i00_im I0.y
46 #define i01_re I0.z
47 #define i01_im I0.w
48 #define i02_re I1.x
49 #define i02_im I1.y
50 #define i10_re I1.z
51 #define i10_im I1.w
52 #define i11_re I2.x
53 #define i11_im I2.y
54 #define i12_re I2.z
55 #define i12_im I2.w
56 #define i20_re I3.x
57 #define i20_im I3.y
58 #define i21_re I3.z
59 #define i21_im I3.w
60 #define i22_re I4.x
61 #define i22_im I4.y
62 #define i30_re I4.z
63 #define i30_im I4.w
64 #define i31_re I5.x
65 #define i31_im I5.y
66 #define i32_re I5.z
67 #define i32_im I5.w
68 #define m5 m5_f
69 #define mdwf_b5 mdwf_b5_f
70 #define mdwf_c5 mdwf_c5_f
71 #endif // SPINOR_DOUBLE
72 
73 // output spinor
98 
99 #ifdef SPINOR_DOUBLE
100 #if (__COMPUTE_CAPABILITY__ >= 200)
101 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
102 #else
103 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
104 #endif
105 #else
106 #if (__COMPUTE_CAPABILITY__ >= 200)
107 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
108 #else
109 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
110 #endif
111 #endif
112 #include "io_spinor.h"
113 
114 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
115 if (sid >= param.threads*param.Ls) return;
116 
118 
119 int X, xs;
120 
121 // Inline by hand for the moment and assume even dimensions
122 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);
123 
124 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h);
125 
126 X = 2*sid + (boundaryCrossing + param.parity) % 2;
127 xs = X/(X1*X2*X3*X4);
128 
129  o00_re = 0; o00_im = 0;
130  o01_re = 0; o01_im = 0;
131  o02_re = 0; o02_im = 0;
132  o10_re = 0; o10_im = 0;
133  o11_re = 0; o11_im = 0;
134  o12_re = 0; o12_im = 0;
135  o20_re = 0; o20_im = 0;
136  o21_re = 0; o21_im = 0;
137  o22_re = 0; o22_im = 0;
138  o30_re = 0; o30_im = 0;
139  o31_re = 0; o31_im = 0;
140  o32_re = 0; o32_im = 0;
141 
142 
143 // 5th dimension -- NB: not partitionable!
144 {
145 // 2 P_L = 2 P_- = ( ( +1, -1 ), ( -1, +1 ) )
146  {
147  int sp_idx = ( xs == 0 ? X+(param.Ls-1)*2*Vh : X-2*Vh ) / 2;
148 
149 // read spinor from device memory
150  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
151 
152  if ( xs != 0 )
153  {
157 
161 
165 
169  }
170  else
171  {
172  o00_re += -mferm*(+i00_re-i20_re); o00_im += -mferm*(+i00_im-i20_im);
173  o01_re += -mferm*(+i01_re-i21_re); o01_im += -mferm*(+i01_im-i21_im);
174  o02_re += -mferm*(+i02_re-i22_re); o02_im += -mferm*(+i02_im-i22_im);
175 
176  o10_re += -mferm*(+i10_re-i30_re); o10_im += -mferm*(+i10_im-i30_im);
177  o11_re += -mferm*(+i11_re-i31_re); o11_im += -mferm*(+i11_im-i31_im);
178  o12_re += -mferm*(+i12_re-i32_re); o12_im += -mferm*(+i12_im-i32_im);
179 
180  o20_re += -mferm*(-i00_re+i20_re); o20_im += -mferm*(-i00_im+i20_im);
181  o21_re += -mferm*(-i01_re+i21_re); o21_im += -mferm*(-i01_im+i21_im);
182  o22_re += -mferm*(-i02_re+i22_re); o22_im += -mferm*(-i02_im+i22_im);
183 
184  o30_re += -mferm*(-i10_re+i30_re); o30_im += -mferm*(-i10_im+i30_im);
185  o31_re += -mferm*(-i11_re+i31_re); o31_im += -mferm*(-i11_im+i31_im);
186  o32_re += -mferm*(-i12_re+i32_re); o32_im += -mferm*(-i12_im+i32_im);
187  } // end if ( xs != 0 )
188  } // end P_L
189 
190  // 2 P_R = 2 P_+ = ( ( +1, +1 ), ( +1, +1 ) )
191  {
192  int sp_idx = ( xs == param.Ls-1 ? X-(param.Ls-1)*2*Vh : X+2*Vh ) / 2;
193 
194 // read spinor from device memory
195  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
196 
197  if ( xs != param.Ls-1 )
198  {
202 
206 
210 
214  }
215  else
216  {
217  o00_re += -mferm*(+i00_re+i20_re); o00_im += -mferm*(+i00_im+i20_im);
218  o01_re += -mferm*(+i01_re+i21_re); o01_im += -mferm*(+i01_im+i21_im);
219  o02_re += -mferm*(+i02_re+i22_re); o02_im += -mferm*(+i02_im+i22_im);
220 
221  o10_re += -mferm*(+i10_re+i30_re); o10_im += -mferm*(+i10_im+i30_im);
222  o11_re += -mferm*(+i11_re+i31_re); o11_im += -mferm*(+i11_im+i31_im);
223  o12_re += -mferm*(+i12_re+i32_re); o12_im += -mferm*(+i12_im+i32_im);
224 
225  o20_re += -mferm*(+i00_re+i20_re); o20_im += -mferm*(+i00_im+i20_im);
226  o21_re += -mferm*(+i01_re+i21_re); o21_im += -mferm*(+i01_im+i21_im);
227  o22_re += -mferm*(+i02_re+i22_re); o22_im += -mferm*(+i02_im+i22_im);
228 
229  o30_re += -mferm*(+i10_re+i30_re); o30_im += -mferm*(+i10_im+i30_im);
230  o31_re += -mferm*(+i11_re+i31_re); o31_im += -mferm*(+i11_im+i31_im);
231  o32_re += -mferm*(+i12_re+i32_re); o32_im += -mferm*(+i12_im+i32_im);
232  } // end if ( xs != param.Ls-1 )
233  } // end P_R
234 
235  // MDWF Dslash_5 operator is given as follow
236  // Dslash4pre = [c_5(s)(P_+\delta_{s,s`+1} - mP_+\delta_{s,0}\delta_{s`,L_s-1}
237  // + P_-\delta_{s,s`-1}-mP_-\delta_{s,L_s-1}\delta_{s`,0})
238  // + b_5(s)\delta_{s,s`}]\delta_{x,x`}
239  // For Dslash4pre
240  // C_5 \equiv c_5(s)*0.5
241  // B_5 \equiv b_5(s)
242  // For Dslash5
243  // C_5 \equiv 0.5*{c_5(s)(4+M_5)-1}/{b_5(s)(4+M_5)+1}
244  // B_5 \equiv 1.0
245 #ifdef MDWF_mode // Check whether MDWF option is enabled
246 #if (MDWF_mode==1)
247  VOLATILE spinorFloat C_5;
248  VOLATILE spinorFloat B_5;
249  C_5 = (spinorFloat)mdwf_c5[xs]*0.5;
250  B_5 = (spinorFloat)mdwf_b5[xs];
251 
252  READ_SPINOR( SPINORTEX, param.sp_stride, X/2, X/2 );
253  o00_re = C_5*o00_re + B_5*i00_re;
254  o00_im = C_5*o00_im + B_5*i00_im;
255  o01_re = C_5*o01_re + B_5*i01_re;
256  o01_im = C_5*o01_im + B_5*i01_im;
257  o02_re = C_5*o02_re + B_5*i02_re;
258  o02_im = C_5*o02_im + B_5*i02_im;
259  o10_re = C_5*o10_re + B_5*i10_re;
260  o10_im = C_5*o10_im + B_5*i10_im;
261  o11_re = C_5*o11_re + B_5*i11_re;
262  o11_im = C_5*o11_im + B_5*i11_im;
263  o12_re = C_5*o12_re + B_5*i12_re;
264  o12_im = C_5*o12_im + B_5*i12_im;
265  o20_re = C_5*o20_re + B_5*i20_re;
266  o20_im = C_5*o20_im + B_5*i20_im;
267  o21_re = C_5*o21_re + B_5*i21_re;
268  o21_im = C_5*o21_im + B_5*i21_im;
269  o22_re = C_5*o22_re + B_5*i22_re;
270  o22_im = C_5*o22_im + B_5*i22_im;
271  o30_re = C_5*o30_re + B_5*i30_re;
272  o30_im = C_5*o30_im + B_5*i30_im;
273  o31_re = C_5*o31_re + B_5*i31_re;
274  o31_im = C_5*o31_im + B_5*i31_im;
275  o32_re = C_5*o32_re + B_5*i32_re;
276  o32_im = C_5*o32_im + B_5*i32_im;
277 #elif (MDWF_mode==2)
278  VOLATILE spinorFloat C_5;
279  C_5 = (spinorFloat)(0.5*(mdwf_c5[xs]*(m5+4.0) - 1.0)/(mdwf_b5[xs]*(m5+4.0) + 1.0));
280 
281  READ_SPINOR( SPINORTEX, param.sp_stride, X/2, X/2 );
282  o00_re = C_5*o00_re + i00_re;
283  o00_im = C_5*o00_im + i00_im;
284  o01_re = C_5*o01_re + i01_re;
285  o01_im = C_5*o01_im + i01_im;
286  o02_re = C_5*o02_re + i02_re;
287  o02_im = C_5*o02_im + i02_im;
288  o10_re = C_5*o10_re + i10_re;
289  o10_im = C_5*o10_im + i10_im;
290  o11_re = C_5*o11_re + i11_re;
291  o11_im = C_5*o11_im + i11_im;
292  o12_re = C_5*o12_re + i12_re;
293  o12_im = C_5*o12_im + i12_im;
294  o20_re = C_5*o20_re + i20_re;
295  o20_im = C_5*o20_im + i20_im;
296  o21_re = C_5*o21_re + i21_re;
297  o21_im = C_5*o21_im + i21_im;
298  o22_re = C_5*o22_re + i22_re;
299  o22_im = C_5*o22_im + i22_im;
300  o30_re = C_5*o30_re + i30_re;
301  o30_im = C_5*o30_im + i30_im;
302  o31_re = C_5*o31_re + i31_re;
303  o31_im = C_5*o31_im + i31_im;
304  o32_re = C_5*o32_re + i32_re;
305  o32_im = C_5*o32_im + i32_im;
306 #endif // select MDWF mode
307 #endif // check MDWF on/off
308 } // end 5th dimension
309 
310 {
311 
312 #ifdef DSLASH_XPAY
313  READ_ACCUM(ACCUMTEX, param.sp_stride)
315 
316 #ifdef MDWF_mode
317  coeff = (spinorFloat)(0.5/(mdwf_b5[xs]*(m5+4.0) + 1.0));
318  coeff *= -coeff;
319 #else
320  coeff = a;
321 #endif
322 
323 #ifdef YPAX
324 #ifdef SPINOR_DOUBLE
325  o00_re = o00_re + coeff*accum0.x;
326  o00_im = o00_im + coeff*accum0.y;
327  o01_re = o01_re + coeff*accum1.x;
328  o01_im = o01_im + coeff*accum1.y;
329  o02_re = o02_re + coeff*accum2.x;
330  o02_im = o02_im + coeff*accum2.y;
331  o10_re = o10_re + coeff*accum3.x;
332  o10_im = o10_im + coeff*accum3.y;
333  o11_re = o11_re + coeff*accum4.x;
334  o11_im = o11_im + coeff*accum4.y;
335  o12_re = o12_re + coeff*accum5.x;
336  o12_im = o12_im + coeff*accum5.y;
337  o20_re = o20_re + coeff*accum6.x;
338  o20_im = o20_im + coeff*accum6.y;
339  o21_re = o21_re + coeff*accum7.x;
340  o21_im = o21_im + coeff*accum7.y;
341  o22_re = o22_re + coeff*accum8.x;
342  o22_im = o22_im + coeff*accum8.y;
343  o30_re = o30_re + coeff*accum9.x;
344  o30_im = o30_im + coeff*accum9.y;
345  o31_re = o31_re + coeff*accum10.x;
346  o31_im = o31_im + coeff*accum10.y;
347  o32_re = o32_re + coeff*accum11.x;
348  o32_im = o32_im + coeff*accum11.y;
349 #else
350  o00_re = o00_re + coeff*accum0.x;
351  o00_im = o00_im + coeff*accum0.y;
352  o01_re = o01_re + coeff*accum0.z;
353  o01_im = o01_im + coeff*accum0.w;
354  o02_re = o02_re + coeff*accum1.x;
355  o02_im = o02_im + coeff*accum1.y;
356  o10_re = o10_re + coeff*accum1.z;
357  o10_im = o10_im + coeff*accum1.w;
358  o11_re = o11_re + coeff*accum2.x;
359  o11_im = o11_im + coeff*accum2.y;
360  o12_re = o12_re + coeff*accum2.z;
361  o12_im = o12_im + coeff*accum2.w;
362  o20_re = o20_re + coeff*accum3.x;
363  o20_im = o20_im + coeff*accum3.y;
364  o21_re = o21_re + coeff*accum3.z;
365  o21_im = o21_im + coeff*accum3.w;
366  o22_re = o22_re + coeff*accum4.x;
367  o22_im = o22_im + coeff*accum4.y;
368  o30_re = o30_re + coeff*accum4.z;
369  o30_im = o30_im + coeff*accum4.w;
370  o31_re = o31_re + coeff*accum5.x;
371  o31_im = o31_im + coeff*accum5.y;
372  o32_re = o32_re + coeff*accum5.z;
373  o32_im = o32_im + coeff*accum5.w;
374 #endif // SPINOR_DOUBLE
375 #else
376 #ifdef SPINOR_DOUBLE
377  o00_re = coeff*o00_re + accum0.x;
378  o00_im = coeff*o00_im + accum0.y;
379  o01_re = coeff*o01_re + accum1.x;
380  o01_im = coeff*o01_im + accum1.y;
381  o02_re = coeff*o02_re + accum2.x;
382  o02_im = coeff*o02_im + accum2.y;
383  o10_re = coeff*o10_re + accum3.x;
384  o10_im = coeff*o10_im + accum3.y;
385  o11_re = coeff*o11_re + accum4.x;
386  o11_im = coeff*o11_im + accum4.y;
387  o12_re = coeff*o12_re + accum5.x;
388  o12_im = coeff*o12_im + accum5.y;
389  o20_re = coeff*o20_re + accum6.x;
390  o20_im = coeff*o20_im + accum6.y;
391  o21_re = coeff*o21_re + accum7.x;
392  o21_im = coeff*o21_im + accum7.y;
393  o22_re = coeff*o22_re + accum8.x;
394  o22_im = coeff*o22_im + accum8.y;
395  o30_re = coeff*o30_re + accum9.x;
396  o30_im = coeff*o30_im + accum9.y;
397  o31_re = coeff*o31_re + accum10.x;
398  o31_im = coeff*o31_im + accum10.y;
399  o32_re = coeff*o32_re + accum11.x;
400  o32_im = coeff*o32_im + accum11.y;
401 #else
402  o00_re = coeff*o00_re + accum0.x;
403  o00_im = coeff*o00_im + accum0.y;
404  o01_re = coeff*o01_re + accum0.z;
405  o01_im = coeff*o01_im + accum0.w;
406  o02_re = coeff*o02_re + accum1.x;
407  o02_im = coeff*o02_im + accum1.y;
408  o10_re = coeff*o10_re + accum1.z;
409  o10_im = coeff*o10_im + accum1.w;
410  o11_re = coeff*o11_re + accum2.x;
411  o11_im = coeff*o11_im + accum2.y;
412  o12_re = coeff*o12_re + accum2.z;
413  o12_im = coeff*o12_im + accum2.w;
414  o20_re = coeff*o20_re + accum3.x;
415  o20_im = coeff*o20_im + accum3.y;
416  o21_re = coeff*o21_re + accum3.z;
417  o21_im = coeff*o21_im + accum3.w;
418  o22_re = coeff*o22_re + accum4.x;
419  o22_im = coeff*o22_im + accum4.y;
420  o30_re = coeff*o30_re + accum4.z;
421  o30_im = coeff*o30_im + accum4.w;
422  o31_re = coeff*o31_re + accum5.x;
423  o31_im = coeff*o31_im + accum5.y;
424  o32_re = coeff*o32_re + accum5.z;
425  o32_im = coeff*o32_im + accum5.w;
426 #endif // SPINOR_DOUBLE
427 #endif // YPAX
428 #endif // DSLASH_XPAY
429 }
430 
431 // write spinor field back to device memory
432 WRITE_SPINOR(param.sp_stride);
433 
434 // undefine to prevent warning when precision is changed
435 #undef m5
436 #undef mdwf_b5
437 #undef mdwf_c5
438 #undef spinorFloat
439 #undef SHARED_STRIDE
440 
441 #undef i00_re
442 #undef i00_im
443 #undef i01_re
444 #undef i01_im
445 #undef i02_re
446 #undef i02_im
447 #undef i10_re
448 #undef i10_im
449 #undef i11_re
450 #undef i11_im
451 #undef i12_re
452 #undef i12_im
453 #undef i20_re
454 #undef i20_im
455 #undef i21_re
456 #undef i21_im
457 #undef i22_re
458 #undef i22_im
459 #undef i30_re
460 #undef i30_im
461 #undef i31_re
462 #undef i31_im
463 #undef i32_re
464 #undef i32_im
465 
466 
467 
468 #undef VOLATILE
VOLATILE spinorFloat o20_re
__constant__ int Vh
#define i11_im
#define i22_im
__constant__ int X1h
#define i20_re
__constant__ int X2
VOLATILE spinorFloat o11_re
#define i20_im
VOLATILE spinorFloat o11_im
#define i12_re
#define i30_im
VOLATILE spinorFloat o10_re
#define i31_re
int boundaryCrossing
#define i10_re
__constant__ int X1
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o32_im
int sp_idx
VOLATILE spinorFloat o01_re
#define i01_re
VOLATILE spinorFloat o12_im
#define i31_im
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o22_re
#define mdwf_c5
#define m5
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i32_im
#define i12_im
VOLATILE spinorFloat o20_im
__constant__ double coeff
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o02_re
#define i32_re
#define i01_im
#define SPINORTEX
Definition: clover_def.h:40
#define i02_re
VOLATILE spinorFloat o21_re
#define VOLATILE
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o00_im
#define i02_im
#define i21_im
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o01_im
#define i22_re
VOLATILE spinorFloat o30_re
__constant__ int X3
#define i10_im
VOLATILE spinorFloat o10_im
#define i00_im
#define mdwf_b5
VOLATILE spinorFloat o02_im
#define spinorFloat
#define WRITE_SPINOR
Definition: clover_def.h:48
#define i30_re
#define READ_SPINOR
Definition: clover_def.h:36
VOLATILE spinorFloat o32_re
#define i00_re
#define i11_re
VOLATILE spinorFloat o30_im
#define i21_re
VOLATILE spinorFloat o22_im
__constant__ int X4