QUDA  0.9.0
dw_dslash5_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #if (CUDA_VERSION >= 4010)
7 #define VOLATILE
8 #else
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 // workaround for C++11 bug in CUDA 6.5/7.0
15 #if CUDA_VERSION >= 6050 && CUDA_VERSION < 7050
16 #define POW(a, b) pow(a, static_cast<spinorFloat>(b))
17 #else
18 #define POW(a, b) pow(a, b)
19 #endif
20 
21 #define i00_re I0.x
22 #define i00_im I0.y
23 #define i01_re I1.x
24 #define i01_im I1.y
25 #define i02_re I2.x
26 #define i02_im I2.y
27 #define i10_re I3.x
28 #define i10_im I3.y
29 #define i11_re I4.x
30 #define i11_im I4.y
31 #define i12_re I5.x
32 #define i12_im I5.y
33 #define i20_re I6.x
34 #define i20_im I6.y
35 #define i21_re I7.x
36 #define i21_im I7.y
37 #define i22_re I8.x
38 #define i22_im I8.y
39 #define i30_re I9.x
40 #define i30_im I9.y
41 #define i31_re I10.x
42 #define i31_im I10.y
43 #define i32_re I11.x
44 #define i32_im I11.y
45 #define m5 param.m5_d
46 #define mdwf_b5 param.mdwf_b5_d
47 #define mdwf_c5 param.mdwf_c5_d
48 #define mferm param.mferm
49 #define a param.a
50 #define b param.b
51 #else
52 #define spinorFloat float
53 #define POW(a, b) __fast_pow(a, b)
54 #define i00_re I0.x
55 #define i00_im I0.y
56 #define i01_re I0.z
57 #define i01_im I0.w
58 #define i02_re I1.x
59 #define i02_im I1.y
60 #define i10_re I1.z
61 #define i10_im I1.w
62 #define i11_re I2.x
63 #define i11_im I2.y
64 #define i12_re I2.z
65 #define i12_im I2.w
66 #define i20_re I3.x
67 #define i20_im I3.y
68 #define i21_re I3.z
69 #define i21_im I3.w
70 #define i22_re I4.x
71 #define i22_im I4.y
72 #define i30_re I4.z
73 #define i30_im I4.w
74 #define i31_re I5.x
75 #define i31_im I5.y
76 #define i32_re I5.z
77 #define i32_im I5.w
78 #define m5 param.m5_f
79 #define mdwf_b5 param.mdwf_b5_f
80 #define mdwf_c5 param.mdwf_c5_f
81 #define mferm param.mferm_f
82 #define a param.a
83 #define b param.b
84 #endif // SPINOR_DOUBLE
85 
86 // output spinor
111 
112 #ifdef SPINOR_DOUBLE
113 #if (__COMPUTE_CAPABILITY__ >= 200)
114 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
115 #else
116 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
117 #endif
118 #else
119 #if (__COMPUTE_CAPABILITY__ >= 200)
120 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
121 #else
122 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
123 #endif
124 #endif
125 #include "io_spinor.h"
126 
127 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
128 if (sid >= param.threads*param.dc.Ls) return;
129 
130 
132 
133 
134 
135 boundaryCrossing = sid/param.dc.Xh[0] + sid/(param.dc.X[1]*param.dc.Xh[0]) + sid/(param.dc.X[2]*param.dc.X[1]*param.dc.Xh[0]);
136 
137 X = 2*sid + (boundaryCrossing + param.parity) % 2;
138 coord[4] = X/(param.dc.X[0]*param.dc.X[1]*param.dc.X[2]*param.dc.X[3]);
139 
140  o00_re = 0; o00_im = 0;
141  o01_re = 0; o01_im = 0;
142  o02_re = 0; o02_im = 0;
143  o10_re = 0; o10_im = 0;
144  o11_re = 0; o11_im = 0;
145  o12_re = 0; o12_im = 0;
146  o20_re = 0; o20_im = 0;
147  o21_re = 0; o21_im = 0;
148  o22_re = 0; o22_im = 0;
149  o30_re = 0; o30_im = 0;
150  o31_re = 0; o31_im = 0;
151  o32_re = 0; o32_im = 0;
152 
153 
154 // 5th dimension -- NB: not partitionable!
155 {
156 // 2 P_L = 2 P_- = ( ( +1, -1 ), ( -1, +1 ) )
157  {
158  int sp_idx = ( coord[4] == param.dc.Ls-1 ? X-(param.dc.Ls-1)*2*param.dc.volume_4d_cb : X+2*param.dc.volume_4d_cb ) / 2;
159 
160 // read spinor from device memory
161  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
162 
163  if ( coord[4] != param.dc.Ls-1 )
164  {
168 
172 
176 
180  }
181  else
182  {
186 
190 
194 
198  } // end if ( coord[4] != param.dc.Ls-1 )
199  } // end P_L
200 
201  // 2 P_R = 2 P_+ = ( ( +1, +1 ), ( +1, +1 ) )
202  {
203  int sp_idx = ( coord[4] == 0 ? X+(param.dc.Ls-1)*2*param.dc.volume_4d_cb : X-2*param.dc.volume_4d_cb ) / 2;
204 
205 // read spinor from device memory
206  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
207 
208  if ( coord[4] != 0 )
209  {
213 
217 
221 
225  }
226  else
227  {
231 
235 
239 
243  } // end if ( coord[4] != 0 )
244  } // end P_R
245 
246  // MDWF Dslash_5 operator is given as follow
247  // Dslash4pre = [c_5(s)(P_+\delta_{s,s`+1} - mP_+\delta_{s,0}\delta_{s`,L_s-1}
248  // + P_-\delta_{s,s`-1}-mP_-\delta_{s,L_s-1}\delta_{s`,0})
249  // + b_5(s)\delta_{s,s`}]\delta_{x,x`}
250  // For Dslash4pre
251  // C_5 \equiv c_5(s)*0.5
252  // B_5 \equiv b_5(s)
253  // For Dslash5
254  // C_5 \equiv 0.5*{c_5(s)(4+M_5)-1}/{b_5(s)(4+M_5)+1}
255  // B_5 \equiv 1.0
256 #ifdef MDWF_mode // Check whether MDWF option is enabled
257 #if (MDWF_mode==1)
258  VOLATILE spinorFloat C_5;
259  VOLATILE spinorFloat B_5;
260  C_5 = mdwf_c5[ coord[4] ]*static_cast<spinorFloat>(0.5);
261  B_5 = mdwf_b5[ coord[4] ];
262 
263  READ_SPINOR( SPINORTEX, param.sp_stride, X/2, X/2 );
264  o00_re = C_5*o00_re + B_5*i00_re;
265  o00_im = C_5*o00_im + B_5*i00_im;
266  o01_re = C_5*o01_re + B_5*i01_re;
267  o01_im = C_5*o01_im + B_5*i01_im;
268  o02_re = C_5*o02_re + B_5*i02_re;
269  o02_im = C_5*o02_im + B_5*i02_im;
270  o10_re = C_5*o10_re + B_5*i10_re;
271  o10_im = C_5*o10_im + B_5*i10_im;
272  o11_re = C_5*o11_re + B_5*i11_re;
273  o11_im = C_5*o11_im + B_5*i11_im;
274  o12_re = C_5*o12_re + B_5*i12_re;
275  o12_im = C_5*o12_im + B_5*i12_im;
276  o20_re = C_5*o20_re + B_5*i20_re;
277  o20_im = C_5*o20_im + B_5*i20_im;
278  o21_re = C_5*o21_re + B_5*i21_re;
279  o21_im = C_5*o21_im + B_5*i21_im;
280  o22_re = C_5*o22_re + B_5*i22_re;
281  o22_im = C_5*o22_im + B_5*i22_im;
282  o30_re = C_5*o30_re + B_5*i30_re;
283  o30_im = C_5*o30_im + B_5*i30_im;
284  o31_re = C_5*o31_re + B_5*i31_re;
285  o31_im = C_5*o31_im + B_5*i31_im;
286  o32_re = C_5*o32_re + B_5*i32_re;
287  o32_im = C_5*o32_im + B_5*i32_im;
288 #elif (MDWF_mode==2)
289  VOLATILE spinorFloat C_5;
290  C_5 = static_cast<spinorFloat>(0.5)*(mdwf_c5[ coord[4] ]*(m5+static_cast<spinorFloat>(4.0)) - static_cast<spinorFloat>(1.0))/(mdwf_b5[ coord[4] ]*(m5+static_cast<spinorFloat>(4.0)) + static_cast<spinorFloat>(1.0));
291 
292  READ_SPINOR( SPINORTEX, param.sp_stride, X/2, X/2 );
293  o00_re = C_5*o00_re + i00_re;
294  o00_im = C_5*o00_im + i00_im;
295  o01_re = C_5*o01_re + i01_re;
296  o01_im = C_5*o01_im + i01_im;
297  o02_re = C_5*o02_re + i02_re;
298  o02_im = C_5*o02_im + i02_im;
299  o10_re = C_5*o10_re + i10_re;
300  o10_im = C_5*o10_im + i10_im;
301  o11_re = C_5*o11_re + i11_re;
302  o11_im = C_5*o11_im + i11_im;
303  o12_re = C_5*o12_re + i12_re;
304  o12_im = C_5*o12_im + i12_im;
305  o20_re = C_5*o20_re + i20_re;
306  o20_im = C_5*o20_im + i20_im;
307  o21_re = C_5*o21_re + i21_re;
308  o21_im = C_5*o21_im + i21_im;
309  o22_re = C_5*o22_re + i22_re;
310  o22_im = C_5*o22_im + i22_im;
311  o30_re = C_5*o30_re + i30_re;
312  o30_im = C_5*o30_im + i30_im;
313  o31_re = C_5*o31_re + i31_re;
314  o31_im = C_5*o31_im + i31_im;
315  o32_re = C_5*o32_re + i32_re;
316  o32_im = C_5*o32_im + i32_im;
317 #endif // select MDWF mode
318 #endif // check MDWF on/off
319 } // end 5th dimension
320 
321 {
322 
323 #ifdef DSLASH_XPAY
324  READ_ACCUM(ACCUMTEX, param.sp_stride)
326 
327 #ifdef MDWF_mode
328  coeff = static_cast<spinorFloat>(0.5)/(mdwf_b5[coord[4]]*(m5+static_cast<spinorFloat>(4.0)) + static_cast<spinorFloat>(1.0));
329  coeff *= coeff;
330  coeff *= a;
331 #else
332  coeff = a;
333 #endif
334 
335 #ifdef YPAX
336 #ifdef SPINOR_DOUBLE
337  o00_re = o00_re + coeff*accum0.x;
338  o00_im = o00_im + coeff*accum0.y;
339  o01_re = o01_re + coeff*accum1.x;
340  o01_im = o01_im + coeff*accum1.y;
341  o02_re = o02_re + coeff*accum2.x;
342  o02_im = o02_im + coeff*accum2.y;
343  o10_re = o10_re + coeff*accum3.x;
344  o10_im = o10_im + coeff*accum3.y;
345  o11_re = o11_re + coeff*accum4.x;
346  o11_im = o11_im + coeff*accum4.y;
347  o12_re = o12_re + coeff*accum5.x;
348  o12_im = o12_im + coeff*accum5.y;
349  o20_re = o20_re + coeff*accum6.x;
350  o20_im = o20_im + coeff*accum6.y;
351  o21_re = o21_re + coeff*accum7.x;
352  o21_im = o21_im + coeff*accum7.y;
353  o22_re = o22_re + coeff*accum8.x;
354  o22_im = o22_im + coeff*accum8.y;
355  o30_re = o30_re + coeff*accum9.x;
356  o30_im = o30_im + coeff*accum9.y;
357  o31_re = o31_re + coeff*accum10.x;
358  o31_im = o31_im + coeff*accum10.y;
359  o32_re = o32_re + coeff*accum11.x;
360  o32_im = o32_im + coeff*accum11.y;
361 #else
362  o00_re = o00_re + coeff*accum0.x;
363  o00_im = o00_im + coeff*accum0.y;
364  o01_re = o01_re + coeff*accum0.z;
365  o01_im = o01_im + coeff*accum0.w;
366  o02_re = o02_re + coeff*accum1.x;
367  o02_im = o02_im + coeff*accum1.y;
368  o10_re = o10_re + coeff*accum1.z;
369  o10_im = o10_im + coeff*accum1.w;
370  o11_re = o11_re + coeff*accum2.x;
371  o11_im = o11_im + coeff*accum2.y;
372  o12_re = o12_re + coeff*accum2.z;
373  o12_im = o12_im + coeff*accum2.w;
374  o20_re = o20_re + coeff*accum3.x;
375  o20_im = o20_im + coeff*accum3.y;
376  o21_re = o21_re + coeff*accum3.z;
377  o21_im = o21_im + coeff*accum3.w;
378  o22_re = o22_re + coeff*accum4.x;
379  o22_im = o22_im + coeff*accum4.y;
380  o30_re = o30_re + coeff*accum4.z;
381  o30_im = o30_im + coeff*accum4.w;
382  o31_re = o31_re + coeff*accum5.x;
383  o31_im = o31_im + coeff*accum5.y;
384  o32_re = o32_re + coeff*accum5.z;
385  o32_im = o32_im + coeff*accum5.w;
386 #endif // SPINOR_DOUBLE
387 #else
388 #ifdef SPINOR_DOUBLE
389  o00_re = coeff*o00_re + accum0.x;
390  o00_im = coeff*o00_im + accum0.y;
391  o01_re = coeff*o01_re + accum1.x;
392  o01_im = coeff*o01_im + accum1.y;
393  o02_re = coeff*o02_re + accum2.x;
394  o02_im = coeff*o02_im + accum2.y;
395  o10_re = coeff*o10_re + accum3.x;
396  o10_im = coeff*o10_im + accum3.y;
397  o11_re = coeff*o11_re + accum4.x;
398  o11_im = coeff*o11_im + accum4.y;
399  o12_re = coeff*o12_re + accum5.x;
400  o12_im = coeff*o12_im + accum5.y;
401  o20_re = coeff*o20_re + accum6.x;
402  o20_im = coeff*o20_im + accum6.y;
403  o21_re = coeff*o21_re + accum7.x;
404  o21_im = coeff*o21_im + accum7.y;
405  o22_re = coeff*o22_re + accum8.x;
406  o22_im = coeff*o22_im + accum8.y;
407  o30_re = coeff*o30_re + accum9.x;
408  o30_im = coeff*o30_im + accum9.y;
409  o31_re = coeff*o31_re + accum10.x;
410  o31_im = coeff*o31_im + accum10.y;
411  o32_re = coeff*o32_re + accum11.x;
412  o32_im = coeff*o32_im + accum11.y;
413 #else
414  o00_re = coeff*o00_re + accum0.x;
415  o00_im = coeff*o00_im + accum0.y;
416  o01_re = coeff*o01_re + accum0.z;
417  o01_im = coeff*o01_im + accum0.w;
418  o02_re = coeff*o02_re + accum1.x;
419  o02_im = coeff*o02_im + accum1.y;
420  o10_re = coeff*o10_re + accum1.z;
421  o10_im = coeff*o10_im + accum1.w;
422  o11_re = coeff*o11_re + accum2.x;
423  o11_im = coeff*o11_im + accum2.y;
424  o12_re = coeff*o12_re + accum2.z;
425  o12_im = coeff*o12_im + accum2.w;
426  o20_re = coeff*o20_re + accum3.x;
427  o20_im = coeff*o20_im + accum3.y;
428  o21_re = coeff*o21_re + accum3.z;
429  o21_im = coeff*o21_im + accum3.w;
430  o22_re = coeff*o22_re + accum4.x;
431  o22_im = coeff*o22_im + accum4.y;
432  o30_re = coeff*o30_re + accum4.z;
433  o30_im = coeff*o30_im + accum4.w;
434  o31_re = coeff*o31_re + accum5.x;
435  o31_im = coeff*o31_im + accum5.y;
436  o32_re = coeff*o32_re + accum5.z;
437  o32_im = coeff*o32_im + accum5.w;
438 #endif // SPINOR_DOUBLE
439 #endif // YPAX
440 #endif // DSLASH_XPAY
441 }
442 
443 // write spinor field back to device memory
444 WRITE_SPINOR(param.sp_stride);
445 
446 // undefine to prevent warning when precision is changed
447 #undef m5
448 #undef mdwf_b5
449 #undef mdwf_c5
450 #undef mferm
451 #undef a
452 #undef b
453 #undef spinorFloat
454 #undef POW
455 #undef SHARED_STRIDE
456 
457 #undef i00_re
458 #undef i00_im
459 #undef i01_re
460 #undef i01_im
461 #undef i02_re
462 #undef i02_im
463 #undef i10_re
464 #undef i10_im
465 #undef i11_re
466 #undef i11_im
467 #undef i12_re
468 #undef i12_im
469 #undef i20_re
470 #undef i20_im
471 #undef i21_re
472 #undef i21_im
473 #undef i22_re
474 #undef i22_im
475 #undef i30_re
476 #undef i30_im
477 #undef i31_re
478 #undef i31_im
479 #undef i32_re
480 #undef i32_im
481 
482 
483 
484 #undef VOLATILE
VOLATILE spinorFloat o11_im
#define i11_im
dim3 dim3 blockDim
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o22_re
#define i12_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o20_im
#define i32_im
#define i02_re
VOLATILE spinorFloat o01_re
#define i31_re
VOLATILE spinorFloat o11_re
#define WRITE_SPINOR
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o01_im
int sp_idx
#define i22_re
int X
#define mdwf_c5
#define i02_im
VOLATILE spinorFloat o21_re
#define i20_re
VOLATILE spinorFloat o32_re
#define i20_im
#define mferm
#define i30_im
#define i00_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i30_re
#define VOLATILE
VOLATILE spinorFloat o30_im
#define spinorFloat
#define i00_re
#define i21_re
VOLATILE spinorFloat o12_re
#define i21_im
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o10_im
#define SPINORTEX
#define READ_SPINOR
#define i11_re
int sid
#define i10_im
#define m5
VOLATILE spinorFloat o31_im
int X[4]
Definition: quda.h:29
int boundaryCrossing
#define i12_im
#define i10_re
VOLATILE spinorFloat o32_im
#define a
int coord[5]
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o02_re
#define i32_re
#define mdwf_b5
#define i01_re
VOLATILE spinorFloat o21_im
#define i31_im
#define i22_im
#define i01_im