QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_dslash5inv_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #define m5 m5_d
40 #define mdwf_b5 mdwf_b5_d
41 #define mdwf_c5 mdwf_c5_d
42 #else
43 #define spinorFloat float
44 #define i00_re I0.x
45 #define i00_im I0.y
46 #define i01_re I0.z
47 #define i01_im I0.w
48 #define i02_re I1.x
49 #define i02_im I1.y
50 #define i10_re I1.z
51 #define i10_im I1.w
52 #define i11_re I2.x
53 #define i11_im I2.y
54 #define i12_re I2.z
55 #define i12_im I2.w
56 #define i20_re I3.x
57 #define i20_im I3.y
58 #define i21_re I3.z
59 #define i21_im I3.w
60 #define i22_re I4.x
61 #define i22_im I4.y
62 #define i30_re I4.z
63 #define i30_im I4.w
64 #define i31_re I5.x
65 #define i31_im I5.y
66 #define i32_re I5.z
67 #define i32_im I5.w
68 #define m5 m5_f
69 #define mdwf_b5 mdwf_b5_f
70 #define mdwf_c5 mdwf_c5_f
71 #endif // SPINOR_DOUBLE
72 
73 // output spinor
98 
99 #ifdef SPINOR_DOUBLE
100 #if (__COMPUTE_CAPABILITY__ >= 200)
101 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
102 #else
103 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
104 #endif
105 #else
106 #if (__COMPUTE_CAPABILITY__ >= 200)
107 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
108 #else
109 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
110 #endif
111 #endif
112 #include "io_spinor.h"
113 
114 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
115 if (sid >= param.threads*param.Ls) return;
116 
118 
119 int X, xs;
120 
121 // Inline by hand for the moment and assume even dimensions
122 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);
123 
124 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h);
125 
126 X = 2*sid + (boundaryCrossing + param.parity) % 2;
127 xs = X/(X1*X2*X3*X4);
128 
129  o00_re = 0; o00_im = 0;
130  o01_re = 0; o01_im = 0;
131  o02_re = 0; o02_im = 0;
132  o10_re = 0; o10_im = 0;
133  o11_re = 0; o11_im = 0;
134  o12_re = 0; o12_im = 0;
135  o20_re = 0; o20_im = 0;
136  o21_re = 0; o21_im = 0;
137  o22_re = 0; o22_im = 0;
138  o30_re = 0; o30_im = 0;
139  o31_re = 0; o31_im = 0;
140  o32_re = 0; o32_im = 0;
141 
143 
144 #ifdef MDWF_mode // Check whether MDWF option is enabled
145  kappa = (spinorFloat)(-(mdwf_c5[xs]*(4.0 + m5) - 1.0)/(mdwf_b5[xs]*(4.0 + m5) + 1.0));
146 #else
147  kappa = 2.0*a;
148 #endif // select MDWF mode
149 
150 // M5_inv operation -- NB: not partitionable!
151 
152 // In this part, we will do the following operation in parallel way.
153 
154 // w = M5inv * v
155 // 'w' means output vector
156 // 'v' means input vector
157 {
158  int base_idx = sid%Vh;
159  int sp_idx;
160 
161 // let's assume the index,
162 // s = output vector index,
163 // s' = input vector index and
164 // 'a'= kappa5
165 
166  spinorFloat inv_d_n = 1.0 / ( 1.0 + pow(kappa,param.Ls)*mferm);
169 
170  for(int s = 0; s < param.Ls; s++)
171  {
172  factorR = ( xs > s ? -inv_d_n*pow(kappa,param.Ls-xs+s)*mferm : inv_d_n*pow(kappa,s-xs))/2.0;
173 
174  sp_idx = base_idx + s*Vh;
175  // read spinor from device memory
176  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
177 
178  o00_re += factorR*(i00_re + i20_re);
179  o00_im += factorR*(i00_im + i20_im);
180  o20_re += factorR*(i00_re + i20_re);
181  o20_im += factorR*(i00_im + i20_im);
182  o01_re += factorR*(i01_re + i21_re);
183  o01_im += factorR*(i01_im + i21_im);
184  o21_re += factorR*(i01_re + i21_re);
185  o21_im += factorR*(i01_im + i21_im);
186  o02_re += factorR*(i02_re + i22_re);
187  o02_im += factorR*(i02_im + i22_im);
188  o22_re += factorR*(i02_re + i22_re);
189  o22_im += factorR*(i02_im + i22_im);
190  o10_re += factorR*(i10_re + i30_re);
191  o10_im += factorR*(i10_im + i30_im);
192  o30_re += factorR*(i10_re + i30_re);
193  o30_im += factorR*(i10_im + i30_im);
194  o11_re += factorR*(i11_re + i31_re);
195  o11_im += factorR*(i11_im + i31_im);
196  o31_re += factorR*(i11_re + i31_re);
197  o31_im += factorR*(i11_im + i31_im);
198  o12_re += factorR*(i12_re + i32_re);
199  o12_im += factorR*(i12_im + i32_im);
200  o32_re += factorR*(i12_re + i32_re);
201  o32_im += factorR*(i12_im + i32_im);
202 
203  factorL = ( xs < s ? -inv_d_n*pow(kappa,param.Ls-s+xs)*mferm : inv_d_n*pow(kappa,xs-s))/2.0;
204 
205  o00_re += factorL*(i00_re - i20_re);
206  o00_im += factorL*(i00_im - i20_im);
207  o01_re += factorL*(i01_re - i21_re);
208  o01_im += factorL*(i01_im - i21_im);
209  o02_re += factorL*(i02_re - i22_re);
210  o02_im += factorL*(i02_im - i22_im);
211  o10_re += factorL*(i10_re - i30_re);
212  o10_im += factorL*(i10_im - i30_im);
213  o11_re += factorL*(i11_re - i31_re);
214  o11_im += factorL*(i11_im - i31_im);
215  o12_re += factorL*(i12_re - i32_re);
216  o12_im += factorL*(i12_im - i32_im);
217  o20_re += factorL*(i20_re - i00_re);
218  o20_im += factorL*(i20_im - i00_im);
219  o21_re += factorL*(i21_re - i01_re);
220  o21_im += factorL*(i21_im - i01_im);
221  o22_re += factorL*(i22_re - i02_re);
222  o22_im += factorL*(i22_im - i02_im);
223  o30_re += factorL*(i30_re - i10_re);
224  o30_im += factorL*(i30_im - i10_im);
225  o31_re += factorL*(i31_re - i11_re);
226  o31_im += factorL*(i31_im - i11_im);
227  o32_re += factorL*(i32_re - i12_re);
228  o32_im += factorL*(i32_im - i12_im);
229  }
230 } // end of M5inv dimension
231 
232 {
233 
234 #ifdef DSLASH_XPAY
235  READ_ACCUM(ACCUMTEX, param.sp_stride)
236 #ifdef SPINOR_DOUBLE
237  o00_re = a*o00_re + accum0.x;
238  o00_im = a*o00_im + accum0.y;
239  o01_re = a*o01_re + accum1.x;
240  o01_im = a*o01_im + accum1.y;
241  o02_re = a*o02_re + accum2.x;
242  o02_im = a*o02_im + accum2.y;
243  o10_re = a*o10_re + accum3.x;
244  o10_im = a*o10_im + accum3.y;
245  o11_re = a*o11_re + accum4.x;
246  o11_im = a*o11_im + accum4.y;
247  o12_re = a*o12_re + accum5.x;
248  o12_im = a*o12_im + accum5.y;
249  o20_re = a*o20_re + accum6.x;
250  o20_im = a*o20_im + accum6.y;
251  o21_re = a*o21_re + accum7.x;
252  o21_im = a*o21_im + accum7.y;
253  o22_re = a*o22_re + accum8.x;
254  o22_im = a*o22_im + accum8.y;
255  o30_re = a*o30_re + accum9.x;
256  o30_im = a*o30_im + accum9.y;
257  o31_re = a*o31_re + accum10.x;
258  o31_im = a*o31_im + accum10.y;
259  o32_re = a*o32_re + accum11.x;
260  o32_im = a*o32_im + accum11.y;
261 #else
262  o00_re = a*o00_re + accum0.x;
263  o00_im = a*o00_im + accum0.y;
264  o01_re = a*o01_re + accum0.z;
265  o01_im = a*o01_im + accum0.w;
266  o02_re = a*o02_re + accum1.x;
267  o02_im = a*o02_im + accum1.y;
268  o10_re = a*o10_re + accum1.z;
269  o10_im = a*o10_im + accum1.w;
270  o11_re = a*o11_re + accum2.x;
271  o11_im = a*o11_im + accum2.y;
272  o12_re = a*o12_re + accum2.z;
273  o12_im = a*o12_im + accum2.w;
274  o20_re = a*o20_re + accum3.x;
275  o20_im = a*o20_im + accum3.y;
276  o21_re = a*o21_re + accum3.z;
277  o21_im = a*o21_im + accum3.w;
278  o22_re = a*o22_re + accum4.x;
279  o22_im = a*o22_im + accum4.y;
280  o30_re = a*o30_re + accum4.z;
281  o30_im = a*o30_im + accum4.w;
282  o31_re = a*o31_re + accum5.x;
283  o31_im = a*o31_im + accum5.y;
284  o32_re = a*o32_re + accum5.z;
285  o32_im = a*o32_im + accum5.w;
286 #endif // SPINOR_DOUBLE
287 #endif // DSLASH_XPAY
288 }
289 
290 // write spinor field back to device memory
291 WRITE_SPINOR(param.sp_stride);
292 
293 // undefine to prevent warning when precision is changed
294 #undef m5
295 #undef mdwf_b5
296 #undef mdwf_c5
297 #undef spinorFloat
298 #undef SHARED_STRIDE
299 
300 #undef i00_re
301 #undef i00_im
302 #undef i01_re
303 #undef i01_im
304 #undef i02_re
305 #undef i02_im
306 #undef i10_re
307 #undef i10_im
308 #undef i11_re
309 #undef i11_im
310 #undef i12_re
311 #undef i12_im
312 #undef i20_re
313 #undef i20_im
314 #undef i21_re
315 #undef i21_im
316 #undef i22_re
317 #undef i22_im
318 #undef i30_re
319 #undef i30_im
320 #undef i31_re
321 #undef i31_im
322 #undef i32_re
323 #undef i32_im
324 
325 
326 
327 #undef VOLATILE
VOLATILE spinorFloat o21_re
__constant__ int Vh
VOLATILE spinorFloat o12_im
__constant__ int X1h
#define i10_re
VOLATILE spinorFloat o20_re
__constant__ int X2
#define i00_re
#define i01_im
spinorFloat inv_d_n
VOLATILE spinorFloat o32_re
#define i22_re
#define i12_re
#define i02_re
__constant__ int X1
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_im
#define i30_im
#define i11_im
VOLATILE spinorFloat o02_re
#define i10_im
VOLATILE spinorFloat o12_re
#define i01_re
spinorFloat factorL
VOLATILE spinorFloat kappa
VOLATILE spinorFloat o01_re
int boundaryCrossing
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i30_re
VOLATILE spinorFloat o31_im
#define i21_re
#define m5
VOLATILE spinorFloat o10_im
#define i20_im
#define mdwf_c5
VOLATILE spinorFloat o02_im
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100
#define i31_im
#define SPINORTEX
Definition: clover_def.h:40
#define i32_im
VOLATILE spinorFloat o30_re
#define i21_im
VOLATILE spinorFloat o01_im
#define i31_re
VOLATILE spinorFloat o11_re
#define i22_im
#define mdwf_b5
__constant__ int X3
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o32_im
spinorFloat factorR
VOLATILE spinorFloat o31_re
#define i02_im
VOLATILE spinorFloat o30_im
#define spinorFloat
#define VOLATILE
#define i12_im
#define WRITE_SPINOR
Definition: clover_def.h:48
VOLATILE spinorFloat o00_im
#define i00_im
VOLATILE spinorFloat o20_im
#define READ_SPINOR
Definition: clover_def.h:36
VOLATILE spinorFloat * s
#define i11_re
VOLATILE spinorFloat o21_im
#define i32_re
__constant__ int X4
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o10_re
#define i20_re