// QUDA 0.9.0 — dslash_quda.cuh
// Kernel-launch dispatch macros and the DslashCuda tunable base class used to
// drive the CUDA dslash kernels (source listing).
1 //#define DSLASH_TUNE_TILE
2 
// SET_CACHE(f): on Volta (sm_70+) request the maximum shared-memory carveout
// for kernel f via cudaFuncAttributePreferredSharedMemoryCarveout; expands to
// nothing on older architectures where the attribute is unavailable.
#if (__COMPUTE_CAPABILITY__ >= 700)
// for running on Volta we set large shared memory mode to prefer hitting in L2
#define SET_CACHE(f) qudaFuncSetAttribute( (const void*)f, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared)
#else
#define SET_CACHE(f)
#endif
9 
// LAUNCH_KERNEL(f, ...): launch dslash kernel f, which must have the signature
// void f(const DslashParam).  The default path routes through qudaLaunchKernel
// (QUDA's checked launch wrapper); the fallback is a plain triple-chevron
// launch.  Both variants are wrapped in do { } while (0) so the macro behaves
// as a single statement (safe after an unbraced if/else) and so the local
// `args`/`func` variables do not leak into, or collide with, the caller's
// scope when the macro is used twice in one block.
#if 1
#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) \
  do { \
    void *args[] = { &param }; \
    void (*func)( const DslashParam ) = &(f); \
    qudaLaunchKernel( (const void*)func, grid, block, args, shared, stream); \
  } while (0)
#else
#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) \
  do { f<<<grid, block, shared, stream>>>(param); } while (0)
#endif
18 
// Launch the dslash kernel for precision token FLOAT (D/S/H), selecting the
// gauge-reconstruction variant (18/12/8) at run time, and the plain vs. xpay
// (X suffix) kernel depending on whether an accumulator field x was supplied.
// SET_CACHE must target the same kernel instantiation that is launched.
#define EVEN_MORE_GENERIC_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x==0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      /* bug fix: cache attribute was previously set on the 18-reconstruct kernel */ \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      /* bug fix: cache attribute was previously set on the 18-reconstruct kernel */ \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }
43 
// Dispatch on the spinor precision type sFloat: double2 -> D (double),
// float4/float2 -> S (single), short4/short2 -> H (half) kernel variants;
// aborts via errorQuda for any other type.
#define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (typeid(sFloat) == typeid(double2)) { \
 EVEN_MORE_GENERIC_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat) == typeid(float4) || typeid(sFloat) == typeid(float2)) { \
 EVEN_MORE_GENERIC_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat)==typeid(short4) || typeid(sFloat) == typeid(short2)) { \
 EVEN_MORE_GENERIC_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else { \
 errorQuda("Undefined precision type"); \
 }
54 
55 
// Staggered variant: kernel names carry two reconstruction tokens — the first
// (always 18) and the second selected at run time from 18/13/12/9/8.
// NOTE(review): unlike EVEN_MORE_GENERIC_DSLASH, no SET_CACHE is applied to
// these kernels — presumably deliberate, but worth confirming upstream.
#define EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (x==0) { \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 } else { \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 }
82 
// Staggered precision dispatch: staggered spinors use 2-component vector
// layouts only (double2/float2/short2); aborts for any other type.
#define MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (typeid(sFloat) == typeid(double2)) { \
 EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat) == typeid(float2)) { \
 EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat)==typeid(short2)) { \
 EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else { \
 errorQuda("Undefined precision type"); \
 }
93 
#ifndef MULTI_GPU

// Single-GPU dispatch on kernel type: only the interior kernel exists; any
// exterior kernel type is a hard error.
#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
 }

// Single-GPU staggered dispatch — same structure as GENERIC_DSLASH.
#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
 }


#else

// Multi-GPU dispatch: interior plus per-dimension and fused exterior
// (halo-update) kernels.  Unknown kernel types fall through silently here
// (the default case is a no-op, unlike the single-GPU variant).
#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_X: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Y: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Z: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_T: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_ALL: \
 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 break; \
 }

// Multi-GPU staggered dispatch — same structure as the multi-GPU
// GENERIC_DSLASH above.
#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_X: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Y: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Z: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_T: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_ALL: \
 MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 break; \
 }


#endif
167 
// macro used for dslash types with dagger kernel defined (Wilson, domain wall, etc.)
// The empty first argument selects the non-dagger kernel name (DAG token
// pastes to nothing); `dagger` must be in scope at the expansion site.
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
 if (!dagger) { \
 GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
 } else { \
 GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
 }

// macro used for staggered dslash
// Plain staggered goes through the generic (single-link) dispatch, while the
// improved (fat+long link) operator uses the staggered dispatch with the
// extended reconstruction set (18/13/12/9/8).
#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
 GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param)

#define IMPROVED_STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
 GENERIC_STAGGERED_DSLASH(improvedStaggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param)
182 
// Asymmetric (clover) variant: always launches the xpay (X) kernel; there is
// no plain branch here since the asymmetric operator always accumulates.
// NOTE(review): a reconstruct value outside {NO,12,8} silently launches
// nothing — confirm this cannot be reached from the callers.
#define EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
 SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 }
194 
// Asymmetric-dslash precision dispatch: double2 -> D, float4 -> S,
// short4 -> H.  float2/short2 (staggered layouts) are intentionally not
// supported here.  The trailing else aborts on an unrecognized precision,
// matching MORE_GENERIC_DSLASH / MORE_GENERIC_STAGGERED_DSLASH instead of
// silently launching nothing (previous behavior).
#define MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat)==typeid(short4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }
203 
204 
#ifndef MULTI_GPU

// Single-GPU asymmetric dispatch: interior kernel only.
#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
 }

#else

// Multi-GPU asymmetric dispatch: interior plus per-dimension and fused
// exterior kernels; unknown kernel types are silently ignored.
#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_X: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Y: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Z: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_T: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_ALL: \
 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 break; \
 }

#endif

// macro used for dslash types with dagger kernel defined (Wilson, domain wall, etc.)
#define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
 if (!dagger) { \
 GENERIC_ASYM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
 } else { \
 GENERIC_ASYM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
 }
251 
252 
253 
254 //macro used for twisted mass dslash:
255 
// Non-degenerate twisted-mass launch: four-way dispatch on the flags x
// (xpay accumulator present) and d (d==0 selects the Twist kernel variants,
// d!=0 the plain ones), then run-time selection of the gauge reconstruction.
// NOTE(review): the d==0 branches use a bare `else` for the 8-reconstruct
// case while the others use `else if (reconstruct == QUDA_RECONSTRUCT_8)` —
// the asymmetry looks unintentional but is behaviorally benign only if
// reconstruct is always one of {NO,12,8}; confirm upstream.
#define EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (x == 0 && d == 0) { \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else { \
 SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 } else if (x != 0 && d == 0) { \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
 SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 } else if (x == 0 && d != 0) { \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else { \
 SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 } else{ \
 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
 SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
 SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
 SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
 LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
 } \
 }
302 
// Twisted-mass precision dispatch: double2 -> D, float4 -> S, short4 -> H;
// aborts via errorQuda for any other spinor type.
#define MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 if (typeid(sFloat) == typeid(double2)) { \
 EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat) == typeid(float4)) { \
 EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else if (typeid(sFloat)==typeid(short4)) { \
 EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
 } else { \
 errorQuda("Undefined precision type"); \
 }
313 
#ifndef MULTI_GPU

// Single-GPU twisted-mass dispatch: interior kernel only.
#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
 }

#else

// Multi-GPU twisted-mass dispatch: interior plus per-dimension and fused
// exterior kernels; unknown kernel types are silently ignored.
#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
 switch(param.kernel_type) { \
 case INTERIOR_KERNEL: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_X: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Y: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_Z: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_T: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
 break; \
 case EXTERIOR_KERNEL_ALL: \
 MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
 break; \
 default: \
 break; \
 }

#endif

// Entry-point macro for the non-degenerate twisted-mass dslash; selects the
// dagger variant from the `dagger` flag in scope at the expansion site.
#define NDEG_TM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
 if (!dagger) { \
 GENERIC_NDEG_TM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
 } else { \
 GENERIC_NDEG_TM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
 }
//end of tm dslash macro
360 
361 
362 // Use an abstract class interface to drive the different CUDA dslash
363 // kernels. All parameters are curried into the derived classes to
364 // allow a simple interface.
// Abstract base class driving the CUDA dslash kernels through the Tunable
// autotuning interface; kernel-specific parameters are curried into derived
// classes so the launch interface stays simple.
// NOTE(review): this listing elides some member declarations (e.g. the
// `reconstruct`, `dslashParam`, `saveOut`/`saveOutNorm` members referenced
// below are not visible here) — consult the full header.
class DslashCuda : public Tunable {

protected:
 cudaColorSpinorField *out; // output spinor field
 const cudaColorSpinorField *in; // input spinor field
 const cudaColorSpinorField *x; // xpay accumulator field (may be null)
 const GaugeField &gauge; // gauge field applied by the operator
 const int dagger; // whether the dagger (adjoint) operator is applied
 static bool init; // one-time guard for initializing ghost_str

 unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
 bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
 bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
 // all dslashes expect a 4-d volume here (dwf Ls is y thread dimension)
 unsigned int minThreads() const { return dslashParam.threads; }
 char aux_base[TuneKey::aux_n]; // common tail of the tuning aux string
 char aux[8][TuneKey::aux_n]; // per-kernel-type tuning aux strings
 static char ghost_str[TuneKey::aux_n]; // string with ghostDim information
385 
 // Build the common part of the tuning aux string: the comm pattern, the
 // gauge reconstruction, and the Xpay/dagger flags.  Relies on aux_base
 // being large enough (TuneKey::aux_n) for all appended fragments.
 inline void fillAuxBase() {
 char comm[5];
 comm[0] = (dslashParam.commDim[0] ? '1' : '0');
 comm[1] = (dslashParam.commDim[1] ? '1' : '0');
 comm[2] = (dslashParam.commDim[2] ? '1' : '0');
 comm[3] = (dslashParam.commDim[3] ? '1' : '0');
 comm[4] = '\0';
 strcpy(aux_base,",comm=");
 strcat(aux_base,comm);

 switch (reconstruct) {
 case QUDA_RECONSTRUCT_NO: strcat(aux_base,",reconstruct=18"); break;
 case QUDA_RECONSTRUCT_13: strcat(aux_base,",reconstruct=13"); break;
 case QUDA_RECONSTRUCT_12: strcat(aux_base,",reconstruct=12"); break;
 case QUDA_RECONSTRUCT_9: strcat(aux_base, ",reconstruct=9"); break;
 case QUDA_RECONSTRUCT_8: strcat(aux_base, ",reconstruct=8"); break;
 default: break;
 }

 if (x) strcat(aux_base,",Xpay");
 if (dagger) strcat(aux_base,",dagger");
 }

 // Compose the per-kernel-type aux string: kernel label, plus the ghost
 // pattern (interior kernel only) and the common aux_base suffix.
 inline void fillAux(KernelType kernel_type, const char *kernel_str) {
 strcpy(aux[kernel_type],kernel_str);
 if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type],ghost_str);
 strcat(aux[kernel_type],aux_base);
 }

 // Refresh the per-launch fields of dslashParam (T-projection scale and the
 // ghost-zone pointers for every dimension/direction).
 inline void setParam()
 {
 // factor of 2 (or 1) for T-dimensional spin projection (FIXME - unnecessary)
 dslashParam.tProjScale = getKernelPackT() ? 1.0 : 2.0;

 // update the ghosts for the non-p2p directions
 for (int dim=0; dim<4; dim++) {

 for (int dir=0; dir<2; dir++) {
 /* if doing the interior kernel, then this is the initial call, so
 we must update all ghost pointers; if doing an exterior kernel, then
 we only have to update the non-p2p ghosts, since these may
 have been assigned to zero-copy memory */
 dslashParam.ghost[2*dim+dir] = (void*)in->Ghost2();
 dslashParam.ghostNorm[2*dim+dir] = (float*)(in->Ghost2());

#ifdef USE_TEXTURE_OBJECTS
 dslashParam.ghostTex[2*dim+dir] = in->GhostTex();
 dslashParam.ghostTexNorm[2*dim+dir] = in->GhostTexNorm();
#endif // USE_TEXTURE_OBJECTS
 }
 }
 }

 // NOTE(review): this brace appears unbalanced in this listing; it most
 // likely closes a construct whose opening line was elided — verify against
 // the full header before editing.
 }
456 
public:

 // Construct the launcher: validates matching spinor/gauge precision, then
 // populates dslashParam (field pointers, ghost offsets, comm pattern,
 // gauge/boundary constants) and builds the tuning aux strings.
 // commOverride[i]==0 disables communication in dimension i.
 DslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
 const cudaColorSpinorField *x, const GaugeField &gauge,
 const int parity, const int dagger, const int *commOverride)
 : out(out), in(in), x(x), gauge(gauge), reconstruct(gauge.Reconstruct()),
 dagger(dagger), saveOut(0), saveOutNorm(0) {

 if (in->Precision() != gauge.Precision())
 errorQuda("Mixing gauge %d and spinor %d precision not supported", gauge.Precision(), in->Precision());

 constexpr int nDimComms = 4;
 for (int i=0; i<nDimComms; i++){
 dslashParam.ghostOffset[i][0] = in->GhostOffset(i,0)/in->FieldOrder();
 dslashParam.ghostOffset[i][1] = in->GhostOffset(i,1)/in->FieldOrder();
 dslashParam.ghostNormOffset[i][0] = in->GhostNormOffset(i,0);
 dslashParam.ghostNormOffset[i][1] = in->GhostNormOffset(i,1);
 dslashParam.ghostDim[i] = comm_dim_partitioned(i); // determines whether to use regular or ghost indexing at boundary
 dslashParam.commDim[i] = (!commOverride[i]) ? 0 : comm_dim_partitioned(i); // switch off comms if override = 0
 }

 // set parameters particular for this instance
 dslashParam.out = (void*)out->V();
 dslashParam.outNorm = (float*)out->Norm();
 dslashParam.in = (void*)in->V();
 dslashParam.inNorm = (float*)in->Norm();
 dslashParam.x = x ? (void*)x->V() : nullptr;
 dslashParam.xNorm = x ? (float*)x->Norm() : nullptr;

#ifdef USE_TEXTURE_OBJECTS
 dslashParam.inTex = in->Tex();
 dslashParam.inTexNorm = in->TexNorm();
 if (out) dslashParam.outTex = out->Tex();
 if (out) dslashParam.outTexNorm = out->TexNorm();
 if (x) dslashParam.xTex = x->Tex();
 if (x) dslashParam.xTexNorm = x->TexNorm();
#endif // USE_TEXTURE_OBJECTS

 bindGaugeTex(static_cast<const cudaGaugeField&>(gauge), parity, dslashParam);

 dslashParam.sp_stride = in->Stride();

 dslashParam.dc = in->getDslashConstant(); // get precomputed constants

 dslashParam.gauge_stride = gauge.Stride();
 dslashParam.gauge_fixed = gauge.GaugeFixed();

 dslashParam.anisotropy = gauge.Anisotropy();

 dslashParam.t_boundary = (gauge.TBoundary() == QUDA_PERIODIC_T) ? 1.0 : -1.0;

 dslashParam.An2 = make_float2(gauge.Anisotropy(), 1.0 / (gauge.Anisotropy()*gauge.Anisotropy()));

 // default coefficients; derived operators override these as needed
 dslashParam.coeff = 1.0;
 dslashParam.coeff_f = 1.0f;

 dslashParam.twist_a = 0.0;
 dslashParam.twist_b = 0.0;

 dslashParam.No2 = make_float2(1.0f, 1.0f);
 dslashParam.Pt0 = (comm_coord(3) == 0) ? true : false;
 dslashParam.PtNm1 = (comm_coord(3) == comm_dim(3)-1) ? true : false;

 // this sets the communications pattern for the packing kernel
 // NOTE(review): the call this comment refers to is elided in this listing.

 if (!init) { // these parameters are constant across all dslash instances for a given run
 char ghost[5]; // set the ghost string
 for (int dim=0; dim<nDimComms; dim++) ghost[dim] = (dslashParam.ghostDim[dim] ? '1' : '0');
 ghost[4] = '\0';
 strcpy(ghost_str,",ghost=");
 strcat(ghost_str,ghost);
 init = true;
 }

 fillAuxBase();
#ifdef MULTI_GPU
 fillAux(INTERIOR_KERNEL, "policy_kernel=interior");
 fillAux(EXTERIOR_KERNEL_ALL, "policy_kernel=exterior_all");
 fillAux(EXTERIOR_KERNEL_X, "policy_kernel=exterior_x");
 fillAux(EXTERIOR_KERNEL_Y, "policy_kernel=exterior_y");
 fillAux(EXTERIOR_KERNEL_Z, "policy_kernel=exterior_z");
 fillAux(EXTERIOR_KERNEL_T, "policy_kernel=exterior_t");
#else
 fillAux(INTERIOR_KERNEL, "policy_kernel=single-GPU");
#endif // MULTI_GPU
 fillAux(KERNEL_POLICY, "policy");

 }
551 
 // Release the gauge texture binding acquired in the constructor.
 virtual ~DslashCuda() { unbindGaugeTex(static_cast<const cudaGaugeField&>(gauge)); }
 // Tuning key: volume string + concrete class name + per-kernel-type aux.
 virtual TuneKey tuneKey() const
 { return TuneKey(in->VolString(), typeid(*this).name(), aux[dslashParam.kernel_type]); }

 // Read the aux string for a given kernel type.
 const char* getAux(KernelType type) const {
 return aux[type];
 }

 // Overwrite the aux string for a given kernel type (unchecked strcpy —
 // caller must keep within TuneKey::aux_n).
 void setAux(KernelType type, const char *aux_) {
 strcpy(aux[type], aux_);
 }

 // Append to the aux string for a given kernel type (unchecked strcat).
 void augmentAux(KernelType type, const char *extra) {
 strcat(aux[type], extra);
 }

 // Number of faces exchanged per dimension (overridden by derived classes).
 virtual int Nface() const { return 2; }

 int Dagger() const { return dagger; }

#if defined(DSLASH_TUNE_TILE)
 // Experimental autotuning of the thread ordering
 // Steps the aux tile (x,y,z,w) through divisors of the local lattice
 // dimensions; returns false (and resets) once all candidates are exhausted.
 bool advanceAux(TuneParam &param) const
 {
 if (in->Nspin()==1 || in->Ndim()==5) return false;
 const int *X = in->X();

 if (param.aux.w < X[3] && param.aux.x > 1 && param.aux.w < 2) {
 do { param.aux.w++; } while( (X[3]) % param.aux.w != 0);
 if (param.aux.w <= X[3]) return true;
 } else {
 param.aux.w = 1;

 if (param.aux.z < X[2] && param.aux.x > 1 && param.aux.z < 8) {
 do { param.aux.z++; } while( (X[2]) % param.aux.z != 0);
 if (param.aux.z <= X[2]) return true;
 } else {

 param.aux.z = 1;
 if (param.aux.y < X[1] && param.aux.x > 1 && param.aux.y < 32) {
 do { param.aux.y++; } while( X[1] % param.aux.y != 0);
 if (param.aux.y <= X[1]) return true;
 } else {
 param.aux.y = 1;
 if (param.aux.x < (2*X[0]) && param.aux.x < 32) {
 do { param.aux.x++; } while( (2*X[0]) % param.aux.x != 0);
 if (param.aux.x <= (2*X[0]) ) return true;
 }
 }
 }
 }
 param.aux = make_int4(2,1,1,1);
 return false;
 }

 void initTuneParam(TuneParam &param) const
 {
 Tunable::initTuneParam(param);
 param.aux = make_int4(2,1,1,1);
 }

 void defaultTuneParam(TuneParam &param) const
 {
 Tunable::defaultTuneParam(param);
 param.aux = make_int4(2,1,1,1);
 }
#endif

 // Save the output field before tuning trial launches clobber it...
 virtual void preTune()
 {
 out->backup();
 }

 // ...and restore it afterwards.
 virtual void postTune()
 {
 out->restore();
 }

 /*void launch_auxiliary(cudaStream_t &stream) {
 auxiliary.apply(stream);
 }*/
634 
635  /*
636  per direction / dimension flops
637  spin project flops = Nc * Ns
638  SU(3) matrix-vector flops = (8 Nc - 2) * Nc
639  spin reconstruction flops = 2 * Nc * Ns (just an accumulation to all components)
640  xpay = 2 * 2 * Nc * Ns
641 
642  So for the full dslash we have, where for the final spin
643  reconstruct we have -1 since the first direction does not
644  require any accumulation.
645 
646  flops = (2 * Nd * Nc * Ns) + (2 * Nd * (Ns/2) * (8*Nc-2) * Nc) + ((2 * Nd - 1) * 2 * Nc * Ns)
647  flops_xpay = flops + 2 * 2 * Nc * Ns
648 
649  For Wilson this should give 1344 for Nc=3,Ns=2 and 1368 for the xpay equivalent
650  */
651  virtual long long flops() const {
652  int mv_flops = (8 * in->Ncolor() - 2) * in->Ncolor(); // SU(3) matrix-vector flops
653  int num_mv_multiply = in->Nspin() == 4 ? 2 : 1;
654  int ghost_flops = (num_mv_multiply * mv_flops + 2*in->Ncolor()*in->Nspin());
655  int xpay_flops = 2 * 2 * in->Ncolor() * in->Nspin(); // multiply and add per real component
656  int num_dir = 2 * 4;
657 
658  long long flops_ = 0;
659  switch(dslashParam.kernel_type) {
660  case EXTERIOR_KERNEL_X:
661  case EXTERIOR_KERNEL_Y:
662  case EXTERIOR_KERNEL_Z:
663  case EXTERIOR_KERNEL_T:
664  flops_ = (ghost_flops + (x ? xpay_flops : 0)) * 2 * in->GhostFace()[dslashParam.kernel_type];
665  break;
666  case EXTERIOR_KERNEL_ALL:
667  {
668  long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
669  flops_ = (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
670  break;
671  }
672  case INTERIOR_KERNEL:
673  case KERNEL_POLICY:
674  {
675  long long sites = in->VolumeCB();
676  flops_ = (num_dir*(in->Nspin()/4)*in->Ncolor()*in->Nspin() + // spin project (=0 for staggered)
677  num_dir*num_mv_multiply*mv_flops + // SU(3) matrix-vector multiplies
678  ((num_dir-1)*2*in->Ncolor()*in->Nspin())) * sites; // accumulation
679  if (x) flops_ += xpay_flops * sites;
680 
681  if (dslashParam.kernel_type == KERNEL_POLICY) break;
682  // now correct for flops done by exterior kernel
683  long long ghost_sites = 0;
684  for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2 * in->GhostFace()[d];
685  flops_ -= (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
686 
687  break;
688  }
689  }
690  return flops_;
691  }
692 
693  virtual long long bytes() const {
694  int gauge_bytes = reconstruct * in->Precision();
695  bool isHalf = in->Precision() == sizeof(short) ? true : false;
696  int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
697  int proj_spinor_bytes = (in->Nspin()==4 ? 1 : 2) * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
698  int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + spinor_bytes;
699  int num_dir = 2 * 4; // set to 4 dimensions since we take care of 5-d fermions in derived classes where necessary
700 
701  long long bytes_=0;
702  switch(dslashParam.kernel_type) {
703  case EXTERIOR_KERNEL_X:
704  case EXTERIOR_KERNEL_Y:
705  case EXTERIOR_KERNEL_Z:
706  case EXTERIOR_KERNEL_T:
707  bytes_ = (ghost_bytes + (x ? spinor_bytes : 0)) * 2 * in->GhostFace()[dslashParam.kernel_type];
708  break;
709  case EXTERIOR_KERNEL_ALL:
710  {
711  long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
712  bytes_ = (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
713  break;
714  }
715  case INTERIOR_KERNEL:
716  case KERNEL_POLICY:
717  {
718  long long sites = in->VolumeCB();
719  bytes_ = (num_dir*gauge_bytes + ((num_dir-2)*spinor_bytes + 2*proj_spinor_bytes) + spinor_bytes)*sites;
720  if (x) bytes_ += spinor_bytes;
721 
722  if (dslashParam.kernel_type == KERNEL_POLICY) break;
723  // now correct for bytes done by exterior kernel
724  long long ghost_sites = 0;
725  for (int d=0; d<4; d++) if (dslashParam.commDim[d]) ghost_sites += 2*in->GhostFace()[d];
726  bytes_ -= (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
727 
728  break;
729  }
730  }
731  return bytes_;
732  }
733 
734 };
735 
// static member definitions for DslashCuda (shared across all instances)
bool DslashCuda::init = false;
char DslashCuda::ghost_str[TuneKey::aux_n];
739 
743 #ifdef SHARED_WILSON_DSLASH
// Dslash variant that stages data through shared memory for the INTERIOR
// kernel. It overrides the base-class tuning hooks so that, for the interior
// kernel only, the thread-block shape is tied to the local lattice dimensions
// (x*t fused into block.x/grid.x -- see createGrid) and the dynamic shared
// memory size is derived from the block shape rather than tuned independently.
// Exterior kernels fall through to the plain DslashCuda tuning path.
744 class SharedDslashCuda : public DslashCuda {
745 protected:
746  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; } // FIXME: this isn't quite true, but works
// Shared-memory size is a function of the block shape here, so independent
// shared-bytes tuning only makes sense for the exterior kernels.
747  bool advanceSharedBytes(TuneParam &param) const {
748  if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceSharedBytes(param);
749  else return false;
750  } // FIXME - shared memory tuning only supported on exterior kernels
751 
// Dynamic shared bytes needed by a block: the x*y thread count is rounded up
// to a whole number of warps (32 threads), then multiplied by the z extent and
// the per-thread requirement supplied by sharedBytesPerThread().
753  int sharedBytes(const dim3 &block) const {
754  int warpSize = 32; // FIXME - query from device properties
755  int block_xy = block.x*block.y;
756  if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
757  return block_xy*block.z*sharedBytesPerThread();
758  }
759 
// Grid that covers the local volume for a given block: note the x and t
// dimensions are fused into the x direction (X(0)*X(3)), while y and z map to
// the lattice y and z extents. All divisions round up (ceil-div).
761  dim3 createGrid(const dim3 &block) const {
762  unsigned int gx = (in->X(0)*in->X(3) + block.x - 1) / block.x;
763  unsigned int gy = (in->X(1) + block.y - 1 ) / block.y;
764  unsigned int gz = (in->X(2) + block.z - 1) / block.z;
765  return dim3(gx, gy, gz);
766  }
767 
// Advance to the next candidate block shape for the interior kernel.
// The triple loop resumes from the current param.block (with z bumped first),
// resetting inner dimensions to 1 as outer ones advance -- this enumerates
// shapes in (z fastest, then y, then x) order. A candidate is accepted only if:
//   * by/bz are 1 or even (per the inline "odd blocks" comments),
//   * total threads lie in [min_threads, max_threads],
//   * by and bz tile X(1)/X(2) exactly (no partial last block -- shared-memory
//     addressing can't handle it yet, per the comment below),
//   * the derived shared-memory footprint fits in max_shared.
// Returns true with param.{block,grid,shared_bytes} updated, or false after
// resetting the block to (X(0),1,1) when the search space is exhausted.
769  bool advanceBlockDim(TuneParam &param) const {
770  if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::advanceBlockDim(param);
771  const unsigned int min_threads = 2;
772  const unsigned int max_threads = 512; // FIXME: use deviceProp.maxThreadsDim[0];
773  const unsigned int max_shared = 16384*3; // FIXME: use deviceProp.sharedMemPerBlock;
774 
775  // set the x-block dimension equal to the entire x dimension
776  bool set = false;
777  dim3 blockInit = param.block;
778  blockInit.z++;
779  for (unsigned bx=blockInit.x; bx<=in->X(0); bx++) {
780  //unsigned int gx = (in->X(0)*in->x(3) + bx - 1) / bx;
781  for (unsigned by=blockInit.y; by<=in->X(1); by++) {
782  unsigned int gy = (in->X(1) + by - 1 ) / by;
783 
784  if (by > 1 && (by%2) != 0) continue; // can't handle odd blocks yet except by=1
785 
786  for (unsigned bz=blockInit.z; bz<=in->X(2); bz++) {
787  unsigned int gz = (in->X(2) + bz - 1) / bz;
788 
789  if (bz > 1 && (bz%2) != 0) continue; // can't handle odd blocks yet except bz=1
790  if (bx*by*bz > max_threads) continue;
791  if (bx*by*bz < min_threads) continue;
792  // can't yet handle the last block properly in shared memory addressing
793  if (by*gy != in->X(1)) continue;
794  if (bz*gz != in->X(2)) continue;
795  if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;
796 
797  param.block = dim3(bx, by, bz);
798  set = true; break;
799  }
800  if (set) break;
801  blockInit.z = 1;
802  }
803  if (set) break;
804  blockInit.y = 1;
805  }
806 
// NOTE(review): relies on && binding tighter than ||, i.e. this parses as
// ((x>X0 && y>X1 && z>X2) || !set) -- apparently intentional, but explicit
// parentheses would silence compiler warnings and make the intent clear.
807  if (param.block.x > in->X(0) && param.block.y > in->X(1) && param.block.z > in->X(2) || !set) {
808  //||sharedBytesPerThread()*param.block.x > max_shared) {
809  param.block = dim3(in->X(0), 1, 1);
810  return false;
811  } else {
812  param.grid = createGrid(param.block);
813  param.shared_bytes = sharedBytes(param.block);
814  return true;
815  }
816  }
817 
818 public:
// Forwards all construction to DslashCuda; this class adds no state of its
// own, only the tuning-policy overrides above.
819  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
820  const cudaColorSpinorField *x, const GaugeField &gauge,
821  int parity, int dagger, const int *commOverride)
822  : DslashCuda(out, in, x, gauge, parity, dagger, commOverride) { ; }
823  virtual ~SharedDslashCuda() { ; }
824 
// Initial tuning point for the interior kernel: a block spanning the full
// local x dimension, with grid and shared bytes derived from it. Exterior
// kernels use the base-class initialization.
825  virtual void initTuneParam(TuneParam &param) const
826  {
827  if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::initTuneParam(param);
828 
829  param.block = dim3(in->X(0), 1, 1);
830  param.grid = createGrid(param.block);
831  param.shared_bytes = sharedBytes(param.block);
832  }
833 
// Fallback configuration when tuning is disabled: identical to the initial
// tuning point for the interior kernel, base-class default otherwise.
835  virtual void defaultTuneParam(TuneParam &param) const
836  {
837  if (dslashParam.kernel_type != INTERIOR_KERNEL) DslashCuda::defaultTuneParam(param);
838  else initTuneParam(param);
839  }
840 };
841 #else
// Fallback used when SHARED_WILSON_DSLASH is not defined: a pure passthrough
// to DslashCuda with no tuning-policy overrides, so the shared-memory and
// non-shared builds can instantiate the same class name.
842 class SharedDslashCuda : public DslashCuda {
843 public:
844  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
845  const cudaColorSpinorField *x, const GaugeField &gauge,
846  int parity, int dagger, const int *commOverride)
847  : DslashCuda(out, in, x, gauge, parity, dagger, commOverride) { }
848  virtual ~SharedDslashCuda() { }
849 };
850 #endif
void unbindGaugeTex(const cudaGaugeField &gauge)
virtual long long bytes() const
const cudaColorSpinorField * in
virtual ~SharedDslashCuda()
cudaColorSpinorField * out
int commDim[QUDA_MAX_DIM]
virtual void preTune()
bool getKernelPackT()
Definition: dslash_quda.cu:61
void fillAuxBase()
Set the base strings used by the different dslash kernel types for autotuning.
#define errorQuda(...)
Definition: util_quda.h:90
void setPackComms(const int *commDim)
Definition: dslash_pack.cu:41
virtual void postTune()
void augmentAux(KernelType type, const char *extra)
int comm_dim(int dim)
const char * getAux(KernelType type) const
float * ghostNorm[2 *QUDA_MAX_DIM]
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
int comm_coord(int dim)
DslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const cudaColorSpinorField *x, const GaugeField &gauge, const int parity, const int dagger, const int *commOverride)
char * strcpy(char *__dst, const char *__src)
virtual int Nface() const
char * strcat(char *__s1, const char *__s2)
int ghostOffset[QUDA_MAX_DIM+1][2]
char aux[8][TuneKey::aux_n]
char aux_base[TuneKey::aux_n]
const int dagger
KernelType
char * saveOut
const GaugeField & gauge
QudaGaugeParam param
Definition: pack_test.cpp:17
static bool init
void * ghost[2 *QUDA_MAX_DIM]
DslashConstant dc
KernelType kernel_type
cpuColorSpinorField * in
DslashParam dslashParam
bool tuneGridDim() const
virtual ~DslashCuda()
int int int enum cudaChannelFormatKind f
static char ghost_str[TuneKey::aux_n]
unsigned int sharedBytesPerBlock(const TuneParam &param) const
bool tuneAuxDim() const
bool comm_peer2peer_enabled(int dir, int dim)
char * saveOutNorm
const cudaColorSpinorField * x
cpuColorSpinorField * out
void setAux(KernelType type, const char *aux_)
enum QudaReconstructType_s QudaReconstructType
int Dagger() const
virtual TuneKey tuneKey() const
unsigned int minThreads() const
int ghostDim[QUDA_MAX_DIM]
void fillAux(KernelType kernel_type, const char *kernel_str)
Specialize the auxiliary strings for each kernel type.
void setParam()
Set the dslashParam for the current multi-GPU parameters (set these at the last minute to ensure we a...
const QudaReconstructType reconstruct
virtual long long flops() const
static __inline__ size_t size_t d
SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const cudaColorSpinorField *x, const GaugeField &gauge, int parity, int dagger, const int *commOverride)
QudaParity parity
Definition: covdev_test.cpp:53
int ghostNormOffset[QUDA_MAX_DIM+1][2]
int comm_dim_partitioned(int dim)
void bindGaugeTex(const cudaGaugeField &gauge, const int oddBit, T &dslashParam)