QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 //these are access control for staggered action
20 #ifdef GPU_STAGGERED_DIRAC
21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
22 //#define DIRECT_ACCESS_FAT_LINK
23 //#define DIRECT_ACCESS_LONG_LINK
24 //#define DIRECT_ACCESS_SPINOR
25 //#define DIRECT_ACCESS_ACCUM
26 //#define DIRECT_ACCESS_INTER
27 //#define DIRECT_ACCESS_PACK
28 #elif (__COMPUTE_CAPABILITY__ >= 200)
29 //#define DIRECT_ACCESS_FAT_LINK
30 //#define DIRECT_ACCESS_LONG_LINK
31 #define DIRECT_ACCESS_SPINOR
32 //#define DIRECT_ACCESS_ACCUM
33 //#define DIRECT_ACCESS_INTER
34 //#define DIRECT_ACCESS_PACK
35 #else
36 #define DIRECT_ACCESS_FAT_LINK
37 //#define DIRECT_ACCESS_LONG_LINK
38 //#define DIRECT_ACCESS_SPINOR
39 //#define DIRECT_ACCESS_ACCUM
40 //#define DIRECT_ACCESS_INTER
41 //#define DIRECT_ACCESS_PACK
42 #endif
43 #endif // GPU_STAGGERED_DIRAC
44 
45 #include <quda_internal.h>
46 #include <dslash_quda.h>
47 #include <sys/time.h>
48 #include <blas_quda.h>
49 #include <face_quda.h>
50 
51 #include <inline_ptx.h>
52 
53 enum KernelType {
59 };
60 
61 namespace quda {
62 
63  struct DslashParam {
64  int threads; // the desired number of active threads
65  int parity; // Even-Odd or Odd-Even
66  int commDim[QUDA_MAX_DIM]; // Whether to do comms or not
67  int ghostDim[QUDA_MAX_DIM]; // Whether a ghost zone has been allocated for a given dimension
70  KernelType kernel_type; //is it INTERIOR_KERNEL, EXTERIOR_KERNEL_X/Y/Z/T
71 
72 #ifdef USE_TEXTURE_OBJECTS
73  cudaTextureObject_t inTex;
74  cudaTextureObject_t inTexNorm;
75  cudaTextureObject_t xTex;
76  cudaTextureObject_t xTexNorm;
77  cudaTextureObject_t outTex;
78  cudaTextureObject_t outTexNorm;
79  cudaTextureObject_t gauge0Tex; // also applies to fat gauge
80  cudaTextureObject_t gauge1Tex; // also applies to fat gauge
81  cudaTextureObject_t longGauge0Tex;
82  cudaTextureObject_t longGauge1Tex;
83  cudaTextureObject_t cloverTex;
84  cudaTextureObject_t cloverNormTex;
85 #endif
86  };
87 
89 
90  // these are set in initDslashConst
91  int Vspatial;
92 
93  static cudaEvent_t packEnd[Nstream];
94  static cudaEvent_t gatherStart[Nstream];
95  static cudaEvent_t gatherEnd[Nstream];
96  static cudaEvent_t scatterStart[Nstream];
97  static cudaEvent_t scatterEnd[Nstream];
98  static cudaEvent_t dslashStart;
99  static cudaEvent_t dslashEnd;
100 
101  static struct timeval dslashStart_h;
102 #ifdef MULTI_GPU
103  static struct timeval commsStart[Nstream];
104  static struct timeval commsEnd[Nstream];
105 #endif
106 
107  // these events are only used for profiling
108 #ifdef DSLASH_PROFILING
109 #define DSLASH_TIME_PROFILE() dslashTimeProfile()
110 
111  static cudaEvent_t packStart[Nstream];
112  static cudaEvent_t kernelStart[Nstream];
113  static cudaEvent_t kernelEnd[Nstream];
114 
115  // dimension 2 because we want absolute and relative
116  float packTime[Nstream][2];
117  float gatherTime[Nstream][2];
118  float commsTime[Nstream][2];
119  float scatterTime[Nstream][2];
120  float kernelTime[Nstream][2];
121  float dslashTime;
122 #define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
123 #else
124 #define CUDA_EVENT_RECORD(a,b)
125 #define DSLASH_TIME_PROFILE()
126 #endif
127 
128  static FaceBuffer *face;
129  static cudaColorSpinorField *inSpinor;
130 
131  // For tuneLaunch() to uniquely identify a suitable set of launch parameters, we need copies of a few of
132  // the constants set by initDslashConstants().
133  static struct {
134  int x[4];
135  int Ls;
136  unsigned long long VolumeCB() { return x[0]*x[1]*x[2]*x[3]/2; }
137  // In the future, we may also want to add gauge_fixed, sp_stride, ga_stride, cl_stride, etc.
138  } dslashConstants;
139 
140  // dslashTuning = QUDA_TUNE_YES enables autotuning when the dslash is
141  // first launched
142  static QudaTune dslashTuning = QUDA_TUNE_NO;
143  static QudaVerbosity verbosity = QUDA_SILENT;
144 
146  {
147  dslashTuning = tune;
148  verbosity = verbose;
149  }
150 
151  // determines whether the temporal ghost zones are packed with a gather kernel,
152  // as opposed to multiple calls to cudaMemcpy()
153  static bool kernelPackT = false;
154 
155  void setKernelPackT(bool packT) { kernelPackT = packT; }
156 
157  bool getKernelPackT() { return kernelPackT; }
158 
159 
160 #include <dslash_textures.h>
161 #include <dslash_constants.h>
162 
163 #if defined(DIRECT_ACCESS_LINK) || defined(DIRECT_ACCESS_WILSON_SPINOR) || \
164  defined(DIRECT_ACCESS_WILSON_ACCUM) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
165  defined(DIRECT_ACCESS_WILSON_INTER) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
166  defined(DIRECT_ACCESS_CLOVER)
167 
168  static inline __device__ float short2float(short a) {
169  return (float)a/MAX_SHORT;
170  }
171 
172  static inline __device__ short float2short(float c, float a) {
173  return (short)(a*c*MAX_SHORT);
174  }
175 
176  static inline __device__ short4 float42short4(float c, float4 a) {
177  return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
178  }
179 
180  static inline __device__ float4 short42float4(short4 a) {
181  return make_float4(short2float(a.x), short2float(a.y), short2float(a.z), short2float(a.w));
182  }
183 
184  static inline __device__ float2 short22float2(short2 a) {
185  return make_float2(short2float(a.x), short2float(a.y));
186  }
187 #endif // DIRECT_ACCESS inclusions
188 
189  // Enable shared memory dslash for Fermi architecture
190  //#define SHARED_WILSON_DSLASH
191  //#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access
192 
193 #include <pack_face_def.h> // kernels for packing the ghost zones and general indexing
194 #include <staggered_dslash_def.h> // staggered Dslash kernels
195 #include <wilson_dslash_def.h> // Wilson Dslash kernels (including clover)
196 #include <dw_dslash_def.h> // Domain Wall kernels
197 #include <tm_dslash_def.h> // Twisted Mass kernels
198 #include <tm_core.h> // solo twisted mass kernel
199 #include <clover_def.h> // kernels for applying the clover term alone
200 #include <tm_ndeg_dslash_def.h> // Non-degenerate twisted Mass
201 
202 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
203 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
204 #endif
205 
206 #ifndef CLOVER_SHARED_FLOATS_PER_THREAD
207 #define CLOVER_SHARED_FLOATS_PER_THREAD 0
208 #endif
209 
210 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD
211 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0
212 #endif
213 
214 
215  void setFace(const FaceBuffer &Face) {
216  face = (FaceBuffer*)&Face; // nasty
217  }
218 
219 
221  {
222 #ifndef DSLASH_PROFILING
223  // add cudaEventDisableTiming for lower sync overhead
224  for (int i=0; i<Nstream; i++) {
225  cudaEventCreate(&packEnd[i], cudaEventDisableTiming);
226  cudaEventCreate(&gatherStart[i], cudaEventDisableTiming);
227  cudaEventCreate(&gatherEnd[i], cudaEventDisableTiming);
228  cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
229  cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
230  }
231  cudaEventCreateWithFlags(&dslashStart, cudaEventDisableTiming);
232  cudaEventCreateWithFlags(&dslashEnd, cudaEventDisableTiming);
233 #else
234  cudaEventCreate(&dslashStart);
235  cudaEventCreate(&dslashEnd);
236  for (int i=0; i<Nstream; i++) {
237  cudaEventCreate(&packStart[i]);
238  cudaEventCreate(&packEnd[i]);
239 
240  cudaEventCreate(&gatherStart[i]);
241  cudaEventCreate(&gatherEnd[i]);
242 
243  cudaEventCreate(&scatterStart[i]);
244  cudaEventCreate(&scatterEnd[i]);
245 
246  cudaEventCreate(&kernelStart[i]);
247  cudaEventCreate(&kernelEnd[i]);
248 
249  kernelTime[i][0] = 0.0;
250  kernelTime[i][1] = 0.0;
251 
252  gatherTime[i][0] = 0.0;
253  gatherTime[i][1] = 0.0;
254 
255  commsTime[i][0] = 0.0;
256  commsTime[i][1] = 0.0;
257 
258  scatterTime[i][0] = 0.0;
259  scatterTime[i][1] = 0.0;
260  }
261 #endif
262 
263  checkCudaError();
264  }
265 
266 
268  {
269  for (int i=0; i<Nstream; i++) {
270  cudaEventDestroy(packEnd[i]);
271  cudaEventDestroy(gatherStart[i]);
272  cudaEventDestroy(gatherEnd[i]);
273  cudaEventDestroy(scatterStart[i]);
274  cudaEventDestroy(scatterEnd[i]);
275  }
276 
277  cudaEventDestroy(dslashStart);
278  cudaEventDestroy(dslashEnd);
279 
280 #ifdef DSLASH_PROFILING
281  for (int i=0; i<Nstream; i++) {
282  cudaEventDestroy(packStart[i]);
283  cudaEventDestroy(kernelStart[i]);
284  cudaEventDestroy(kernelEnd[i]);
285  }
286 #endif
287 
288  checkCudaError();
289  }
290 
291 
292 #define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
293  if (x==0) { \
294  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
295  FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
296  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
297  FUNC ## 12 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
298  } else { \
299  FUNC ## 8 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
300  } \
301  } else { \
302  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
303  FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
304  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
305  FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
306  } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
307  FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
308  } \
309  }
310 
311 #ifndef MULTI_GPU
312 
313 #define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
314  switch(param.kernel_type) { \
315  case INTERIOR_KERNEL: \
316  MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
317  break; \
318  default: \
319  errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
320  }
321 
322 #else
323 
324 #define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
325  switch(param.kernel_type) { \
326  case INTERIOR_KERNEL: \
327  MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
328  break; \
329  case EXTERIOR_KERNEL_X: \
330  MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
331  break; \
332  case EXTERIOR_KERNEL_Y: \
333  MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
334  break; \
335  case EXTERIOR_KERNEL_Z: \
336  MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
337  break; \
338  case EXTERIOR_KERNEL_T: \
339  MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
340  break; \
341  }
342 
343 #endif
344 
345  // macro used for dslash types with dagger kernel defined (Wilson, domain wall, etc.)
346 #define DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...) \
347  if (!dagger) { \
348  GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
349  } else { \
350  GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
351  }
352 
353  // macro used for staggered dslash
354 #define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param, ...) \
355  GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param, __VA_ARGS__)
356 
357 
358 #define MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
359  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
360  FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
361  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
362  FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
363  } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
364  FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
365  }
366 
367 #ifndef MULTI_GPU
368 
369 #define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
370  switch(param.kernel_type) { \
371  case INTERIOR_KERNEL: \
372  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
373  break; \
374  default: \
375  errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
376  }
377 
378 #else
379 
380 #define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
381  switch(param.kernel_type) { \
382  case INTERIOR_KERNEL: \
383  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
384  break; \
385  case EXTERIOR_KERNEL_X: \
386  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
387  break; \
388  case EXTERIOR_KERNEL_Y: \
389  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
390  break; \
391  case EXTERIOR_KERNEL_Z: \
392  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
393  break; \
394  case EXTERIOR_KERNEL_T: \
395  MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
396  break; \
397  }
398 
399 #endif
400 
401  // macro used for dslash types with dagger kernel defined (Wilson, domain wall, etc.)
402 #define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...) \
403  if (!dagger) { \
404  GENERIC_ASYM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
405  } else { \
406  GENERIC_ASYM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
407  }
408 
409 
410  // Use an abstract class interface to drive the different CUDA dslash
411  // kernels. All parameters are curried into the derived classes to
412  // allow a simple interface.
413  class DslashCuda : public Tunable {
414  protected:
419 
420  int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
421  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
423  bool advance = Tunable::advanceBlockDim(param);
424  if (advance) {
425  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
426  if (param.grid.x > deviceProp.maxGridSize[0]) {
427  warningQuda("Autotuner is skipping blockDim=%u (gridDim=%u) because lattice volume is too large",
428  param.block.x, param.grid.x);
429  advance = advanceBlockDim(param);
430  }
431  }
432  return advance;
433  }
434 
435  public:
437  const cudaColorSpinorField *x)
438  : out(out), in(in), x(x), saveOut(0), saveOutNorm(0) { }
439  virtual ~DslashCuda() { }
440  virtual TuneKey tuneKey() const;
441  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
442  {
443  std::stringstream ps;
444  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
445  ps << "shared=" << param.shared_bytes;
446  return ps.str();
447  }
448  virtual int Nface() { return 2; }
449 
450  virtual void initTuneParam(TuneParam &param) const
451  {
452  Tunable::initTuneParam(param);
453  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
454  if (param.grid.x > deviceProp.maxGridSize[0]) {
455  warningQuda("Autotuner is skipping blockDim=%u (gridDim=%u) because lattice volume is too large",
456  param.block.x, param.grid.x);
457  bool ok = advanceBlockDim(param);
458  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
459  }
460  }
461 
463  virtual void defaultTuneParam(TuneParam &param) const
464  {
466  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x, 1, 1);
467  if (param.grid.x > deviceProp.maxGridSize[0]) {
468  errorQuda("Lattice volume is too large for default blockDim");
469  }
470  }
471 
472  virtual void preTune()
473  {
474  if (dslashParam.kernel_type < 5) { // exterior kernel
475  saveOut = new char[in->Bytes()];
476  cudaMemcpy(saveOut, out->V(), in->Bytes(), cudaMemcpyDeviceToHost);
477  if (out->Precision() == QUDA_HALF_PRECISION) {
478  saveOutNorm = new char[in->NormBytes()];
479  cudaMemcpy(saveOutNorm, out->Norm(), in->NormBytes(), cudaMemcpyDeviceToHost);
480  }
481  }
482  }
483 
484  virtual void postTune()
485  {
486  if (dslashParam.kernel_type < 5) { // exterior kernel
487  cudaMemcpy(out->V(), saveOut, in->Bytes(), cudaMemcpyHostToDevice);
488  delete[] saveOut;
489  if (out->Precision() == QUDA_HALF_PRECISION) {
490  cudaMemcpy(out->Norm(), saveOutNorm, in->NormBytes(), cudaMemcpyHostToDevice);
491  delete[] saveOutNorm;
492  }
493  }
494  }
495 
496  };
497 
499  {
500  std::stringstream vol, aux;
501 
502  vol << dslashConstants.x[0] << "x";
503  vol << dslashConstants.x[1] << "x";
504  vol << dslashConstants.x[2] << "x";
505  vol << dslashConstants.x[3];
506 
507  aux << "type=";
508 #ifdef MULTI_GPU
509  char comm[5], ghost[5];
510  switch (dslashParam.kernel_type) {
511  case INTERIOR_KERNEL: aux << "interior"; break;
512  case EXTERIOR_KERNEL_X: aux << "exterior_x"; break;
513  case EXTERIOR_KERNEL_Y: aux << "exterior_y"; break;
514  case EXTERIOR_KERNEL_Z: aux << "exterior_z"; break;
515  case EXTERIOR_KERNEL_T: aux << "exterior_t"; break;
516  }
517  for (int i=0; i<4; i++) {
518  comm[i] = (dslashParam.commDim[i] ? '1' : '0');
519  ghost[i] = (dslashParam.ghostDim[i] ? '1' : '0');
520  }
521  comm[4] = '\0'; ghost[4] = '\0';
522  aux << ",comm=" << comm;
524  aux << ",ghost=" << ghost;
525  }
526 #else
527  aux << "single-GPU";
528 #endif // MULTI_GPU
529  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
530  }
531 
535 #if (__COMPUTE_CAPABILITY__ >= 200 && defined(SHARED_WILSON_DSLASH))
536  class SharedDslashCuda : public DslashCuda {
537  protected:
538  int sharedBytesPerBlock(const TuneParam &param) const { return 0; } // FIXME: this isn't quite true, but works
539  bool advanceSharedBytes(TuneParam &param) const {
541  else return false;
542  } // FIXME - shared memory tuning only supported on exterior kernels
543 
545  int sharedBytes(const dim3 &block) const {
546  int warpSize = 32; // FIXME - query from device properties
547  int block_xy = block.x*block.y;
548  if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
549  return block_xy*block.z*sharedBytesPerThread();
550  }
551 
553  dim3 createGrid(const dim3 &block) const {
554  unsigned int gx = ((dslashConstants.x[0]/2)*dslashConstants.x[3] + block.x - 1) / block.x;
555  unsigned int gy = (dslashConstants.x[1] + block.y - 1 ) / block.y;
556  unsigned int gz = (dslashConstants.x[2] + block.z - 1) / block.z;
557  return dim3(gx, gy, gz);
558  }
559 
561  bool advanceBlockDim(TuneParam &param) const {
563  const unsigned int min_threads = 2;
564  const unsigned int max_threads = 512; // FIXME: use deviceProp.maxThreadsDim[0];
565  const unsigned int max_shared = 16384*3; // FIXME: use deviceProp.sharedMemPerBlock;
566 
567  // set the x-block dimension equal to the entire x dimension
568  bool set = false;
569  dim3 blockInit = param.block;
570  blockInit.z++;
571  for (unsigned bx=blockInit.x; bx<=dslashConstants.x[0]/2; bx++) {
572  //unsigned int gx = (dslashConstants.x[0]*dslashConstants.x[3] + bx - 1) / bx;
573  for (unsigned by=blockInit.y; by<=dslashConstants.x[1]; by++) {
574  unsigned int gy = (dslashConstants.x[1] + by - 1 ) / by;
575 
576  if (by > 1 && (by%2) != 0) continue; // can't handle odd blocks yet except by=1
577 
578  for (unsigned bz=blockInit.z; bz<=dslashConstants.x[2]; bz++) {
579  unsigned int gz = (dslashConstants.x[2] + bz - 1) / bz;
580 
581  if (bz > 1 && (bz%2) != 0) continue; // can't handle odd blocks yet except bz=1
582  if (bx*by*bz > max_threads) continue;
583  if (bx*by*bz < min_threads) continue;
584  // can't yet handle the last block properly in shared memory addressing
585  if (by*gy != dslashConstants.x[1]) continue;
586  if (bz*gz != dslashConstants.x[2]) continue;
587  if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;
588 
589  param.block = dim3(bx, by, bz);
590  set = true; break;
591  }
592  if (set) break;
593  blockInit.z = 1;
594  }
595  if (set) break;
596  blockInit.y = 1;
597  }
598 
599  if (param.block.x > dslashConstants.x[0]/2 && param.block.y > dslashConstants.x[1] &&
600  param.block.z > dslashConstants.x[2] || !set) {
601  //||sharedBytesPerThread()*param.block.x > max_shared) {
602  param.block = dim3(dslashConstants.x[0]/2, 1, 1);
603  return false;
604  } else {
605  param.grid = createGrid(param.block);
606  param.shared_bytes = sharedBytes(param.block);
607  return true;
608  }
609 
610  }
611 
612  public:
613  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
614  const cudaColorSpinorField *x) : DslashCuda(out, in, x) { ; }
615  virtual ~SharedDslashCuda() { ; }
616  std::string paramString(const TuneParam &param) const // override and print out grid as well
617  {
618  std::stringstream ps;
619  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
620  ps << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
621  ps << "shared=" << param.shared_bytes;
622  return ps.str();
623  }
624 
625  virtual void initTuneParam(TuneParam &param) const
626  {
628 
629  param.block = dim3(dslashConstants.x[0]/2, 1, 1);
630  param.grid = createGrid(param.block);
631  param.shared_bytes = sharedBytes(param.block);
632  }
633 
635  virtual void defaultTuneParam(TuneParam &param) const
636  {
638  else initTuneParam(param);
639  }
640  };
641 #else
642  class SharedDslashCuda : public DslashCuda {
643  public:
645  const cudaColorSpinorField *x) : DslashCuda(out, in, x) { }
646  virtual ~SharedDslashCuda() { }
647  };
648 #endif
649 
650 
651  template <typename sFloat, typename gFloat>
653 
654  private:
655  const gFloat *gauge0, *gauge1;
656  const QudaReconstructType reconstruct;
657  const int dagger;
658  const double a;
659 
660  protected:
661  int sharedBytesPerThread() const
662  {
663 #if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
664  if (dslashParam.kernel_type == INTERIOR_KERNEL) { // Interior kernels use shared memory for common iunput
665  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
666  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
667  } else { // Exterior kernels use no shared memory
668  return 0;
669  }
670 #else // Pre-Fermi uses shared memory only for pseudo-registers
671  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
672  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
673 #endif
674  }
675 
676  public:
677  WilsonDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
678  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
679  const cudaColorSpinorField *x, const double a, const int dagger)
680  : SharedDslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1),
681  reconstruct(reconstruct), dagger(dagger), a(a)
682  {
683  bindSpinorTex<sFloat>(in, out, x);
684  }
685 
686  virtual ~WilsonDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
687 
688  TuneKey tuneKey() const
689  {
690  TuneKey key = DslashCuda::tuneKey();
691  std::stringstream recon;
692  recon << reconstruct;
693  key.aux += ",reconstruct=" + recon.str();
694  if (x) key.aux += ",Xpay";
695  return key;
696  }
697 
698  void apply(const cudaStream_t &stream)
699  {
700 #ifdef SHARED_WILSON_DSLASH
702  errorQuda("Shared dslash does not yet support X-dimension partitioning");
703 #endif
704  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
705  DSLASH(dslash, tp.grid, tp.block, tp.shared_bytes, stream,
706  dslashParam, (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
707  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
708  }
709 
710  long long flops() const { return (x ? 1368ll : 1320ll) * dslashConstants.VolumeCB(); } // FIXME for multi-GPU
711  };
712 
713  template <typename sFloat, typename gFloat, typename cFloat>
715 
716  private:
717  const gFloat *gauge0, *gauge1;
718  const QudaReconstructType reconstruct;
719  const cFloat *clover;
720  const float *cloverNorm;
721  const int dagger;
722  const double a;
723 
724  protected:
725  int sharedBytesPerThread() const
726  {
727 #if (__COMPUTE_CAPABILITY__ >= 200)
729  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
730  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
731  } else {
732  return 0;
733  }
734 #else
735  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
736  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
737 #endif
738  }
739  public:
740  CloverDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
741  const QudaReconstructType reconstruct, const cFloat *clover,
742  const float *cloverNorm, const cudaColorSpinorField *in,
743  const cudaColorSpinorField *x, const double a, const int dagger)
744  : SharedDslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), clover(clover),
745  cloverNorm(cloverNorm), reconstruct(reconstruct), dagger(dagger), a(a)
746  {
747  bindSpinorTex<sFloat>(in, out, x);
748  }
749  virtual ~CloverDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
750 
751  TuneKey tuneKey() const
752  {
753  TuneKey key = DslashCuda::tuneKey();
754  std::stringstream recon;
755  recon << reconstruct;
756  key.aux += ",reconstruct=" + recon.str();
757  if (x) key.aux += ",Xpay";
758  return key;
759  }
760 
761  void apply(const cudaStream_t &stream)
762  {
763 #ifdef SHARED_WILSON_DSLASH
765  errorQuda("Shared dslash does not yet support X-dimension partitioning");
766 #endif
767  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
768  DSLASH(cloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
769  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cloverNorm,
770  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
771  }
772 
773  long long flops() const { return (x ? 1872ll : 1824ll) * dslashConstants.VolumeCB(); } // FIXME for multi-GPU
774  };
775 
776  template <typename sFloat, typename gFloat, typename cFloat>
778 
779  private:
780  const gFloat *gauge0, *gauge1;
781  const QudaReconstructType reconstruct;
782  const cFloat *clover;
783  const float *cloverNorm;
784  const int dagger;
785  const double a;
786 
787  protected:
788  int sharedBytesPerThread() const
789  {
790 #if (__COMPUTE_CAPABILITY__ >= 200)
792  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
793  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
794  } else {
795  return 0;
796  }
797 #else
798  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
799  return DSLASH_SHARED_FLOATS_PER_THREAD * reg_size;
800 #endif
801  }
802 
803  public:
804  AsymCloverDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
805  const QudaReconstructType reconstruct, const cFloat *clover,
806  const float *cloverNorm, const cudaColorSpinorField *in,
807  const cudaColorSpinorField *x, const double a, const int dagger)
808  : SharedDslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), clover(clover),
809  cloverNorm(cloverNorm), reconstruct(reconstruct), dagger(dagger), a(a)
810  {
811  bindSpinorTex<sFloat>(in, out, x);
812  if (!x) errorQuda("Asymmetric clover dslash only defined for Xpay");
813  }
814  virtual ~AsymCloverDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
815 
816  TuneKey tuneKey() const
817  {
818  TuneKey key = DslashCuda::tuneKey();
819  std::stringstream recon;
820  recon << reconstruct;
821  key.aux += ",reconstruct=" + recon.str() + ",Xpay";
822  return key;
823  }
824 
825  void apply(const cudaStream_t &stream)
826  {
827 #ifdef SHARED_WILSON_DSLASH
829  errorQuda("Shared dslash does not yet support X-dimension partitioning");
830 #endif
831  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
832  ASYM_DSLASH(asymCloverDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
833  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1, clover, cloverNorm,
834  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)x, (float*)x->Norm(), a);
835  }
836 
837  long long flops() const { return 1872ll * dslashConstants.VolumeCB(); } // FIXME for multi-GPU
838  };
839 
840  void setTwistParam(double &a, double &b, const double &kappa, const double &mu,
841  const int dagger, const QudaTwistGamma5Type twist) {
842  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
843  a = 2.0 * kappa * mu;
844  b = 1.0;
845  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
846  a = -2.0 * kappa * mu;
847  b = 1.0 / (1.0 + a*a);
848  } else {
849  errorQuda("Twist type %d not defined\n", twist);
850  }
851  if (dagger) a *= -1.0;
852 
853  }
854 
855  template <typename sFloat, typename gFloat>
857 
858  private:
859  const gFloat *gauge0, *gauge1;
860  const QudaReconstructType reconstruct;
861  const int dagger;
862  double a, b, c;
863 
864  protected:
865  int sharedBytesPerThread() const
866  {
867 #if (__COMPUTE_CAPABILITY__ >= 200)
869  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
870  return ((in->TwistFlavor() == QUDA_TWIST_PLUS || in->TwistFlavor() == QUDA_TWIST_MINUS) ? DSLASH_SHARED_FLOATS_PER_THREAD * reg_size : NDEGTM_SHARED_FLOATS_PER_THREAD * reg_size);
871  } else {
872  return 0;
873  }
874 #else
875  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
876  return ((in->TwistFlavor() == QUDA_TWIST_PLUS || in->TwistFlavor() == QUDA_TWIST_MINUS) ? DSLASH_SHARED_FLOATS_PER_THREAD * reg_size : NDEGTM_SHARED_FLOATS_PER_THREAD * reg_size);
877 #endif
878  }
879 
880  public:
881  TwistedDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
882  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
883  const cudaColorSpinorField *x, const double kappa, const double mu,
884  const double epsilon, const int dagger)
885  : SharedDslashCuda(out, in, x),gauge0(gauge0), gauge1(gauge1),
886  reconstruct(reconstruct), dagger(dagger)
887  {
888  bindSpinorTex<sFloat>(in, out, x);
889 
890  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS))
891  {
892  setTwistParam(a, b, kappa, mu, dagger, QUDA_TWIST_GAMMA5_INVERSE);
893  if (x) b *= epsilon; //reuse this parameter for degenerate twisted mass
894  c = 0;
895  }
896  else{//twist doublet:
897  a = kappa, b = mu, c = epsilon;
898  }
899  }
900  virtual ~TwistedDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
901 
902  TuneKey tuneKey() const
903  {
904  TuneKey key = DslashCuda::tuneKey();
905  std::stringstream recon;
906  recon << reconstruct;
907  key.aux += ",reconstruct=" + recon.str();
908  if (x) key.aux += ",Xpay";
909  return key;
910  }
911 
912  void apply(const cudaStream_t &stream)
913  {
914 #ifdef SHARED_WILSON_DSLASH
916  errorQuda("Shared dslash does not yet support X-dimension partitioning");
917 #endif
918  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
919 
920  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)){
921  DSLASH(twistedMassDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
922  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
923  (sFloat*)in->V(), (float*)in->Norm(), a, b, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0));
924  }
925  else{
926  DSLASH(twistedNdegMassDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
927  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
928  (sFloat*)in->V(), (float*)in->Norm(), a, b, c, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0));
929  }
930  }
931 
932  long long flops() const { return (x ? 1416ll : 1392ll) * dslashConstants.VolumeCB(); } // FIXME for multi-GPU
933  };
934 
935  template <typename sFloat, typename gFloat>
937 
938  private:
939  const gFloat *gauge0, *gauge1;
940  const QudaReconstructType reconstruct;
941  const int dagger;
942  const double mferm;
943  const double a;
944 
945  bool checkGrid(TuneParam &param) const {
946  if (param.grid.x > deviceProp.maxGridSize[0] || param.grid.y > deviceProp.maxGridSize[1]) {
947  warningQuda("Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
948  param.block.x, param.block.y, param.block.z,
949  param.grid.x, param.grid.y, param.grid.z);
950  return false;
951  } else {
952  return true;
953  }
954  }
955 
956  protected:
957  bool advanceBlockDim(TuneParam &param) const
958  {
959  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
960  const int step[2] = { deviceProp.warpSize, 1 };
961  bool advance[2] = { false, false };
962 
963  // first try to advance block.x
964  param.block.x += step[0];
965  if (param.block.x > deviceProp.maxThreadsDim[0] ||
966  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
967  advance[0] = false;
968  param.block.x = step[0]; // reset block.x
969  } else {
970  advance[0] = true; // successfully advanced block.x
971  }
972 
973  if (!advance[0]) { // if failed to advance block.x, now try block.y
974  param.block.y += step[1];
975 
976  if (param.block.y > in->X(4) ||
977  sharedBytesPerThread()*param.block.x*param.block.y > max_shared) {
978  advance[1] = false;
979  param.block.y = step[1]; // reset block.x
980  } else {
981  advance[1] = true; // successfully advanced block.y
982  }
983  }
984 
985  if (advance[0] || advance[1]) {
986  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
987  (in->X(4)+param.block.y-1) / param.block.y, 1);
988 
989  bool advance = true;
990  if (!checkGrid(param)) advance = advanceBlockDim(param);
991  return advance;
992  } else {
993  return false;
994  }
995  }
996 
997  int sharedBytesPerThread() const { return 0; }
998 
999  public:
1000  DomainWallDslashCuda(cudaColorSpinorField *out, const gFloat *gauge0, const gFloat *gauge1,
1001  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
1002  const cudaColorSpinorField *x, const double mferm,
1003  const double a, const int dagger)
1004  : DslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), mferm(mferm),
1005  reconstruct(reconstruct), dagger(dagger), a(a)
1006  {
1007  bindSpinorTex<sFloat>(in, out, x);
1008  }
1009  virtual ~DomainWallDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
1010 
1011  virtual void initTuneParam(TuneParam &param) const
1012  {
1013  Tunable::initTuneParam(param);
1014  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
1015  (in->X(4)+param.block.y-1) / param.block.y, 1);
1016  bool ok = true;
1017  if (!checkGrid(param)) ok = advanceBlockDim(param);
1018  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
1019  }
1020 
1022  virtual void defaultTuneParam(TuneParam &param) const
1023  {
1024  Tunable::defaultTuneParam(param);
1025  param.grid = dim3( (dslashParam.threads+param.block.x-1) / param.block.x,
1026  (in->X(4)+param.block.y-1) / param.block.y, 1);
1027  bool ok = true;
1028  if (!checkGrid(param)) ok = advanceBlockDim(param);
1029  if (!ok) errorQuda("Lattice volume is too large for even the largest blockDim");
1030  }
1031 
1032  TuneKey tuneKey() const
1033  {
1034  TuneKey key = DslashCuda::tuneKey();
1035  std::stringstream ls, recon;
1036  ls << dslashConstants.Ls;
1037  recon << reconstruct;
1038  key.volume += "x" + ls.str();
1039  key.aux += ",reconstruct=" + recon.str();
1040  if (x) key.aux += ",Xpay";
1041  return key;
1042  }
1043 
1044  void apply(const cudaStream_t &stream)
1045  {
1046  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1047  DSLASH(domainWallDslash, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
1048  (sFloat*)out->V(), (float*)out->Norm(), gauge0, gauge1,
1049  (sFloat*)in->V(), (float*)in->Norm(), mferm, (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
1050  }
1051 
1052  long long flops() const { // FIXME for multi-GPU
1053  long long bulk = (dslashConstants.Ls-2)*(dslashConstants.VolumeCB()/dslashConstants.Ls);
1054  long long wall = 2*dslashConstants.VolumeCB()/dslashConstants.Ls;
1055  return (x ? 1368ll : 1320ll)*dslashConstants.VolumeCB()*dslashConstants.Ls + 96ll*bulk + 120ll*wall;
1056  }
1057  };
1058 
1059  template <typename sFloat, typename fatGFloat, typename longGFloat>
1061 
1062  private:
1063  const fatGFloat *fat0, *fat1;
1064  const longGFloat *long0, *long1;
1065  const QudaReconstructType reconstruct;
1066  const int dagger;
1067  const double a;
1068 
1069  protected:
1070  int sharedBytesPerThread() const
1071  {
1072  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
1073  return 6 * reg_size;
1074  }
1075 
1076  public:
1077  StaggeredDslashCuda(cudaColorSpinorField *out, const fatGFloat *fat0, const fatGFloat *fat1,
1078  const longGFloat *long0, const longGFloat *long1,
1079  const QudaReconstructType reconstruct, const cudaColorSpinorField *in,
1080  const cudaColorSpinorField *x, const double a, const int dagger)
1081  : DslashCuda(out, in, x), fat0(fat0), fat1(fat1), long0(long0), long1(long1),
1082  reconstruct(reconstruct), dagger(dagger), a(a)
1083  {
1084  bindSpinorTex<sFloat>(in, out, x);
1085  }
1086 
1087  virtual ~StaggeredDslashCuda() { unbindSpinorTex<sFloat>(in, out, x); }
1088 
1089  TuneKey tuneKey() const
1090  {
1091  TuneKey key = DslashCuda::tuneKey();
1092  std::stringstream recon;
1093  recon << reconstruct;
1094  key.aux += ",reconstruct=" + recon.str();
1095  if (x) key.aux += ",Axpy";
1096  return key;
1097  }
1098 
1099  void apply(const cudaStream_t &stream)
1100  {
1101  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1102  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
1103  STAGGERED_DSLASH(gridDim, tp.block, tp.shared_bytes, stream, dslashParam,
1104  (sFloat*)out->V(), (float*)out->Norm(), fat0, fat1, long0, long1,
1105  (sFloat*)in->V(), (float*)in->Norm(), (sFloat*)(x ? x->V() : 0), (float*)(x ? x->Norm() : 0), a);
1106  }
1107 
1108  int Nface() { return 6; }
1109 
1110  long long flops() const { return (x ? 1158ll : 1146ll) * dslashConstants.VolumeCB(); } // FIXME for multi-GPU
1111  };
1112 
1113 #ifdef DSLASH_PROFILING
1114 
1115 #define TDIFF(a,b) 1e3*(b.tv_sec - a.tv_sec + 1e-6*(b.tv_usec - a.tv_usec))
1116 
1117  void dslashTimeProfile() {
1118 
1119  cudaEventSynchronize(dslashEnd);
1120  float runTime;
1121  cudaEventElapsedTime(&runTime, dslashStart, dslashEnd);
1122  dslashTime += runTime;
1123 
1124  for (int i=4; i>=0; i--) {
1125  if (!dslashParam.commDim[i] && i<4) continue;
1126 
1127  // kernel timing
1128  cudaEventElapsedTime(&runTime, dslashStart, kernelStart[2*i]);
1129  kernelTime[2*i][0] += runTime; // start time
1130  cudaEventElapsedTime(&runTime, dslashStart, kernelEnd[2*i]);
1131  kernelTime[2*i][1] += runTime; // end time
1132  }
1133 
1134 #ifdef MULTI_GPU
1135  for (int i=3; i>=0; i--) {
1136  if (!dslashParam.commDim[i]) continue;
1137 
1138  for (int dir = 0; dir < 2; dir ++) {
1139  // pack timing
1140  cudaEventElapsedTime(&runTime, dslashStart, packStart[2*i+dir]);
1141  packTime[2*i+dir][0] += runTime; // start time
1142  cudaEventElapsedTime(&runTime, dslashStart, packEnd[2*i+dir]);
1143  packTime[2*i+dir][1] += runTime; // end time
1144 
1145  // gather timing
1146  cudaEventElapsedTime(&runTime, dslashStart, gatherStart[2*i+dir]);
1147  gatherTime[2*i+dir][0] += runTime; // start time
1148  cudaEventElapsedTime(&runTime, dslashStart, gatherEnd[2*i+dir]);
1149  gatherTime[2*i+dir][1] += runTime; // end time
1150 
1151  // comms timing
1152  runTime = TDIFF(dslashStart_h, commsStart[2*i+dir]);
1153  commsTime[2*i+dir][0] += runTime; // start time
1154  runTime = TDIFF(dslashStart_h, commsEnd[2*i+dir]);
1155  commsTime[2*i+dir][1] += runTime; // end time
1156 
1157  // scatter timing
1158  cudaEventElapsedTime(&runTime, dslashStart, scatterStart[2*i+dir]);
1159  scatterTime[2*i+dir][0] += runTime; // start time
1160  cudaEventElapsedTime(&runTime, dslashStart, scatterEnd[2*i+dir]);
1161  scatterTime[2*i+dir][1] += runTime; // end time
1162  }
1163  }
1164 #endif
1165 
1166  }
1167 
1168  void printDslashProfile() {
1169 
1170  printfQuda("Total Dslash time = %6.2f\n", dslashTime);
1171 
1172  char dimstr[8][8] = {"X-", "X+", "Y-", "Y+", "Z-", "Z+", "T-", "T+"};
1173 
1174  printfQuda(" %13s %13s %13s %13s %13s\n", "Pack", "Gather", "Comms", "Scatter", "Kernel");
1175  printfQuda(" %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
1176  "Start", "End", "Start", "End", "Start", "End", "Start", "End", "Start", "End");
1177 
1178  printfQuda("%8s %55s %6.2f %6.2f\n", "Interior", "", kernelTime[8][0], kernelTime[8][1]);
1179 
1180  for (int i=3; i>=0; i--) {
1181  if (!dslashParam.commDim[i]) continue;
1182 
1183  for (int dir = 0; dir < 2; dir ++) {
1184  printfQuda("%8s ", dimstr[2*i+dir]);
1185 #ifdef MULTI_GPU
1186  printfQuda("%6.2f %6.2f ", packTime[2*i+dir][0], packTime[2*i+dir][1]);
1187  printfQuda("%6.2f %6.2f ", gatherTime[2*i+dir][0], gatherTime[2*i+dir][1]);
1188  printfQuda("%6.2f %6.2f ", commsTime[2*i+dir][0], commsTime[2*i+dir][1]);
1189  printfQuda("%6.2f %6.2f ", scatterTime[2*i+dir][0], scatterTime[2*i+dir][1]);
1190 #endif
1191 
1192  if (dir==0) printfQuda("%6.2f %6.2f\n", kernelTime[2*i][0], kernelTime[2*i][1]);
1193  else printfQuda("\n");
1194  }
1195  }
1196 
1197  }
1198 #endif
1199 
1205 
1210  for (int i=0; i<Nstream-1; i++) {
1211  gatherCompleted[i] = 0;
1212  commsCompleted[i] = 0;
1213  dslashCompleted[i] = 0;
1214  }
1215  gatherCompleted[Nstream-1] = 1;
1216  commsCompleted[Nstream-1] = 1;
1217 
1218  // We need to know which was the previous direction in which
1219  // communication was issued, since we only query a given event /
1220  // comms call after the previous the one has successfully
1221  // completed.
1222  for (int i=3; i>=0; i--) {
1223  if (dslashParam.commDim[i]) {
1224  int prev = Nstream-1;
1225  for (int j=3; j>i; j--) if (dslashParam.commDim[j]) prev = 2*j;
1226  previousDir[2*i + 1] = prev;
1227  previousDir[2*i + 0] = 2*i + 1; // always valid
1228  }
1229  }
1230 
1231  // this tells us how many events / comms occurances there are in
1232  // total. Used for exiting the while loop
1233  commDimTotal = 0;
1234  for (int i=3; i>=0; i--) commDimTotal += dslashParam.commDim[i];
1235  commDimTotal *= 4; // 2 from pipe length, 2 from direction
1236  }
1237 
1238  void dslashCuda(DslashCuda &dslash, const size_t regSize, const int parity, const int dagger,
1239  const int volume, const int *faceVolumeCB) {
1240 
1243  dslashParam.threads = volume;
1244 
1245  gettimeofday(&dslashStart_h, NULL);
1246 
1247 #ifdef MULTI_GPU
1248  // Record the start of the dslash
1249  cudaEventRecord(dslashStart, streams[Nstream-1]);
1250 
1251  // Initialize pack from source spinor
1252  face->pack(*inSpinor, 1-parity, dagger, streams);
1253 
1254  // Record the end of the packing
1255  cudaEventRecord(packEnd[0], streams[Nstream-1]);
1256 
1257  for(int i = 3; i >=0; i--){
1258  if (!dslashParam.commDim[i]) continue;
1259 
1260  for (int dir=1; dir>=0; dir--) {
1261  if (i!=3 || getKernelPackT())
1262  cudaStreamWaitEvent(streams[2*i+dir], packEnd[0], 0);
1263  else
1264  cudaStreamWaitEvent(streams[2*i+dir], dslashStart, 0);
1265 
1266  // Initialize host transfer from source spinor
1267  face->gather(*inSpinor, dagger, 2*i+dir);
1268 
1269  // Record the end of the gathering
1270  cudaEventRecord(gatherEnd[2*i+dir], streams[2*i+dir]);
1271  }
1272  }
1273 #endif
1274 
1275  dslash.apply(streams[Nstream-1]);
1276 
1277 #ifdef MULTI_GPU
1279 
1280  int completeSum = 0;
1281  while (completeSum < commDimTotal) {
1282  for (int i=3; i>=0; i--) {
1283  if (!dslashParam.commDim[i]) continue;
1284 
1285  for (int dir=1; dir>=0; dir--) {
1286 
1287  // Query if gather has completed
1288  if (!gatherCompleted[2*i+dir] && gatherCompleted[previousDir[2*i+dir]]) {
1289  if (cudaSuccess == cudaEventQuery(gatherEnd[2*i+dir])) {
1290  gatherCompleted[2*i+dir] = 1;
1291  completeSum++;
1292  gettimeofday(&commsStart[2*i+dir], NULL);
1293  face->commsStart(2*i+dir);
1294  }
1295  }
1296 
1297  // Query if comms has finished
1298  if (!commsCompleted[2*i+dir] && commsCompleted[previousDir[2*i+dir]] &&
1299  gatherCompleted[2*i+dir]) {
1300  if (face->commsQuery(2*i+dir)) {
1301  commsCompleted[2*i+dir] = 1;
1302  completeSum++;
1303  gettimeofday(&commsEnd[2*i+dir], NULL);
1304 
1305  // Scatter into the end zone
1306  face->scatter(*inSpinor, dagger, 2*i+dir);
1307  }
1308  }
1309 
1310  }
1311 
1312  // enqueue the boundary dslash kernel as soon as the scatters have been enqueued
1313  if (!dslashCompleted[2*i] && commsCompleted[2*i] && commsCompleted[2*i+1] ) {
1314  // Record the end of the scattering
1315  cudaEventRecord(scatterEnd[2*i], streams[2*i]);
1316 
1317  dslashParam.kernel_type = static_cast<KernelType>(i);
1318  dslashParam.threads = dslash.Nface()*faceVolumeCB[i]; // updating 2 or 6 faces
1319 
1320  // wait for scattering to finish and then launch dslash
1321  cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[2*i], 0);
1322 
1323  dslash.apply(streams[Nstream-1]); // all faces use this stream
1324 
1325  dslashCompleted[2*i] = 1;
1326  }
1327 
1328  }
1329 
1330  }
1331 
1332  //cudaEventRecord(dslashEnd, streams[Nstream-1]);
1333  //DSLASH_TIME_PROFILE();
1334 #endif // MULTI_GPU
1335  }
1336 
1337  // Wilson wrappers
1339  const int dagger, const cudaColorSpinorField *x, const double &k, const int *commOverride)
1340  {
1341  inSpinor = (cudaColorSpinorField*)in; // EVIL
1342 
1343 #ifdef GPU_WILSON_DIRAC
1344  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1345  for(int i=0;i<4;i++){
1346  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1347  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1348  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1349  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1350  }
1351 
1352  void *gauge0, *gauge1;
1353  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
1354 
1355  if (in->Precision() != gauge.Precision())
1356  errorQuda("Mixing gauge %d and spinor %d precision not supported",
1357  gauge.Precision(), in->Precision());
1358 
1359  DslashCuda *dslash = 0;
1360  size_t regSize = sizeof(float);
1361  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1362 #if (__COMPUTE_CAPABILITY__ >= 130)
1363  dslash = new WilsonDslashCuda<double2, double2>(out, (double2*)gauge0, (double2*)gauge1,
1364  gauge.Reconstruct(), in, x, k, dagger);
1365  regSize = sizeof(double);
1366 #else
1367  errorQuda("Double precision not supported on this GPU");
1368 #endif
1369  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1370  dslash = new WilsonDslashCuda<float4, float4>(out, (float4*)gauge0, (float4*)gauge1,
1371  gauge.Reconstruct(), in, x, k, dagger);
1372  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1373  dslash = new WilsonDslashCuda<short4, short4>(out, (short4*)gauge0, (short4*)gauge1,
1374  gauge.Reconstruct(), in, x, k, dagger);
1375  }
1376  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
1377 
1378  delete dslash;
1379  unbindGaugeTex(gauge);
1380 
1381  checkCudaError();
1382 #else
1383  errorQuda("Wilson dslash has not been built");
1384 #endif // GPU_WILSON_DIRAC
1385 
1386  }
1387 
1389  const cudaColorSpinorField *in, const int parity, const int dagger,
1390  const cudaColorSpinorField *x, const double &a, const int *commOverride)
1391  {
1392  inSpinor = (cudaColorSpinorField*)in; // EVIL
1393 
1394 #ifdef GPU_CLOVER_DIRAC
1395  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1396  for(int i=0;i<4;i++){
1397  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1398  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1399  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1400  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1401  }
1402 
1403  void *cloverP, *cloverNormP;
1404  QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, &cloverP, &cloverNormP);
1405 
1406  void *gauge0, *gauge1;
1407  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
1408 
1409  if (in->Precision() != gauge.Precision())
1410  errorQuda("Mixing gauge and spinor precision not supported");
1411 
1412  if (in->Precision() != clover_prec)
1413  errorQuda("Mixing clover and spinor precision not supported");
1414 
1415  DslashCuda *dslash = 0;
1416  size_t regSize = sizeof(float);
1417 
1418  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1419 #if (__COMPUTE_CAPABILITY__ >= 130)
1420  dslash = new CloverDslashCuda<double2, double2, double2>(out, (double2*)gauge0, (double2*)gauge1,
1421  gauge.Reconstruct(), (double2*)cloverP,
1422  (float*)cloverNormP, in, x, a, dagger);
1423  regSize = sizeof(double);
1424 #else
1425  errorQuda("Double precision not supported on this GPU");
1426 #endif
1427  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1428  dslash = new CloverDslashCuda<float4, float4, float4>(out, (float4*)gauge0, (float4*)gauge1,
1429  gauge.Reconstruct(), (float4*)cloverP,
1430  (float*)cloverNormP, in, x, a, dagger);
1431  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1432  dslash = new CloverDslashCuda<short4, short4, short4>(out, (short4*)gauge0, (short4*)gauge1,
1433  gauge.Reconstruct(), (short4*)cloverP,
1434  (float*)cloverNormP, in, x, a, dagger);
1435  }
1436 
1437  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
1438 
1439  delete dslash;
1440  unbindGaugeTex(gauge);
1441  unbindCloverTex(cloverInv);
1442 
1443  checkCudaError();
1444 #else
1445  errorQuda("Clover dslash has not been built");
1446 #endif
1447 
1448  }
1449 
1450 
1452  const cudaColorSpinorField *in, const int parity, const int dagger,
1453  const cudaColorSpinorField *x, const double &a, const int *commOverride)
1454  {
1455  inSpinor = (cudaColorSpinorField*)in; // EVIL
1456 
1457 #ifdef GPU_CLOVER_DIRAC
1458  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1459  for(int i=0;i<4;i++){
1460  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1461  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1462  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1463  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1464  }
1465 
1466  void *cloverP, *cloverNormP;
1467  QudaPrecision clover_prec = bindCloverTex(cloverInv, parity, &cloverP, &cloverNormP);
1468 
1469  void *gauge0, *gauge1;
1470  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
1471 
1472  if (in->Precision() != gauge.Precision())
1473  errorQuda("Mixing gauge and spinor precision not supported");
1474 
1475  if (in->Precision() != clover_prec)
1476  errorQuda("Mixing clover and spinor precision not supported");
1477 
1478  DslashCuda *dslash = 0;
1479  size_t regSize = sizeof(float);
1480 
1481  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1482 #if (__COMPUTE_CAPABILITY__ >= 130)
1483  dslash = new AsymCloverDslashCuda<double2, double2, double2>(out, (double2*)gauge0, (double2*)gauge1,
1484  gauge.Reconstruct(), (double2*)cloverP,
1485  (float*)cloverNormP, in, x, a, dagger);
1486  regSize = sizeof(double);
1487 #else
1488  errorQuda("Double precision not supported on this GPU");
1489 #endif
1490  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1491  dslash = new AsymCloverDslashCuda<float4, float4, float4>(out, (float4*)gauge0, (float4*)gauge1,
1492  gauge.Reconstruct(), (float4*)cloverP,
1493  (float*)cloverNormP, in, x, a, dagger);
1494  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1495  dslash = new AsymCloverDslashCuda<short4, short4, short4>(out, (short4*)gauge0, (short4*)gauge1,
1496  gauge.Reconstruct(), (short4*)cloverP,
1497  (float*)cloverNormP, in, x, a, dagger);
1498  }
1499 
1500  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
1501 
1502  delete dslash;
1503  unbindGaugeTex(gauge);
1504  unbindCloverTex(cloverInv);
1505 
1506  checkCudaError();
1507 #else
1508  errorQuda("Clover dslash has not been built");
1509 #endif
1510 
1511  }
1512 
1514  const cudaColorSpinorField *in, const int parity, const int dagger,
1515  const cudaColorSpinorField *x, const double &kappa, const double &mu,
1516  const double &epsilon, const int *commOverride)
1517  {
1518  inSpinor = (cudaColorSpinorField*)in; // EVIL
1519  #ifdef GPU_TWISTED_MASS_DIRAC
1520  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1521 
1522  int ghost_threads[4] = {0};
1523  int bulk_threads = ((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) ? in->Volume() : in->Volume() / 2;
1524 
1525  for(int i=0;i<4;i++){
1526  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1527  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1528  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1529  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1530  ghost_threads[i] = ((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) ? in->GhostFace()[i] : in->GhostFace()[i] / 2;
1531  }
1532 
1533  void *gauge0, *gauge1;
1534  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
1535 
1536  if (in->Precision() != gauge.Precision())
1537  errorQuda("Mixing gauge and spinor precision not supported");
1538 
1539  DslashCuda *dslash = 0;
1540  size_t regSize = sizeof(float);
1541 
1542  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1543 #if (__COMPUTE_CAPABILITY__ >= 130)
1544  dslash = new TwistedDslashCuda<double2,double2>(out, (double2*)gauge0,(double2*)gauge1, gauge.Reconstruct(), in, x, kappa, mu, epsilon, dagger);
1545  regSize = sizeof(double);
1546 #else
1547  errorQuda("Double precision not supported on this GPU");
1548 #endif
1549  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1550  dslash = new TwistedDslashCuda<float4,float4>(out, (float4*)gauge0,(float4*)gauge1, gauge.Reconstruct(), in, x, kappa, mu, epsilon, dagger);
1551 
1552  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1553  dslash = new TwistedDslashCuda<short4,short4>(out, (short4*)gauge0,(short4*)gauge1, gauge.Reconstruct(), in, x, kappa, mu, epsilon, dagger);
1554  }
1555 
1556  dslashCuda(*dslash, regSize, parity, dagger, bulk_threads, ghost_threads);
1557 
1558  delete dslash;
1559  unbindGaugeTex(gauge);
1560 
1561  checkCudaError();
1562 #else
1563  errorQuda("Twisted mass dslash has not been built");
1564 #endif
1565  }
1566 
1568  const cudaColorSpinorField *in, const int parity, const int dagger,
1569  const cudaColorSpinorField *x, const double &m_f, const double &k2, const int *commOverride)
1570  {
1571  inSpinor = (cudaColorSpinorField*)in; // EVIL
1572 
1574 
1575 #ifdef GPU_DOMAIN_WALL_DIRAC
1576  //currently splitting in space-time is impelemented:
1577  int dirs = 4;
1578  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1579  for(int i = 0;i < dirs; i++){
1580  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1581  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1582  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1583  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1584  }
1585 
1586  void *gauge0, *gauge1;
1587  bindGaugeTex(gauge, parity, &gauge0, &gauge1);
1588 
1589  if (in->Precision() != gauge.Precision())
1590  errorQuda("Mixing gauge and spinor precision not supported");
1591 
1592  DslashCuda *dslash = 0;
1593  size_t regSize = sizeof(float);
1594 
1595  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1596 #if (__COMPUTE_CAPABILITY__ >= 130)
1597  dslash = new DomainWallDslashCuda<double2,double2>(out, (double2*)gauge0, (double2*)gauge1,
1598  gauge.Reconstruct(), in, x, m_f, k2, dagger);
1599  regSize = sizeof(double);
1600 #else
1601  errorQuda("Double precision not supported on this GPU");
1602 #endif
1603  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1604  dslash = new DomainWallDslashCuda<float4,float4>(out, (float4*)gauge0, (float4*)gauge1,
1605  gauge.Reconstruct(), in, x, m_f, k2, dagger);
1606  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1607  dslash = new DomainWallDslashCuda<short4,short4>(out, (short4*)gauge0, (short4*)gauge1,
1608  gauge.Reconstruct(), in, x, m_f, k2, dagger);
1609  }
1610 
1611  // the parameters passed to dslashCuda must be 4-d volume and 3-d
1612  // faces because Ls is added as the y-dimension in thread space
1613  int ghostFace[QUDA_MAX_DIM];
1614  for (int i=0; i<4; i++) ghostFace[i] = in->GhostFace()[i] / in->X(4);
1615  dslashCuda(*dslash, regSize, parity, dagger, in->Volume() / in->X(4), ghostFace);
1616 
1617  delete dslash;
1618  unbindGaugeTex(gauge);
1619 
1620  checkCudaError();
1621 #else
1622  errorQuda("Domain wall dslash has not been built");
1623 #endif
1624  }
1625 
1627  const cudaGaugeField &longGauge, const cudaColorSpinorField *in,
1628  const int parity, const int dagger, const cudaColorSpinorField *x,
1629  const double &k, const int *commOverride)
1630  {
1631  inSpinor = (cudaColorSpinorField*)in; // EVIL
1632 
1633 #ifdef GPU_STAGGERED_DIRAC
1634 
1635 #ifdef MULTI_GPU
1636  for(int i=0;i < 4; i++){
1637  if(commDimPartitioned(i) && (fatGauge.X()[i] < 6)){
1638  errorQuda("ERROR: partitioned dimension with local size less than 6 is not supported in staggered dslash\n");
1639  }
1640  }
1641 #endif
1642 
1643  int Npad = (in->Ncolor()*in->Nspin()*2)/in->FieldOrder(); // SPINOR_HOP in old code
1644 
1646 
1647  for(int i=0;i<4;i++){
1648  dslashParam.ghostDim[i] = commDimPartitioned(i); // determines whether to use regular or ghost indexing at boundary
1649  dslashParam.ghostOffset[i] = Npad*(in->GhostOffset(i) + in->Stride());
1650  dslashParam.ghostNormOffset[i] = in->GhostNormOffset(i) + in->Stride();
1651  dslashParam.commDim[i] = (!commOverride[i]) ? 0 : commDimPartitioned(i); // switch off comms if override = 0
1652  }
1653  void *fatGauge0, *fatGauge1;
1654  void* longGauge0, *longGauge1;
1655  bindFatGaugeTex(fatGauge, parity, &fatGauge0, &fatGauge1);
1656  bindLongGaugeTex(longGauge, parity, &longGauge0, &longGauge1);
1657 
1658  if (in->Precision() != fatGauge.Precision() || in->Precision() != longGauge.Precision()){
1659  errorQuda("Mixing gauge and spinor precision not supported"
1660  "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d",
1661  in->Precision(), fatGauge.Precision(), longGauge.Precision());
1662  }
1663 
1664  DslashCuda *dslash = 0;
1665  size_t regSize = sizeof(float);
1666 
1667  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1668 #if (__COMPUTE_CAPABILITY__ >= 130)
1669  dslash = new StaggeredDslashCuda<double2, double2, double2>(out, (double2*)fatGauge0, (double2*)fatGauge1,
1670  (double2*)longGauge0, (double2*)longGauge1,
1671  longGauge.Reconstruct(), in, x, k, dagger);
1672  regSize = sizeof(double);
1673 #else
1674  errorQuda("Double precision not supported on this GPU");
1675 #endif
1676  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1677  dslash = new StaggeredDslashCuda<float2, float2, float4>(out, (float2*)fatGauge0, (float2*)fatGauge1,
1678  (float4*)longGauge0, (float4*)longGauge1,
1679  longGauge.Reconstruct(), in, x, k, dagger);
1680  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1681  dslash = new StaggeredDslashCuda<short2, short2, short4>(out, (short2*)fatGauge0, (short2*)fatGauge1,
1682  (short4*)longGauge0, (short4*)longGauge1,
1683  longGauge.Reconstruct(), in, x, k, dagger);
1684  }
1685 
1686  dslashCuda(*dslash, regSize, parity, dagger, in->Volume(), in->GhostFace());
1687 
1688  delete dslash;
1689  unbindGaugeTex(fatGauge);
1690  unbindGaugeTex(longGauge);
1691 
1692  checkCudaError();
1693 
1694 #else
1695  errorQuda("Staggered dslash has not been built");
1696 #endif // GPU_STAGGERED_DIRAC
1697  }
1698 
1699 
1700  template <typename sFloat, typename cFloat>
1701  class CloverCuda : public Tunable {
1702  private:
1704  float *outNorm;
1705  char *saveOut, *saveOutNorm;
1706  const cFloat *clover;
1707  const float *cloverNorm;
1708  const cudaColorSpinorField *in;
1709 
1710  protected:
1711  int sharedBytesPerThread() const
1712  {
1713  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
1714  return CLOVER_SHARED_FLOATS_PER_THREAD * reg_size;
1715  }
1716  int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
1717  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
1718 
1719  public:
1720  CloverCuda(cudaColorSpinorField *out, const cFloat *clover, const float *cloverNorm,
1721  const cudaColorSpinorField *in)
1722  : out(out), clover(clover), cloverNorm(cloverNorm), in(in)
1723  {
1724  bindSpinorTex<sFloat>(in);
1725  }
1726  virtual ~CloverCuda() { unbindSpinorTex<sFloat>(in); }
1727  void apply(const cudaStream_t &stream)
1728  {
1729  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1730  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
1731  cloverKernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
1732  ((sFloat*)out->V(), (float*)out->Norm(), clover, cloverNorm,
1733  (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
1734  }
1735  virtual TuneKey tuneKey() const
1736  {
1737  std::stringstream vol, aux;
1738  vol << dslashConstants.x[0] << "x";
1739  vol << dslashConstants.x[1] << "x";
1740  vol << dslashConstants.x[2] << "x";
1741  vol << dslashConstants.x[3];
1742  return TuneKey(vol.str(), typeid(*this).name());
1743  }
1744 
1745  // Need to save the out field if it aliases the in field
1746  void preTune() {
1747  if (in == out) {
1748  saveOut = new char[out->Bytes()];
1749  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
1750  if (typeid(sFloat) == typeid(short4)) {
1751  saveOutNorm = new char[out->NormBytes()];
1752  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
1753  }
1754  }
1755  }
1756 
1757  // Restore if the in and out fields alias
1758  void postTune() {
1759  if (in == out) {
1760  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
1761  delete[] saveOut;
1762  if (typeid(sFloat) == typeid(short4)) {
1763  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
1764  delete[] saveOutNorm;
1765  }
1766  }
1767  }
1768 
1769  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
1770  {
1771  std::stringstream ps;
1772  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
1773  ps << "shared=" << param.shared_bytes;
1774  return ps.str();
1775  }
1776 
1777  long long flops() const { return 504ll * dslashConstants.VolumeCB(); }
1778  };
1779 
1780 
1782  const cudaColorSpinorField *in, const int parity) {
1783 
1785  dslashParam.threads = in->Volume();
1786 
1787 #ifdef GPU_CLOVER_DIRAC
1788  Tunable *clov = 0;
1789  void *cloverP, *cloverNormP;
1790  QudaPrecision clover_prec = bindCloverTex(clover, parity, &cloverP, &cloverNormP);
1791 
1792  if (in->Precision() != clover_prec)
1793  errorQuda("Mixing clover and spinor precision not supported");
1794 
1795  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1796 #if (__COMPUTE_CAPABILITY__ >= 130)
1797  clov = new CloverCuda<double2, double2>(out, (double2*)cloverP, (float*)cloverNormP, in);
1798 #else
1799  errorQuda("Double precision not supported on this GPU");
1800 #endif
1801  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1802  clov = new CloverCuda<float4, float4>(out, (float4*)cloverP, (float*)cloverNormP, in);
1803  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1804  clov = new CloverCuda<short4, short4>(out, (short4*)cloverP, (float*)cloverNormP, in);
1805  }
1806  clov->apply(0);
1807 
1808  unbindCloverTex(clover);
1809  checkCudaError();
1810 
1811  delete clov;
1812 #else
1813  errorQuda("Clover dslash has not been built");
1814 #endif
1815  }
1816 
1817 
1818  template <typename sFloat>
1819  class TwistGamma5Cuda : public Tunable {
1820 
1821  private:
1823  const cudaColorSpinorField *in;
1824  double a;
1825  double b;
1826  double c;
1827 
1828  int sharedBytesPerThread() const { return 0; }
1829  int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
1830  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
1831 
1832  char *saveOut, *saveOutNorm;
1833 
1834  public:
1836  double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type twist) :
1837  out(out), in(in)
1838  {
1839  bindSpinorTex<sFloat>(in);
1840  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS))
1841  setTwistParam(a, b, kappa, mu, dagger, twist);
1842  else{//twist doublet
1843  a = kappa, b = mu, c = epsilon;
1844  }
1845  }
1846  virtual ~TwistGamma5Cuda() {
1847  unbindSpinorTex<sFloat>(in);
1848  }
1849 
1850  TuneKey tuneKey() const {
1851  std::stringstream vol, aux;
1852  vol << dslashConstants.x[0] << "x";
1853  vol << dslashConstants.x[1] << "x";
1854  vol << dslashConstants.x[2] << "x";
1855  vol << dslashConstants.x[3];
1856  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1857  }
1858 
1859  void apply(const cudaStream_t &stream)
1860  {
1861  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1862  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
1863  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS))
1864  {
1865  twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
1866  ((sFloat*)out->V(), (float*)out->Norm(), a, b, (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
1867  }
1868  else
1869  {
1870  twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
1871  ((sFloat*)out->V(), (float*)out->Norm(), a, b, c, (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
1872  }
1873  }
1874 
1875  void preTune() {
1876  saveOut = new char[out->Bytes()];
1877  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
1878  if (typeid(sFloat) == typeid(short4)) {
1879  saveOutNorm = new char[out->NormBytes()];
1880  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
1881  }
1882  }
1883 
1884  void postTune() {
1885  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
1886  delete[] saveOut;
1887  if (typeid(sFloat) == typeid(short4)) {
1888  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
1889  delete[] saveOutNorm;
1890  }
1891  }
1892 
1893  std::string paramString(const TuneParam &param) const {
1894  std::stringstream ps;
1895  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
1896  ps << "shared=" << param.shared_bytes;
1897  return ps.str();
1898  }
1899 
1900  long long flops() const { return 24ll * dslashConstants.VolumeCB(); }
1901  };
1902 
1904  void twistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
1905  const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type twist)
1906  {
1907  if(in->TwistFlavor() == QUDA_TWIST_PLUS || in->TwistFlavor() == QUDA_TWIST_MINUS)
1908  dslashParam.threads = in->Volume();
1909  else //twist doublet
1910  dslashParam.threads = in->Volume() / 2;
1911 
1912 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
1913  Tunable *twistGamma5 = 0;
1914 
1915  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
1916 #if (__COMPUTE_CAPABILITY__ >= 130)
1917  twistGamma5 = new TwistGamma5Cuda<double2>(out, in, kappa, mu, epsilon, dagger, twist);
1918 #else
1919  errorQuda("Double precision not supported on this GPU");
1920 #endif
1921  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
1922  twistGamma5 = new TwistGamma5Cuda<float4>(out, in, kappa, mu, epsilon, dagger, twist);
1923  } else if (in->Precision() == QUDA_HALF_PRECISION) {
1924  twistGamma5 = new TwistGamma5Cuda<short4>(out, in, kappa, mu, epsilon, dagger, twist);
1925  }
1926 
1927  twistGamma5->apply(streams[Nstream-1]);
1928  checkCudaError();
1929 
1930  delete twistGamma5;
1931 #else
1932  errorQuda("Twisted mass dslash has not been built");
1933 #endif // GPU_TWISTED_MASS_DIRAC
1934  }
1935 
1936 } // namespace quda
1937 
1938 #include "misc_helpers.cu"
1939 
1940 
1941 #if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_UNITARIZE)
1942 #include <force_common.h>
1943 #endif
1944 
1945 #ifdef GPU_FATLINK
1946 #include "llfat_quda.cu"
1947 #endif
1948 
1949 #ifdef GPU_GAUGE_FORCE
1950 #include "gauge_force_quda.cu"
1951 #endif
1952 
1953 #ifdef GPU_FERMION_FORCE
1954 #include "fermion_force_quda.cu"
1955 #endif
1956 
1957 #ifdef GPU_UNITARIZE
1958 #include "unitarize_links_quda.cu"
1959 #endif
1960 
1961 #ifdef GPU_HISQ_FORCE
1962 #include "hisq_paths_force_quda.cu"
1963 #include "unitarize_force_quda.cu"
1964 #endif
1965