QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 //these are access control for staggered action
20 #ifdef GPU_STAGGERED_DIRAC
21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
22 //#define DIRECT_ACCESS_FAT_LINK
23 //#define DIRECT_ACCESS_LONG_LINK
24 //#define DIRECT_ACCESS_SPINOR
25 //#define DIRECT_ACCESS_ACCUM
26 //#define DIRECT_ACCESS_INTER
27 //#define DIRECT_ACCESS_PACK
28 #elif (__COMPUTE_CAPABILITY__ >= 200)
29 //#define DIRECT_ACCESS_FAT_LINK
30 //#define DIRECT_ACCESS_LONG_LINK
31 #define DIRECT_ACCESS_SPINOR
32 //#define DIRECT_ACCESS_ACCUM
33 //#define DIRECT_ACCESS_INTER
34 //#define DIRECT_ACCESS_PACK
35 #else
36 #define DIRECT_ACCESS_FAT_LINK
37 //#define DIRECT_ACCESS_LONG_LINK
38 //#define DIRECT_ACCESS_SPINOR
39 //#define DIRECT_ACCESS_ACCUM
40 //#define DIRECT_ACCESS_INTER
41 //#define DIRECT_ACCESS_PACK
42 #endif
43 #endif // GPU_STAGGERED_DIRAC
44 
45 #include <quda_internal.h>
46 #include <dslash_quda.h>
47 #include <sys/time.h>
48 #include <blas_quda.h>
49 #include <face_quda.h>
50 
51 #include <inline_ptx.h>
52 
53 namespace quda {
54 
55  namespace dslash_aux {
56 #include <dslash_constants.h>
57 #include <dslash_textures.h>
58 #include <dslash_index.cuh>
59 
60 #include <tm_core.h> // solo twisted mass kernel
61 #include <tmc_core.h> // solo twisted mass kernel
62 #include <clover_def.h> // kernels for applying the clover term alone
63  }
64 
65 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
66 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
67 #endif
68 
69 #ifndef CLOVER_SHARED_FLOATS_PER_THREAD
70 #define CLOVER_SHARED_FLOATS_PER_THREAD 0
71 #endif
72 
73 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD
74 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0
75 #endif
76 
77  // these should not be namespaced!!
78  // determines whether the temporal ghost zones are packed with a gather kernel,
79  // as opposed to multiple calls to cudaMemcpy()
80  static bool kernelPackT = false;
81 
82  void setKernelPackT(bool packT) { kernelPackT = packT; }
83 
84  bool getKernelPackT() { return kernelPackT; }
85 
86 
87  //these params are needed for twisted mass (in particular, for packing twisted spinor)
88  static bool twistPack = false;
89 
90  void setTwistPack(bool flag) { twistPack = flag; }
91  bool getTwistPack() { return twistPack; }
92 
93  namespace dslash {
94  int it = 0;
95 
96 #ifdef PTHREADS
97  cudaEvent_t interiorDslashEnd;
98 #endif
99  cudaEvent_t packEnd[Nstream];
100  cudaEvent_t gatherStart[Nstream];
101  cudaEvent_t gatherEnd[Nstream];
102  cudaEvent_t scatterStart[Nstream];
103  cudaEvent_t scatterEnd[Nstream];
104  cudaEvent_t dslashStart;
105  cudaEvent_t dslashEnd;
106  }
107 
109  {
110  using namespace dslash;
111  // add cudaEventDisableTiming for lower sync overhead
112  for (int i=0; i<Nstream; i++) {
113  cudaEventCreate(&packEnd[i], cudaEventDisableTiming);
114  cudaEventCreate(&gatherStart[i], cudaEventDisableTiming);
115  cudaEventCreate(&gatherEnd[i], cudaEventDisableTiming);
116  cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
117  cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
118  }
119  cudaEventCreateWithFlags(&dslashStart, cudaEventDisableTiming);
120  cudaEventCreateWithFlags(&dslashEnd, cudaEventDisableTiming);
121 #ifdef PTHREADS
122  cudaEventCreateWithFlags(&interiorDslashEnd, cudaEventDisableTiming);
123 #endif
124 
125  checkCudaError();
126  }
127 
128 
130  {
131  using namespace dslash;
132  for (int i=0; i<Nstream; i++) {
133  cudaEventDestroy(packEnd[i]);
134  cudaEventDestroy(gatherStart[i]);
135  cudaEventDestroy(gatherEnd[i]);
136  cudaEventDestroy(scatterStart[i]);
137  cudaEventDestroy(scatterEnd[i]);
138  }
139 
140  cudaEventDestroy(dslashStart);
141  cudaEventDestroy(dslashEnd);
142 #ifdef PTHREADS
143  cudaEventDestroy(interiorDslashEnd);
144 #endif
145 
146  checkCudaError();
147  }
148 
149  using namespace dslash_aux;
150 
151 template <typename sFloat, typename cFloat>
152 class CloverCuda : public Tunable {
153  private:
155  float *outNorm;
156  char *saveOut, *saveOutNorm;
157  const cFloat *clover;
158  const float *cloverNorm;
159  const cudaColorSpinorField *in;
160 
161  protected:
162  unsigned int sharedBytesPerThread() const
163  {
164  int reg_size = (typeid(sFloat)==typeid(double2) ? sizeof(double) : sizeof(float));
165  return CLOVER_SHARED_FLOATS_PER_THREAD * reg_size;
166  }
167  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
168  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
169  unsigned int minThreads() const { return in->VolumeCB(); }
170 
171  public:
172  CloverCuda(cudaColorSpinorField *out, const cFloat *clover, const float *cloverNorm,
173  int cl_stride, const cudaColorSpinorField *in)
174  : out(out), clover(clover), cloverNorm(cloverNorm), in(in)
175  {
176  bindSpinorTex<sFloat>(in);
177  dslashParam.sp_stride = in->Stride();
178 #ifdef GPU_CLOVER_DIRAC
179  dslashParam.cl_stride = cl_stride;
180 #endif
181  }
182  virtual ~CloverCuda() { unbindSpinorTex<sFloat>(in); }
183  void apply(const cudaStream_t &stream)
184  {
185  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
186  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
187  cloverKernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
188  ((sFloat*)out->V(), (float*)out->Norm(), clover, cloverNorm,
189  (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
190  }
191  virtual TuneKey tuneKey() const { return TuneKey(in->VolString(), typeid(*this).name()); }
192 
193  // Need to save the out field if it aliases the in field
194  void preTune() {
195  if (in == out) {
196  saveOut = new char[out->Bytes()];
197  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
198  if (typeid(sFloat) == typeid(short4)) {
199  saveOutNorm = new char[out->NormBytes()];
200  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
201  }
202  }
203  }
204 
205  // Restore if the in and out fields alias
206  void postTune() {
207  if (in == out) {
208  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
209  delete[] saveOut;
210  if (typeid(sFloat) == typeid(short4)) {
211  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
212  delete[] saveOutNorm;
213  }
214  }
215  }
216 
217  std::string paramString(const TuneParam &param) const // Don't bother printing the grid dim.
218  {
219  std::stringstream ps;
220  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
221  ps << "shared=" << param.shared_bytes;
222  return ps.str();
223  }
224 
225  long long flops() const { return 504ll * in->VolumeCB(); }
226 };
227 
228 
230  const cudaColorSpinorField *in, const int parity) {
231 
232  dslashParam.parity = parity;
233  dslashParam.threads = in->Volume();
234 
235 #ifdef GPU_CLOVER_DIRAC
236  Tunable *clov = 0;
237  void *cloverP, *cloverNormP;
238  QudaPrecision clover_prec = bindCloverTex(clover, parity, &cloverP, &cloverNormP);
239 
240  if (in->Precision() != clover_prec)
241  errorQuda("Mixing clover and spinor precision not supported");
242 
243  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
244 #if (__COMPUTE_CAPABILITY__ >= 130)
245  clov = new CloverCuda<double2, double2>(out, (double2*)cloverP, (float*)cloverNormP, clover.stride, in);
246 #else
247  errorQuda("Double precision not supported on this GPU");
248 #endif
249  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
250  clov = new CloverCuda<float4, float4>(out, (float4*)cloverP, (float*)cloverNormP, clover.stride, in);
251  } else if (in->Precision() == QUDA_HALF_PRECISION) {
252  clov = new CloverCuda<short4, short4>(out, (short4*)cloverP, (float*)cloverNormP, clover.stride, in);
253  }
254  clov->apply(0);
255 
256  unbindCloverTex(clover);
257  checkCudaError();
258 
259  delete clov;
260 #else
261  errorQuda("Clover dslash has not been built");
262 #endif
263 }
264 
265 
266 template <typename sFloat>
267 class TwistGamma5Cuda : public Tunable {
268 
269  private:
271  const cudaColorSpinorField *in;
272  double a;
273  double b;
274  double c;
275 
276  unsigned int sharedBytesPerThread() const { return 0; }
277  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
278  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
279  unsigned int minThreads() const { return in->X(0) * in->X(1) * in->X(2) * in->X(3); }
280 
281  char *saveOut, *saveOutNorm;
282 
283  public:
285  double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type twist) :
286  out(out), in(in)
287  {
288  bindSpinorTex<sFloat>(in);
289  dslashParam.sp_stride = in->Stride();
290  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) {
291  setTwistParam(a, b, kappa, mu, dagger, twist);
292 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
293  dslashParam.fl_stride = in->VolumeCB();
294 #endif
295  } else {//twist doublet
296  a = kappa, b = mu, c = epsilon;
297 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
298  dslashParam.fl_stride = in->VolumeCB()/2;
299 #endif
300  }
301  }
302 
303  virtual ~TwistGamma5Cuda() {
304  unbindSpinorTex<sFloat>(in);
305  }
306 
307  TuneKey tuneKey() const { return TuneKey(in->VolString(), typeid(*this).name(), in->AuxString()); }
308 
309  void apply(const cudaStream_t &stream)
310  {
311 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
312  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
313  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
314  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) {
315  twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
316  ((sFloat*)out->V(), (float*)out->Norm(), a, b,
317  (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
318  } else {
319  twistGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
320  ((sFloat*)out->V(), (float*)out->Norm(), a, b, c,
321  (sFloat*)in->V(), (float*)in->Norm(), dslashParam);
322  }
323 #endif
324  }
325 
326  void preTune() {
327  saveOut = new char[out->Bytes()];
328  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
329  if (typeid(sFloat) == typeid(short4)) {
330  saveOutNorm = new char[out->NormBytes()];
331  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
332  }
333  }
334 
335  void postTune() {
336  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
337  delete[] saveOut;
338  if (typeid(sFloat) == typeid(short4)) {
339  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
340  delete[] saveOutNorm;
341  }
342  }
343 
344  std::string paramString(const TuneParam &param) const {
345  std::stringstream ps;
346  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
347  ps << "shared=" << param.shared_bytes;
348  return ps.str();
349  }
350 
351  long long flops() const { return 24ll * in->VolumeCB(); }
352  long long bytes() const { return in->Bytes() + in->NormBytes() + out->Bytes() + out->NormBytes(); }
353 };
354 
357  const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type twist)
358 {
360  dslashParam.threads = in->Volume();
361  else //twist doublet
362  dslashParam.threads = in->Volume() / 2;
363 
364 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
365  Tunable *twistGamma5 = 0;
366 
367  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
368 #if (__COMPUTE_CAPABILITY__ >= 130)
369  twistGamma5 = new TwistGamma5Cuda<double2>(out, in, kappa, mu, epsilon, dagger, twist);
370 #else
371  errorQuda("Double precision not supported on this GPU");
372 #endif
373  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
374  twistGamma5 = new TwistGamma5Cuda<float4>(out, in, kappa, mu, epsilon, dagger, twist);
375  } else if (in->Precision() == QUDA_HALF_PRECISION) {
376  twistGamma5 = new TwistGamma5Cuda<short4>(out, in, kappa, mu, epsilon, dagger, twist);
377  }
378 
379  twistGamma5->apply(streams[Nstream-1]);
380  checkCudaError();
381 
382  delete twistGamma5;
383 #else
384  errorQuda("Twisted mass dslash has not been built");
385 #endif // GPU_TWISTED_MASS_DIRAC
386 }
387 
388 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
390 #endif
391 
392 template <typename cFloat, typename sFloat>
394  private:
395  const cFloat *clover;
396  const float *cNorm;
397  const cFloat *cloverInv;
398  const float *cNrm2;
399  QudaTwistGamma5Type twist;
401  const cudaColorSpinorField *in;
402  double a;
403  double b;
404  double c;
405 
406  unsigned int sharedBytesPerThread() const { return 0; }
407  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
408  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
409  unsigned int minThreads() const { return in->X(0) * in->X(1) * in->X(2) * in->X(3); }
410  char *saveOut, *saveOutNorm;
411 
412  public:
414  double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type tw,
415  cFloat *clov, const float *cN, cFloat *clovInv, const float *cN2, int cl_stride) :
416  out(out), in(in)
417  {
418  bindSpinorTex<sFloat>(in);
419  dslashParam.sp_stride = in->Stride();
420 #ifdef GPU_TWISTED_CLOVER_DIRAC
421  dslashParam.cl_stride = cl_stride;
422  dslashParam.fl_stride = in->VolumeCB();
423 #endif
424  twist = tw;
425  clover = clov;
426  cNorm = cN;
427  cloverInv = clovInv;
428  cNrm2 = cN2;
429 
430  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS))
431  setTwistParam(a, b, kappa, mu, dagger, tw);
432  else{//twist doublet
433  errorQuda("ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
434  }
435  }
437  unbindSpinorTex<sFloat>(in);
438  }
439 
440  TuneKey tuneKey() const {
441  return TuneKey(in->VolString(), typeid(*this).name(), in->AuxString());
442  }
443 
444  void apply(const cudaStream_t &stream)
445  {
446 #if (__COMPUTE_CAPABILITY__ >= 200) && defined(GPU_TWISTED_CLOVER_DIRAC)
447  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
448  dim3 gridDim( (dslashParam.threads+tp.block.x-1) / tp.block.x, 1, 1);
449  if((in->TwistFlavor() == QUDA_TWIST_PLUS) || (in->TwistFlavor() == QUDA_TWIST_MINUS)) { //Idea for the kernel, two spinor inputs (IN and clover applied IN), on output (Clover applied IN + ig5IN)
450  if (twist == QUDA_TWIST_GAMMA5_DIRECT)
451  twistCloverGamma5Kernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
452  ((sFloat*)out->V(), (float*)out->Norm(), a,
453  (sFloat*)in->V(), (float*)in->Norm(), dslashParam,
454  clover, cNorm, cloverInv, cNrm2);
455  else if (twist == QUDA_TWIST_GAMMA5_INVERSE)
456  twistCloverGamma5InvKernel<<<gridDim, tp.block, tp.shared_bytes, stream>>>
457  ((sFloat*)out->V(), (float*)out->Norm(), a,
458  (sFloat*)in->V(), (float*)in->Norm(), dslashParam,
459  clover, cNorm, cloverInv, cNrm2);
460  } else {
461  errorQuda("ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
462  }
463 #endif
464  }
465 
466  void preTune() {
467  saveOut = new char[out->Bytes()];
468  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
469  if (typeid(sFloat) == typeid(short4)) {
470  saveOutNorm = new char[out->NormBytes()];
471  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
472  }
473  }
474 
475  void postTune() {
476  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
477  delete[] saveOut;
478  if (typeid(sFloat) == typeid(short4)) {
479  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
480  delete[] saveOutNorm;
481  }
482  }
483 
484  std::string paramString(const TuneParam &param) const {
485  std::stringstream ps;
486  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
487  ps << "shared=" << param.shared_bytes;
488  return ps.str();
489  }
490 
491  long long flops() const { return 24ll * in->VolumeCB(); } //TODO FIX THIS NUMBER!!!
492  long long bytes() const { return in->Bytes() + in->NormBytes() + out->Bytes() + out->NormBytes(); }
493 };
494 
495 void twistCloverGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const int dagger, const double &kappa, const double &mu,
496  const double &epsilon, const QudaTwistGamma5Type twist, const FullClover *clov, const FullClover *clovInv, const int parity)
497 {
499  dslashParam.threads = in->Volume();
500  else //twist doublet
501  errorQuda("Twisted doublet not supported in twisted clover dslash");
502 
503 #ifdef GPU_TWISTED_CLOVER_DIRAC
504  Tunable *tmClovGamma5 = 0;
505 
506  void *clover, *cNorm, *cloverInv, *cNorm2;
507  QudaPrecision clover_prec = bindTwistedCloverTex(*clov, *clovInv, parity, &clover, &cNorm, &cloverInv, &cNorm2);
508 
509  if (in->Precision() != clover_prec)
510  errorQuda("ERROR: Clover precision and spinor precision do not match\n");
511 
512  if (clov->stride != clovInv->stride)
513  errorQuda("clover and cloverInv must have matching strides (%d != %d)", clov->stride, clovInv->stride);
514 
515 
516  if (in->Precision() == QUDA_DOUBLE_PRECISION) {
517 #if (__COMPUTE_CAPABILITY__ >= 130)
518  tmClovGamma5 = new TwistCloverGamma5Cuda<double2,double2>
519  (out, in, kappa, mu, epsilon, dagger, twist, (double2 *) clover, (float *) cNorm, (double2 *) cloverInv, (float *) cNorm2, clov->stride);
520 #else
521  errorQuda("Double precision not supported on this GPU");
522 #endif
523  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
524  tmClovGamma5 = new TwistCloverGamma5Cuda<float4,float4>
525  (out, in, kappa, mu, epsilon, dagger, twist, (float4 *) clover, (float *) cNorm, (float4 *) cloverInv, (float *) cNorm2, clov->stride);
526  } else if (in->Precision() == QUDA_HALF_PRECISION) {
527  tmClovGamma5 = new TwistCloverGamma5Cuda<short4,short4>
528  (out, in, kappa, mu, epsilon, dagger, twist, (short4 *) clover, (float *) cNorm, (short4 *) cloverInv, (float *) cNorm2, clov->stride);
529  }
530 
531  tmClovGamma5->apply(streams[Nstream-1]);
532  checkCudaError();
533 
534  delete tmClovGamma5;
535  unbindTwistedCloverTex(*clov);
536 #else
537  errorQuda("Twisted clover dslash has not been built");
538 #endif // GPU_TWISTED_MASS_DIRAC
539 }
540 
541 } // namespace quda
542 
543 #ifdef GPU_CONTRACT
544 #include "contract.cu"
545 #endif
QudaPrecision bindTwistedCloverTex(const FullClover clover, const FullClover cloverInv, const int oddBit, void **cloverP, void **cloverNormP, void **cloverInvP, void **cloverInvNormP)
enum QudaPrecision_s QudaPrecision
__global__ void twistCloverGamma5InvKernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
virtual TuneKey tuneKey() const
Definition: dslash_quda.cu:191
bool getKernelPackT()
Definition: dslash_quda.cu:84
std::string paramString(const TuneParam &param) const
Definition: dslash_quda.cu:344
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
cudaEvent_t dslashStart
Definition: dslash_quda.cu:104
#define errorQuda(...)
Definition: util_quda.h:73
long long bytes() const
Definition: dslash_quda.cu:352
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
cudaStream_t * streams
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
TwistCloverGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type tw, cFloat *clov, const float *cN, cFloat *clovInv, const float *cN2, int cl_stride)
Definition: dslash_quda.cu:413
const int Nstream
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b, const float4 *in, const float *null2, DslashParam param)
Definition: tm_core.h:506
std::string paramString(const TuneParam &param) const
Definition: dslash_quda.cu:484
cudaEvent_t scatterStart[Nstream]
Definition: dslash_quda.cu:102
QudaDagType dagger
Definition: test_util.cpp:1558
QudaGaugeParam param
Definition: pack_test.cpp:17
__global__ void twistCloverGamma5Kernel(float4 *spinor, float *null, float a, const float4 *in, const float *null2, DslashParam param, const float4 *clover, const float *cNorm, const float4 *cloverInv, const float *cNrm2)
unsigned int minThreads() const
Definition: dslash_quda.cu:169
VOLATILE spinorFloat kappa
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:309
long long flops() const
Definition: dslash_quda.cu:225
cudaEvent_t packEnd[Nstream]
Definition: dslash_quda.cu:99
void cloverCuda(cudaColorSpinorField *out, const cudaGaugeField &gauge, const FullClover clover, const cudaColorSpinorField *in, const int oddBit)
Definition: dslash_quda.cu:229
cpuColorSpinorField * in
void createDslashEvents()
Definition: dslash_quda.cu:108
TuneKey tuneKey() const
Definition: dslash_quda.cu:440
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
void setTwistParam(double &a, double &b, const double &kappa, const double &mu, const int dagger, const QudaTwistGamma5Type twist)
std::string paramString(const TuneParam &param) const
Definition: dslash_quda.cu:217
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:444
void unbindCloverTex(const FullClover clover)
TwistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, double kappa, double mu, double epsilon, const int dagger, QudaTwistGamma5Type twist)
Definition: dslash_quda.cu:284
CloverCuda(cudaColorSpinorField *out, const cFloat *clover, const float *cloverNorm, int cl_stride, const cudaColorSpinorField *in)
Definition: dslash_quda.cu:172
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:101
void twistGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type)
ndeg tm:
Definition: dslash_quda.cu:356
long long bytes() const
Definition: dslash_quda.cu:492
void setTwistPack(bool pack)
Definition: dslash_quda.cu:90
#define CLOVER_SHARED_FLOATS_PER_THREAD
Definition: dslash_quda.cu:70
long long flops() const
Definition: dslash_quda.cu:351
void twistCloverGamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in, const int dagger, const double &kappa, const double &mu, const double &epsilon, const QudaTwistGamma5Type twist, const FullClover *clov, const FullClover *clovInv, const int parity)
Definition: dslash_quda.cu:495
cpuColorSpinorField * out
QudaPrecision Precision() const
bool getTwistPack()
Definition: dslash_quda.cu:91
unsigned int sharedBytesPerBlock(const TuneParam &param) const
Definition: dslash_quda.cu:167
bool tuneGridDim() const
Definition: dslash_quda.cu:168
QudaTwistFlavorType TwistFlavor() const
unsigned int sharedBytesPerThread() const
Definition: dslash_quda.cu:162
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:82
virtual ~CloverCuda()
Definition: dslash_quda.cu:182
long long flops() const
Definition: dslash_quda.cu:491
TuneKey tuneKey() const
Definition: dslash_quda.cu:307
QudaPrecision bindCloverTex(const FullClover clover, const int oddBit, void **cloverP, void **cloverNormP)
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
#define checkCudaError()
Definition: util_quda.h:110
void unbindTwistedCloverTex(const FullClover clover)
cudaEvent_t dslashEnd
Definition: dslash_quda.cu:105
QudaTune getTuning()
Definition: util_quda.cpp:32
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:103
void destroyDslashEvents()
Definition: dslash_quda.cu:129
const QudaParity parity
Definition: dslash_test.cpp:29
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:183
void * gauge[4]
Definition: su3_test.cpp:15
virtual ~TwistGamma5Cuda()
Definition: dslash_quda.cu:303
void twistGamma5(sFloat *out, sFloat *in, const int dagger, const sFloat kappa, const sFloat mu, const QudaTwistFlavorType flavor, const int V, QudaTwistGamma5Type twist)
cudaEvent_t gatherStart[Nstream]
Definition: dslash_quda.cu:100
virtual void apply(const cudaStream_t &stream)=0