QUDA  0.9.0
dslash_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h>
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_LINK
12 //#define DIRECT_ACCESS_WILSON_SPINOR
13 //#define DIRECT_ACCESS_WILSON_ACCUM
14 //#define DIRECT_ACCESS_WILSON_INTER
15 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
16 //#define DIRECT_ACCESS_CLOVER
17 #endif // GPU_WILSON_DIRAC
18 
19 //these are access control for staggered action
20 #ifdef GPU_STAGGERED_DIRAC
21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
22 //#define DIRECT_ACCESS_FAT_LINK
23 //#define DIRECT_ACCESS_LONG_LINK
24 //#define DIRECT_ACCESS_SPINOR
25 //#define DIRECT_ACCESS_ACCUM
26 //#define DIRECT_ACCESS_INTER
27 //#define DIRECT_ACCESS_PACK
28 #else // fermi
29 //#define DIRECT_ACCESS_FAT_LINK
30 //#define DIRECT_ACCESS_LONG_LINK
31 #define DIRECT_ACCESS_SPINOR
32 //#define DIRECT_ACCESS_ACCUM
33 //#define DIRECT_ACCESS_INTER
34 //#define DIRECT_ACCESS_PACK
35 #endif
36 #endif // GPU_STAGGERED_DIRAC
37 
38 #include <quda_internal.h>
39 #include <dslash_quda.h>
40 #include <sys/time.h>
41 #include <blas_quda.h>
42 
43 #include <inline_ptx.h>
44 
45 
47 #include <clover_field_order.h>
48 #include <index_helper.cuh>
49 #include <color_spinor.h>
50 #include <linalg.cuh>
51 
52 namespace quda {
53 
54  // these should not be namespaced!!
55  // determines whether the temporal ghost zones are packed with a gather kernel,
56  // as opposed to multiple calls to cudaMemcpy()
57  static bool kernelPackT = false;
58 
59  void setKernelPackT(bool packT) { kernelPackT = packT; }
60 
61  bool getKernelPackT() { return kernelPackT; }
62 
63  namespace dslash {
64  int it = 0;
65 
66 #ifdef PTHREADS
67  cudaEvent_t interiorDslashEnd;
68 #endif
69  cudaEvent_t packEnd[2];
70  cudaEvent_t gatherStart[Nstream];
71  cudaEvent_t gatherEnd[Nstream];
72  cudaEvent_t scatterStart[Nstream];
73  cudaEvent_t scatterEnd[Nstream];
74  cudaEvent_t dslashStart[2];
75 
76  // FIX this is a hack from hell
77  // Auxiliary work that can be done while waiting on comms to finis
79 
80 #if CUDA_VERSION >= 8000
81  cuuint32_t *commsEnd_h;
82  CUdeviceptr commsEnd_d[Nstream];
83 #endif
84  }
85 
87  {
88  using namespace dslash;
89  // add cudaEventDisableTiming for lower sync overhead
90  for (int i=0; i<Nstream; i++) {
91  cudaEventCreateWithFlags(&gatherStart[i], cudaEventDisableTiming);
92  cudaEventCreateWithFlags(&gatherEnd[i], cudaEventDisableTiming);
93  cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
94  cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
95  }
96  for (int i=0; i<2; i++) {
97  cudaEventCreateWithFlags(&packEnd[i], cudaEventDisableTiming);
98  cudaEventCreateWithFlags(&dslashStart[i], cudaEventDisableTiming);
99  }
100 #ifdef PTHREADS
101  cudaEventCreateWithFlags(&interiorDslashEnd, cudaEventDisableTiming);
102 #endif
103 
104  aux_worker = NULL;
105 
106 #if CUDA_VERSION >= 8000
107  commsEnd_h = static_cast<cuuint32_t*>(mapped_malloc(Nstream*sizeof(int)));
108  for (int i=0; i<Nstream; i++) {
109  cudaHostGetDevicePointer((void**)&commsEnd_d[i], commsEnd_h+i, 0);
110  commsEnd_h[i] = 0;
111  }
112 #endif
113 
114  checkCudaError();
115  }
116 
117 
119  {
120  using namespace dslash;
121 
122 #if CUDA_VERSION >= 8000
124  commsEnd_h = 0;
125 #endif
126 
127  for (int i=0; i<Nstream; i++) {
128  cudaEventDestroy(gatherStart[i]);
129  cudaEventDestroy(gatherEnd[i]);
130  cudaEventDestroy(scatterStart[i]);
131  cudaEventDestroy(scatterEnd[i]);
132  }
133 
134  for (int i=0; i<2; i++) {
135  cudaEventDestroy(packEnd[i]);
136  cudaEventDestroy(dslashStart[i]);
137  }
138 #ifdef PTHREADS
139  cudaEventDestroy(interiorDslashEnd);
140 #endif
141 
142  checkCudaError();
143  }
144 
148  template <typename Float, int nColor>
149  struct GammaArg {
151  typedef typename mapper<Float>::type RegType;
152 
153  F out; // output vector field
154  const F in; // input vector field
155  const int d; // which gamma matrix are we applying
156  const int nParity; // number of parities we're working on
157  bool doublet; // whether we applying the operator to a doublet
158  const int volumeCB; // checkerboarded volume
162 
164  RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
166  : out(out), in(in), d(d), nParity(in.SiteSubset()),
167  doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
168  volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0)
169  {
170  if (d < 0 || d > 4) errorQuda("Undefined gamma matrix %d", d);
171  if (in.Nspin() != 4) errorQuda("Cannot apply gamma5 to nSpin=%d field", in.Nspin());
172  if (!in.isNative() || !out.isNative()) errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder());
173 
174  if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
176  a = 2.0 * kappa * mu;
177  b = 1.0;
178  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
179  a = -2.0 * kappa * mu;
180  b = 1.0 / (1.0 + a*a);
181  }
182  c = 0.0;
183  if (dagger) a *= -1.0;
184  } else if (doublet) {
185  a = (twist == QUDA_TWIST_GAMMA5_INVERSE) ? -2.0 * kappa * mu : 2.0 * kappa * mu;
186  b = 2.0 * kappa * epsilon;
187  c = 1.0 / (1.0 + a*a - b*b);
188  if (c<=0) errorQuda("Invalid twisted mass parameters (kappa=%e, mu=%e, epsilon=%e)\n", kappa, mu, epsilon);
189  }
190  }
191  };
192 
193  // CPU kernel for applying the gamma matrix to a colorspinor
194  template <typename Float, int nColor, typename Arg>
195  void gammaCPU(Arg arg)
196  {
197  typedef typename mapper<Float>::type RegType;
198  for (int parity= 0; parity < arg.nParity; parity++) {
199 
200  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
202  arg.out(x_cb, parity) = in.gamma(arg.d);
203  } // 4-d volumeCB
204  } // parity
205 
206  }
207 
208  // GPU Kernel for applying the gamma matrix to a colorspinor
209  template <typename Float, int nColor, int d, typename Arg>
210  __global__ void gammaGPU(Arg arg)
211  {
212  typedef typename mapper<Float>::type RegType;
213  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
214  int parity = blockDim.y*blockIdx.y + threadIdx.y;
215 
216  if (x_cb >= arg.volumeCB) return;
217  if (parity >= arg.nParity) return;
218 
220  arg.out(x_cb, parity) = in.gamma(d);
221  }
222 
223  template <typename Float, int nColor, typename Arg>
224  class Gamma : public TunableVectorY {
225 
226  protected:
227  Arg &arg;
229 
230  long long flops() const { return 0; }
231  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
232  bool tuneGridDim() const { return false; }
233  unsigned int minThreads() const { return arg.volumeCB; }
234 
235  public:
237  {
238  strcpy(aux, meta.AuxString());
239  }
240  virtual ~Gamma() { }
241 
242  void apply(const cudaStream_t &stream) {
244  gammaCPU<Float,nColor>(arg);
245  } else {
246  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
247  switch (arg.d) {
248  case 4: gammaGPU<Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
249  default: errorQuda("%d not instantiated", arg.d);
250  }
251  }
252  }
253 
254  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
255 
256  void preTune() { arg.out.save(); }
257  void postTune() { arg.out.load(); }
258  };
259 
260 
261  template <typename Float, int nColor>
263  {
266  gamma.apply(streams[Nstream-1]);
267  }
268 
269  // template on the number of colors
270  template <typename Float>
272  {
273  if (in.Ncolor() == 3) {
274  ApplyGamma<Float,3>(out, in, d);
275  } else {
276  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
277  }
278  }
279 
280  //Apply the Gamma matrix to a colorspinor field
281  //out(x) = gamma_d*in
282  void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
283  {
284  checkPrecision(out, in); // check all precisions match
285  checkLocation(out, in); // check all locations match
286 
288  ApplyGamma<double>(out, in, d);
289  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
290  ApplyGamma<float>(out, in, d);
291  } else if (in.Precision() == QUDA_HALF_PRECISION) {
292  ApplyGamma<short>(out, in, d);
293  } else {
294  errorQuda("Unsupported precision %d\n", in.Precision());
295  }
296  }
297 
298  // CPU kernel for applying the gamma matrix to a colorspinor
299  template <bool doublet, typename Float, int nColor, typename Arg>
300  void twistGammaCPU(Arg arg)
301  {
302  typedef typename mapper<Float>::type RegType;
303  for (int parity= 0; parity < arg.nParity; parity++) {
304  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
305  if (!doublet) {
307  arg.out(x_cb, parity) = arg.b*(in + arg.a*in.igamma(arg.d));
308  } else {
309  ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
310  ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
311  arg.out(x_cb+0*arg.volumeCB, parity) = arg.c*(in_1 - arg.a*in_1.igamma(arg.d) + arg.b*in_2);
312  arg.out(x_cb+1*arg.volumeCB, parity) = arg.c*(in_2 + arg.a*in_2.igamma(arg.d) + arg.b*in_1);
313  }
314  } // 4-d volumeCB
315  } // parity
316 
317  }
318 
319  // GPU Kernel for applying the gamma matrix to a colorspinor
320  template <bool doublet, typename Float, int nColor, int d, typename Arg>
321  __global__ void twistGammaGPU(Arg arg)
322  {
323  typedef typename mapper<Float>::type RegType;
324  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
325  int parity = blockDim.y*blockIdx.y + threadIdx.y;
326  if (x_cb >= arg.volumeCB) return;
327 
328  if (!doublet) {
330  arg.out(x_cb, parity) = arg.b*(in + arg.a*in.igamma(d));
331  } else {
332  ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
333  ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
334  arg.out(x_cb+0*arg.volumeCB, parity) = arg.c*(in_1 - arg.a*in_1.igamma(d) + arg.b*in_2);
335  arg.out(x_cb+1*arg.volumeCB, parity) = arg.c*(in_2 + arg.a*in_2.igamma(d) + arg.b*in_1);
336  }
337  }
338 
339  template <typename Float, int nColor, typename Arg>
340  class TwistGamma : public TunableVectorY {
341 
342  protected:
343  Arg &arg;
345 
346  long long flops() const { return 0; }
347  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
348  bool tuneGridDim() const { return false; }
349  unsigned int minThreads() const { return arg.volumeCB; }
350 
351  public:
353  {
354  strcpy(aux, meta.AuxString());
355  }
356  virtual ~TwistGamma() { }
357 
358  void apply(const cudaStream_t &stream) {
360  if (arg.doublet) twistGammaCPU<true,Float,nColor>(arg);
361  twistGammaCPU<false,Float,nColor>(arg);
362  } else {
363  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
364  if (arg.doublet)
365  switch (arg.d) {
366  case 4: twistGammaGPU<true,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
367  default: errorQuda("%d not instantiated", arg.d);
368  }
369  else
370  switch (arg.d) {
371  case 4: twistGammaGPU<false,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
372  default: errorQuda("%d not instantiated", arg.d);
373  }
374  }
375  }
376 
377  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
378  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); }
379  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); }
380  };
381 
382 
383  template <typename Float, int nColor>
384  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
385  {
386  GammaArg<Float,nColor> arg(out, in, d, kappa, mu, epsilon, dagger, type);
388  gamma.apply(streams[Nstream-1]);
389 
390  checkCudaError();
391  }
392 
393  // template on the number of colors
394  template <typename Float>
395  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
396  {
397  if (in.Ncolor() == 3) {
398  ApplyTwistGamma<Float,3>(out, in, d, kappa, mu, epsilon, dagger, type);
399  } else {
400  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
401  }
402  }
403 
404  //Apply the Gamma matrix to a colorspinor field
405  //out(x) = gamma_d*in
406  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
407  {
408  checkPrecision(out, in); // check all precisions match
409  checkLocation(out, in); // check all locations match
410 
411 #ifdef GPU_TWISTED_MASS_DIRAC
413  ApplyTwistGamma<double>(out, in, d, kappa, mu, epsilon, dagger, type);
414  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
415  ApplyTwistGamma<float>(out, in, d, kappa, mu, epsilon, dagger, type);
416  } else if (in.Precision() == QUDA_HALF_PRECISION) {
417  ApplyTwistGamma<short>(out, in, d, kappa, mu, epsilon, dagger, type);
418  } else {
419  errorQuda("Unsupported precision %d\n", in.Precision());
420  }
421 #else
422  errorQuda("Twisted mass dslash has not been built");
423 #endif // GPU_TWISTED_MASS_DIRAC
424  }
425 
426  // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
428 
436  template <typename Float, int nSpin, int nColor, bool dynamic_clover_=false>
437  struct CloverArg {
438  static constexpr int length = (nSpin / (nSpin/2)) * 2 * nColor * nColor * (nSpin/2) * (nSpin/2) / 2;
439  static constexpr bool dynamic_clover = dynamic_clover_;
440 
443  typedef typename mapper<Float>::type RegType;
444 
445  F out; // output vector field
446  const F in; // input vector field
447  const C clover; // clover field
448  const C cloverInv; // inverse clover field (only set if not dynamic clover and doing twisted clover)
449  const int nParity; // number of parities we're working on
450  const int parity; // which parity we're acting on (if nParity=1)
451  bool inverse; // whether we are applying the inverse
452  bool doublet; // whether we applying the operator to a doublet
453  const int volumeCB; // checkerboarded volume
458 
460  bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
464  in(in), nParity(in.SiteSubset()), parity(parity), inverse(inverse),
465  doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
466  volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0), twist(twist)
467  {
468  if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
470  a = 2.0 * kappa * mu;
471  b = 1.0;
472  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
473  a = -2.0 * kappa * mu;
474  b = 1.0 / (1.0 + a*a);
475  }
476  c = 0.0;
477  if (dagger) a *= -1.0;
478  } else if (doublet) {
479  errorQuda("ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
480  }
481  }
482  };
483 
484  template <typename Float, int nSpin, int nColor, typename Arg>
485  __device__ __host__ inline void cloverApply(Arg &arg, int x_cb, int parity) {
486  typedef typename mapper<Float>::type RegType;
487  int spinor_parity = arg.nParity == 2 ? parity : 0;
488  ColorSpinor<RegType,nColor,nSpin> in = arg.in(x_cb, spinor_parity);
490 
491  in.toRel(); // change to chiral basis here
492 
493 #pragma unroll
494  for (int chirality=0; chirality<2; chirality++) {
495  HMatrix<RegType,nColor*nSpin/2> A = arg.clover(x_cb, parity, chirality);
496  out += (A * in.chiral_project(chirality)).chiral_reconstruct(chirality);
497  }
498 
499  out.toNonRel(); // change basis back
500 
501  arg.out(x_cb, spinor_parity) = out;
502  }
503 
504  template <typename Float, int nSpin, int nColor, typename Arg>
505  void cloverCPU(Arg &arg) {
506  for (int parity=0; parity<arg.nParity; parity++) {
507  parity = (arg.nParity == 2) ? parity : arg.parity;
508  for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
509  }
510  }
511 
512  template <typename Float, int nSpin, int nColor, typename Arg>
513  __global__ void cloverGPU(Arg arg) {
514  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
515  int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
516  if (x_cb >= arg.volumeCB) return;
517  cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
518  }
519 
520  template <typename Float, int nSpin, int nColor, typename Arg>
521  class Clover : public TunableVectorY {
522 
523  protected:
524  Arg &arg;
526 
527  protected:
528  long long flops() const { return arg.nParity*arg.volumeCB*504ll; }
529  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes(); }
530  bool tuneGridDim() const { return false; }
531  unsigned int minThreads() const { return arg.volumeCB; }
532 
533  public:
535  {
536  strcpy(aux, meta.AuxString());
537  }
538  virtual ~Clover() { }
539 
540  void apply(const cudaStream_t &stream)
541  {
542  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
544  cloverCPU<Float,nSpin,nColor>(arg);
545  } else {
546  cloverGPU<Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
547  }
548  }
549 
550  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
551  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); } // Need to save the out field if it aliases the in field
552  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
553  };
554 
555 
556  template <typename Float, int nColor>
557  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
558  {
559  if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
560  constexpr int nSpin = 4;
563  worker.apply(streams[Nstream-1]);
564 
565  checkCudaError();
566  }
567 
568  // template on the number of colors
569  template <typename Float>
570  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
571  {
572  if (in.Ncolor() == 3) {
573  ApplyClover<Float,3>(out, in, clover, inverse, parity);
574  } else {
575  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
576  }
577  }
578 
579  //Apply the clvoer matrix field to a colorspinor field
580  //out(x) = clover*in
581  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
582  {
583  checkPrecision(out, clover, in); // check all precisions match
584  checkLocation(out, clover, in); // check all locations match
585 
586 #ifdef GPU_CLOVER_DIRAC
588  ApplyClover<double>(out, in, clover, inverse, parity);
589  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
590  ApplyClover<float>(out, in, clover, inverse, parity);
591  } else if (in.Precision() == QUDA_HALF_PRECISION) {
592  ApplyClover<short>(out, in, clover, inverse, parity);
593  } else {
594  errorQuda("Unsupported precision %d\n", in.Precision());
595  }
596 #else
597  errorQuda("Clover dslash has not been built");
598 #endif // GPU_TWISTED_MASS_DIRAC
599  }
600 
601  // if (!inverse) apply (Clover + i*a*gamma_5) to the input spinor
602  // else apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
603  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
604  __device__ __host__ inline void twistCloverApply(Arg &arg, int x_cb, int parity) {
605  using namespace linalg; // for Cholesky
606  constexpr int N = nColor*nSpin/2;
607  typedef typename mapper<Float>::type RegType;
609  typedef ColorSpinor<RegType,nColor,nSpin/2> HalfSpinor;
610  typedef HMatrix<RegType,N> Mat;
611  int spinor_parity = arg.nParity == 2 ? parity : 0;
612  Spinor in = arg.in(x_cb, spinor_parity);
613  Spinor out;
614 
615  in.toRel(); // change to chiral basis here
616 
617 #pragma unroll
618  for (int chirality=0; chirality<2; chirality++) {
619  // factor of 2 comes from clover normalization we need to correct for
620  const complex<RegType> j(0.0, chirality == 0 ? static_cast<RegType>(0.5) : -static_cast<RegType>(0.5));
621 
622  Mat A = arg.clover(x_cb, parity, chirality);
623 
624  HalfSpinor in_chi = in.chiral_project(chirality);
625  HalfSpinor out_chi = A*in_chi + j*arg.a*in_chi;
626 
627  if (inverse) {
628  if (arg.dynamic_clover) {
629  Mat A2 = A.square();
630  A2 += arg.a*arg.a*static_cast<RegType>(0.25);
631  Cholesky<HMatrix,RegType,N> cholesky(A2);
632  out_chi = static_cast<RegType>(0.25)*cholesky.backward(cholesky.forward(out_chi));
633  } else {
634  Mat Ainv = arg.cloverInv(x_cb, parity, chirality);
635  out_chi = static_cast<RegType>(2.0)*(Ainv*out_chi);
636  }
637  }
638 
639  out += (out_chi).chiral_reconstruct(chirality);
640  }
641 
642  out.toNonRel(); // change basis back
643 
644  arg.out(x_cb, spinor_parity) = out;
645  }
646 
647  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
648  void twistCloverCPU(Arg &arg) {
649  for (int parity=0; parity<arg.nParity; parity++) {
650  parity = (arg.nParity == 2) ? parity : arg.parity;
651  for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
652  }
653  }
654 
655  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
656  __global__ void twistCloverGPU(Arg arg) {
657  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
658  int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
659  if (x_cb >= arg.volumeCB) return;
660  twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
661  }
662 
663  template <typename Float, int nSpin, int nColor, typename Arg>
664  class TwistClover : public TunableVectorY {
665 
666  protected:
667  Arg &arg;
669 
670  protected:
671  long long flops() const { return (arg.inverse ? 1056ll : 552ll) * arg.nParity*arg.volumeCB; }
672  long long bytes() const {
673  long long rtn = arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes();
674  if (arg.twist == QUDA_TWIST_GAMMA5_INVERSE && !arg.dynamic_clover)
675  rtn += arg.nParity*arg.volumeCB*arg.cloverInv.Bytes();
676  return rtn;
677  }
678  bool tuneGridDim() const { return false; }
679  unsigned int minThreads() const { return arg.volumeCB; }
680 
681  public:
683  {
684  strcpy(aux, meta.AuxString());
685  strcat(aux, arg.inverse ? ",inverse" : ",direct");
686  }
687  virtual ~TwistClover() { }
688 
689  void apply(const cudaStream_t &stream)
690  {
691  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
693  if (arg.inverse) twistCloverCPU<true,Float,nSpin,nColor>(arg);
694  else twistCloverCPU<false,Float,nSpin,nColor>(arg);
695  } else {
696  if (arg.inverse) twistCloverGPU<true,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
697  else twistCloverGPU<false,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
698  }
699  }
700 
701  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
702  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); } // Need to save the out field if it aliases the in field
703  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
704  };
705 
706 
707  template <typename Float, int nColor>
709  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
710  {
711  if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
712  constexpr int nSpin = 4;
713  bool inverse = twist == QUDA_TWIST_GAMMA5_DIRECT ? false : true;
714 
715 #ifdef DYNAMIC_CLOVER
716  constexpr bool dynamic_clover = true;
717 #else
718  constexpr bool dynamic_clover = false;
719 #endif
720 
723  worker.apply(streams[Nstream-1]);
724 
725  checkCudaError();
726  }
727 
728  // template on the number of colors
729  template <typename Float>
730  void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
731  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
732  {
733  if (in.Ncolor() == 3) {
734  ApplyTwistClover<Float,3>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
735  } else {
736  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
737  }
738  }
739 
740  //Apply the twisted-clover matrix field to a colorspinor field
741  void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
742  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
743  {
744  checkPrecision(out, clover, in); // check all precisions match
745  checkLocation(out, clover, in); // check all locations match
746 
747 #ifdef GPU_CLOVER_DIRAC
749  ApplyTwistClover<double>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
750  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
751  ApplyTwistClover<float>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
752  } else if (in.Precision() == QUDA_HALF_PRECISION) {
753  ApplyTwistClover<short>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
754  } else {
755  errorQuda("Unsupported precision %d\n", in.Precision());
756  }
757 #else
758  errorQuda("Clover dslash has not been built");
759 #endif // GPU_TWISTED_MASS_DIRAC
760  }
761 
762 } // namespace quda
763 
764 #include "contract.cu"
__global__ void gammaGPU(Arg arg)
Definition: dslash_quda.cu:210
colorspinor_mapper< Float, 4, nColor >::type F
Definition: dslash_quda.cu:150
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
const int volumeCB
Definition: dslash_quda.cu:453
const ColorSpinorField & meta
Definition: dslash_quda.cu:344
bool getKernelPackT()
Definition: dslash_quda.cu:61
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const ColorSpinorField & meta
Definition: dslash_quda.cu:228
mapper< Float >::type RegType
Definition: dslash_quda.cu:443
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:90
void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
Definition: dslash_quda.cu:262
__device__ __host__ void twistCloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:604
#define host_free(ptr)
Definition: malloc_quda.h:59
unsigned int minThreads() const
Definition: dslash_quda.cu:349
cudaStream_t * streams
cudaStream_t * stream
const int Nstream
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
void postTune()
Definition: dslash_quda.cu:552
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
static constexpr bool dynamic_clover
Definition: dslash_quda.cu:439
Main header file for host and device accessors to CloverFields.
unsigned int minThreads() const
Definition: dslash_quda.cu:679
int * commsEnd_h
long long bytes() const
Definition: dslash_quda.cu:672
cudaEvent_t scatterStart[Nstream]
Definition: dslash_quda.cu:72
virtual ~Clover()
Definition: dslash_quda.cu:538
virtual ~TwistClover()
Definition: dslash_quda.cu:687
void postTune()
Definition: dslash_quda.cu:257
cudaEvent_t packEnd[2]
Definition: dslash_quda.cu:69
bool tuneGridDim() const
Definition: dslash_quda.cu:348
const int nParity
Definition: dslash_quda.cu:156
GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:163
Clover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:534
mapper< Float >::type RegType
Definition: dslash_quda.cu:151
VOLATILE spinorFloat kappa
void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
Apply the twisted-mass gamma operator to a color-spinor field.
Definition: dslash_quda.cu:384
void preTune()
Definition: dslash_quda.cu:256
Gamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:236
Worker * aux_worker
Definition: dslash_quda.cu:78
const int nColor
Definition: covdev_test.cpp:77
TuneKey tuneKey() const
Definition: dslash_quda.cu:377
cpuColorSpinorField * in
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:540
for(int s=0;s< param.dc.Ls;s++)
void createDslashEvents()
Definition: dslash_quda.cu:86
void twistCloverCPU(Arg &arg)
Definition: dslash_quda.cu:648
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
clover_mapper< Float, length >::type C
Definition: dslash_quda.cu:442
#define checkLocation(...)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:65
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:689
static bool kernelPackT
Definition: dslash_quda.cu:57
TwistClover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:682
const ColorSpinorField & meta
Definition: dslash_quda.cu:668
long long flops() const
Definition: dslash_quda.cu:671
TuneKey tuneKey() const
Definition: dslash_quda.cu:701
const int volumeCB
Definition: dslash_quda.cu:158
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:71
double gamma(double) __attribute__((availability(macosx
unsigned int minThreads() const
Definition: dslash_quda.cu:233
long long bytes() const
Definition: dslash_quda.cu:231
void preTune()
Definition: dslash_quda.cu:551
const ColorSpinorField & meta
Definition: dslash_quda.cu:525
QudaFieldLocation Location() const
static constexpr int length
Definition: dslash_quda.cu:438
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:459
void cloverCPU(Arg &arg)
Definition: dslash_quda.cu:505
TwistGamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:352
TuneKey tuneKey() const
Definition: dslash_quda.cu:254
cpuColorSpinorField * out
long long flops() const
Definition: dslash_quda.cu:230
__device__ __host__ void cloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:485
void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
Apply clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:557
Parameteter structure for driving the clover and twist-clover application kernels.
Definition: dslash_quda.cu:437
__global__ void twistCloverGPU(Arg arg)
Definition: dslash_quda.cu:656
colorspinor_mapper< Float, nSpin, nColor >::type F
Definition: dslash_quda.cu:441
unsigned int minThreads() const
Definition: dslash_quda.cu:531
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:358
if(err !=cudaSuccess)
QudaTwistGamma5Type twist
Definition: dslash_quda.cu:457
const int parity
Definition: dslash_quda.cu:450
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
virtual ~TwistGamma()
Definition: dslash_quda.cu:356
bool tuneGridDim() const
Definition: dslash_quda.cu:678
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:59
TuneKey tuneKey() const
Definition: dslash_quda.cu:550
void gammaCPU(Arg arg)
Definition: dslash_quda.cu:195
void twistGammaCPU(Arg arg)
Definition: dslash_quda.cu:300
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:427
long long flops() const
Definition: dslash_quda.cu:346
const int nParity
Definition: dslash_quda.cu:449
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
#define checkCudaError()
Definition: util_quda.h:129
#define mapped_malloc(size)
Definition: malloc_quda.h:56
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:73
virtual ~Gamma()
Definition: dslash_quda.cu:240
bool tuneGridDim() const
Definition: dslash_quda.cu:530
QudaParity parity
Definition: covdev_test.cpp:53
__global__ void twistGammaGPU(Arg arg)
Definition: dslash_quda.cu:321
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:242
__global__ void cloverGPU(Arg arg)
Definition: dslash_quda.cu:513
void destroyDslashEvents()
Definition: dslash_quda.cu:118
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
cudaEvent_t dslashStart[2]
Definition: dslash_quda.cu:74
long long bytes() const
Definition: dslash_quda.cu:347
cudaEvent_t gatherStart[Nstream]
Definition: dslash_quda.cu:70
long long flops() const
Definition: dslash_quda.cu:528
void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
Apply twisted clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:708
long long bytes() const
Definition: dslash_quda.cu:529
Parameter structure for driving the Gamma operator.
Definition: dslash_quda.cu:149
CUdeviceptr commsEnd_d[Nstream]
bool tuneGridDim() const
Definition: dslash_quda.cu:232