QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
dslash_quda.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 #include <stack>
6 
7 #include <color_spinor_field.h>
8 #include <clover_field.h>
9 #include <dslash_quda.h>
11 #include <clover_field_order.h>
12 #include <index_helper.cuh>
13 #include <color_spinor.h>
14 #include <linalg.cuh>
15 #include <dslash_policy.cuh>
16 
17 namespace quda {
18 
19  // these should not be namespaced!!
20  // determines whether the temporal ghost zones are packed with a gather kernel,
21  // as opposed to multiple calls to cudaMemcpy()
22  static bool kernelPackT = false;
23 
24  void setKernelPackT(bool packT) { kernelPackT = packT; }
25 
26  bool getKernelPackT() { return kernelPackT; }
27 
28  static std::stack<bool> kptstack;
29 
30  void pushKernelPackT(bool packT)
31  {
32  kptstack.push(getKernelPackT());
33  setKernelPackT(packT);
34 
35  if (kptstack.size() > 10)
36  {
37  warningQuda("KernelPackT stack contains %u elements. Is there a missing popKernelPackT() somewhere?",
38  static_cast<unsigned int>(kptstack.size()));
39  }
40  }
41 
43  {
44  if (kptstack.empty())
45  {
46  errorQuda("popKernelPackT() called with empty stack");
47  }
48  setKernelPackT(kptstack.top());
49  kptstack.pop();
50  }
51 
52  namespace dslash {
53  int it = 0;
54 
55  cudaEvent_t packEnd[2];
56  cudaEvent_t gatherStart[Nstream];
57  cudaEvent_t gatherEnd[Nstream];
58  cudaEvent_t scatterStart[Nstream];
59  cudaEvent_t scatterEnd[Nstream];
60  cudaEvent_t dslashStart[2];
61 
62  // these variables are used for benchmarking the dslash components in isolation
68 
69  // whether the dslash policy tuner has been enabled
71 
72  // used to keep track of which policy to start the autotuning
75 
76  // list of dslash policies that are enabled
77  std::vector<QudaDslashPolicy> policies;
78 
79  // list of p2p policies that are enabled
80  std::vector<QudaP2PPolicy> p2p_policies;
81 
82  // string used as a tunekey to ensure we retune if the dslash policy env changes
84 
85  // FIX this is a hack from hell
86  // Auxiliary work that can be done while waiting on comms to finis
88 
89 #if CUDA_VERSION >= 8000
90  cuuint32_t *commsEnd_h;
91  CUdeviceptr commsEnd_d[Nstream];
92 #endif
93  }
94 
96  {
97  using namespace dslash;
98  // add cudaEventDisableTiming for lower sync overhead
99  for (int i=0; i<Nstream; i++) {
100  cudaEventCreateWithFlags(&gatherStart[i], cudaEventDisableTiming);
101  cudaEventCreateWithFlags(&gatherEnd[i], cudaEventDisableTiming);
102  cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
103  cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
104  }
105  for (int i=0; i<2; i++) {
106  cudaEventCreateWithFlags(&packEnd[i], cudaEventDisableTiming);
107  cudaEventCreateWithFlags(&dslashStart[i], cudaEventDisableTiming);
108  }
109 
110  aux_worker = NULL;
111 
112 #if CUDA_VERSION >= 8000
113  commsEnd_h = static_cast<cuuint32_t*>(mapped_malloc(Nstream*sizeof(int)));
114  for (int i=0; i<Nstream; i++) {
115  cudaHostGetDevicePointer((void**)&commsEnd_d[i], commsEnd_h+i, 0);
116  commsEnd_h[i] = 0;
117  }
118 #endif
119 
120  checkCudaError();
121 
122  dslash_pack_compute = true;
125  dslash_comms = true;
126  dslash_copy = true;
127 
128  dslash_policy_init = false;
131 
132  // list of dslash policies that are enabled
133  policies = std::vector<QudaDslashPolicy>(
135 
136  // list of p2p policies that are enabled
137  p2p_policies = std::vector<QudaP2PPolicy>(
139 
140  strcat(policy_string, ",pol=");
141  }
142 
143 
145  {
146  using namespace dslash;
147 
148 #if CUDA_VERSION >= 8000
149  host_free(commsEnd_h);
150  commsEnd_h = 0;
151 #endif
152 
153  for (int i=0; i<Nstream; i++) {
154  cudaEventDestroy(gatherStart[i]);
155  cudaEventDestroy(gatherEnd[i]);
156  cudaEventDestroy(scatterStart[i]);
157  cudaEventDestroy(scatterEnd[i]);
158  }
159 
160  for (int i=0; i<2; i++) {
161  cudaEventDestroy(packEnd[i]);
162  cudaEventDestroy(dslashStart[i]);
163  }
164 
165  checkCudaError();
166  }
167 
171  template <typename Float, int nColor>
172  struct GammaArg {
174  typedef typename mapper<Float>::type RegType;
175 
176  F out; // output vector field
177  const F in; // input vector field
178  const int d; // which gamma matrix are we applying
179  const int nParity; // number of parities we're working on
180  bool doublet; // whether we applying the operator to a doublet
181  const int volumeCB; // checkerboarded volume
182  RegType a; // scale factor
183  RegType b; // chiral twist
184  RegType c; // flavor twist
185 
187  RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
189  : out(out), in(in), d(d), nParity(in.SiteSubset()),
190  doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
191  volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0)
192  {
193  if (d < 0 || d > 4) errorQuda("Undefined gamma matrix %d", d);
194  if (in.Nspin() != 4) errorQuda("Cannot apply gamma5 to nSpin=%d field", in.Nspin());
195  if (!in.isNative() || !out.isNative()) errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder());
196 
197  if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
198  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
199  b = 2.0 * kappa * mu;
200  a = 1.0;
201  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
202  b = -2.0 * kappa * mu;
203  a = 1.0 / (1.0 + b * b);
204  }
205  c = 0.0;
206  if (dagger) b *= -1.0;
207  } else if (doublet) {
208  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
209  b = 2.0 * kappa * mu;
210  c = -2.0 * kappa * epsilon;
211  a = 1.0;
212  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
213  b = -2.0 * kappa * mu;
214  c = 2.0 * kappa * epsilon;
215  a = 1.0 / (1.0 + b * b - c * c);
216  if (a <= 0) errorQuda("Invalid twisted mass parameters (kappa=%e, mu=%e, epsilon=%e)\n", kappa, mu, epsilon);
217  }
218  if (dagger) b *= -1.0;
219  }
220  }
221  };
222 
223  // CPU kernel for applying the gamma matrix to a colorspinor
224  template <typename Float, int nColor, typename Arg>
226  {
227  typedef typename mapper<Float>::type RegType;
228  for (int parity= 0; parity < arg.nParity; parity++) {
229 
230  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
231  ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
232  arg.out(x_cb, parity) = in.gamma(arg.d);
233  } // 4-d volumeCB
234  } // parity
235 
236  }
237 
238  // GPU Kernel for applying the gamma matrix to a colorspinor
239  template <typename Float, int nColor, int d, typename Arg>
240  __global__ void gammaGPU(Arg arg)
241  {
242  typedef typename mapper<Float>::type RegType;
243  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
244  int parity = blockDim.y*blockIdx.y + threadIdx.y;
245 
246  if (x_cb >= arg.volumeCB) return;
247  if (parity >= arg.nParity) return;
248 
249  ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
250  arg.out(x_cb, parity) = in.gamma(d);
251  }
252 
253  template <typename Float, int nColor, typename Arg>
254  class Gamma : public TunableVectorY {
255 
256  protected:
259 
260  long long flops() const { return 0; }
261  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
262  bool tuneGridDim() const { return false; }
263  unsigned int minThreads() const { return arg.volumeCB; }
264 
265  public:
266  Gamma(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
267  {
268  strcpy(aux, meta.AuxString());
269  }
270  virtual ~Gamma() { }
271 
272  void apply(const cudaStream_t &stream) {
273  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
274  gammaCPU<Float,nColor>(arg);
275  } else {
276  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
277  switch (arg.d) {
278  case 4: gammaGPU<Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
279  default: errorQuda("%d not instantiated", arg.d);
280  }
281  }
282  }
283 
284  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
285 
286  void preTune() { arg.out.save(); }
287  void postTune() { arg.out.load(); }
288  };
289 
290 
291  template <typename Float, int nColor>
293  {
294  GammaArg<Float,nColor> arg(out, in, d);
296  gamma.apply(streams[Nstream-1]);
297  }
298 
299  // template on the number of colors
300  template <typename Float>
302  {
303  if (in.Ncolor() == 3) {
304  ApplyGamma<Float,3>(out, in, d);
305  } else {
306  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
307  }
308  }
309 
310  //Apply the Gamma matrix to a colorspinor field
311  //out(x) = gamma_d*in
312  void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
313  {
314  checkPrecision(out, in); // check all precisions match
315  checkLocation(out, in); // check all locations match
316 
317  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
318  ApplyGamma<double>(out, in, d);
319  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
320  ApplyGamma<float>(out, in, d);
321  } else if (in.Precision() == QUDA_HALF_PRECISION) {
322  ApplyGamma<short>(out, in, d);
323  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
324  ApplyGamma<char>(out, in, d);
325  } else {
326  errorQuda("Unsupported precision %d\n", in.Precision());
327  }
328  }
329 
330  // CPU kernel for applying the gamma matrix to a colorspinor
331  template <bool doublet, typename Float, int nColor, typename Arg>
333  {
334  typedef typename mapper<Float>::type RegType;
335  for (int parity= 0; parity < arg.nParity; parity++) {
336  for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
337  if (!doublet) {
338  ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
339  arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(arg.d));
340  } else {
341  ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
342  ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
343  arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(arg.d) + arg.c * in_2);
344  arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(arg.d) + arg.c * in_1);
345  }
346  } // 4-d volumeCB
347  } // parity
348 
349  }
350 
351  // GPU Kernel for applying the gamma matrix to a colorspinor
352  template <bool doublet, typename Float, int nColor, int d, typename Arg>
353  __global__ void twistGammaGPU(Arg arg)
354  {
355  typedef typename mapper<Float>::type RegType;
356  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
357  int parity = blockDim.y*blockIdx.y + threadIdx.y;
358  if (x_cb >= arg.volumeCB) return;
359 
360  if (!doublet) {
361  ColorSpinor<RegType,nColor,4> in = arg.in(x_cb, parity);
362  arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(d));
363  } else {
364  ColorSpinor<RegType,nColor,4> in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
365  ColorSpinor<RegType,nColor,4> in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
366  arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(d) + arg.c * in_2);
367  arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(d) + arg.c * in_1);
368  }
369  }
370 
371  template <typename Float, int nColor, typename Arg>
372  class TwistGamma : public TunableVectorY {
373 
374  protected:
377 
378  long long flops() const { return 0; }
379  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes(); }
380  bool tuneGridDim() const { return false; }
381  unsigned int minThreads() const { return arg.volumeCB; }
382 
383  public:
384  TwistGamma(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
385  {
386  strcpy(aux, meta.AuxString());
387  }
388  virtual ~TwistGamma() { }
389 
390  void apply(const cudaStream_t &stream) {
391  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
392  if (arg.doublet) twistGammaCPU<true,Float,nColor>(arg);
393  twistGammaCPU<false,Float,nColor>(arg);
394  } else {
395  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
396  if (arg.doublet)
397  switch (arg.d) {
398  case 4: twistGammaGPU<true,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
399  default: errorQuda("%d not instantiated", arg.d);
400  }
401  else
402  switch (arg.d) {
403  case 4: twistGammaGPU<false,Float,nColor,4> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg); break;
404  default: errorQuda("%d not instantiated", arg.d);
405  }
406  }
407  }
408 
409  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
410  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); }
411  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); }
412  };
413 
414 
415  template <typename Float, int nColor>
416  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
417  {
418  GammaArg<Float,nColor> arg(out, in, d, kappa, mu, epsilon, dagger, type);
420  gamma.apply(streams[Nstream-1]);
421 
422  checkCudaError();
423  }
424 
425  // template on the number of colors
426  template <typename Float>
427  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
428  {
429  if (in.Ncolor() == 3) {
430  ApplyTwistGamma<Float,3>(out, in, d, kappa, mu, epsilon, dagger, type);
431  } else {
432  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
433  }
434  }
435 
436  //Apply the Gamma matrix to a colorspinor field
437  //out(x) = gamma_d*in
438  void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
439  {
440  checkPrecision(out, in); // check all precisions match
441  checkLocation(out, in); // check all locations match
442 
443 #ifdef GPU_TWISTED_MASS_DIRAC
444  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
445  ApplyTwistGamma<double>(out, in, d, kappa, mu, epsilon, dagger, type);
446  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
447  ApplyTwistGamma<float>(out, in, d, kappa, mu, epsilon, dagger, type);
448  } else if (in.Precision() == QUDA_HALF_PRECISION) {
449  ApplyTwistGamma<short>(out, in, d, kappa, mu, epsilon, dagger, type);
450  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
451  ApplyTwistGamma<char>(out, in, d, kappa, mu, epsilon, dagger, type);
452  } else {
453  errorQuda("Unsupported precision %d\n", in.Precision());
454  }
455 #else
456  errorQuda("Twisted mass dslash has not been built");
457 #endif // GPU_TWISTED_MASS_DIRAC
458  }
459 
460  // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
461  void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); }
462 
470  template <typename Float, int nSpin, int nColor, bool dynamic_clover_=false>
471  struct CloverArg {
472  static constexpr int length = (nSpin / (nSpin/2)) * 2 * nColor * nColor * (nSpin/2) * (nSpin/2) / 2;
473  static constexpr bool dynamic_clover = dynamic_clover_;
474 
477  typedef typename mapper<Float>::type RegType;
478 
479  F out; // output vector field
480  const F in; // input vector field
481  const C clover; // clover field
482  const C cloverInv; // inverse clover field (only set if not dynamic clover and doing twisted clover)
483  const int nParity; // number of parities we're working on
484  const int parity; // which parity we're acting on (if nParity=1)
485  bool inverse; // whether we are applying the inverse
486  bool doublet; // whether we applying the operator to a doublet
487  const int volumeCB; // checkerboarded volume
488  RegType a;
489  RegType b;
490  RegType c;
492 
493  CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
494  bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0,
495  bool dagger = false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
496  : out(out), clover(clover, twist == QUDA_TWIST_GAMMA5_INVALID ? inverse : false),
497  cloverInv(clover, (twist != QUDA_TWIST_GAMMA5_INVALID && !dynamic_clover) ? true : false),
498  in(in), nParity(in.SiteSubset()), parity(parity), inverse(inverse),
499  doublet(in.TwistFlavor() == QUDA_TWIST_DEG_DOUBLET || in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
500  volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0), twist(twist)
501  {
502  if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
503  if (twist == QUDA_TWIST_GAMMA5_DIRECT) {
504  a = 2.0 * kappa * mu;
505  b = 1.0;
506  } else if (twist == QUDA_TWIST_GAMMA5_INVERSE) {
507  a = -2.0 * kappa * mu;
508  b = 1.0 / (1.0 + a*a);
509  }
510  c = 0.0;
511  if (dagger) a *= -1.0;
512  } else if (doublet) {
513  errorQuda("ERROR: Non-degenerated twisted-mass not supported in this regularization\n");
514  }
515  }
516  };
517 
518  template <typename Float, int nSpin, int nColor, typename Arg>
519  __device__ __host__ inline void cloverApply(Arg &arg, int x_cb, int parity) {
520  using namespace linalg; // for Cholesky
521  typedef typename mapper<Float>::type RegType;
523  typedef ColorSpinor<RegType, nColor, nSpin / 2> HalfSpinor;
524  int spinor_parity = arg.nParity == 2 ? parity : 0;
525  Spinor in = arg.in(x_cb, spinor_parity);
526  Spinor out;
527 
528  in.toRel(); // change to chiral basis here
529 
530 #pragma unroll
531  for (int chirality=0; chirality<2; chirality++) {
532 
533  HMatrix<RegType,nColor*nSpin/2> A = arg.clover(x_cb, parity, chirality);
534  HalfSpinor chi = in.chiral_project(chirality);
535 
536  if (arg.dynamic_clover) {
537  Cholesky<HMatrix, RegType, nColor * nSpin / 2> cholesky(A);
538  chi = static_cast<RegType>(0.25) * cholesky.backward(cholesky.forward(chi));
539  } else {
540  chi = A * chi;
541  }
542 
543  out += chi.chiral_reconstruct(chirality);
544  }
545 
546  out.toNonRel(); // change basis back
547 
548  arg.out(x_cb, spinor_parity) = out;
549  }
550 
551  template <typename Float, int nSpin, int nColor, typename Arg>
552  void cloverCPU(Arg &arg) {
553  for (int parity=0; parity<arg.nParity; parity++) {
554  parity = (arg.nParity == 2) ? parity : arg.parity;
555  for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
556  }
557  }
558 
559  template <typename Float, int nSpin, int nColor, typename Arg>
560  __global__ void cloverGPU(Arg arg) {
561  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
562  int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
563  if (x_cb >= arg.volumeCB) return;
564  cloverApply<Float,nSpin,nColor>(arg, x_cb, parity);
565  }
566 
567  template <typename Float, int nSpin, int nColor, typename Arg>
568  class Clover : public TunableVectorY {
569 
570  protected:
573 
574  protected:
575  long long flops() const { return arg.nParity*arg.volumeCB*504ll; }
576  long long bytes() const { return arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes(); }
577  bool tuneGridDim() const { return false; }
578  unsigned int minThreads() const { return arg.volumeCB; }
579 
580  public:
581  Clover(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
582  {
583  strcpy(aux, meta.AuxString());
584  }
585  virtual ~Clover() { }
586 
587  void apply(const cudaStream_t &stream)
588  {
589  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
590  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
591  cloverCPU<Float,nSpin,nColor>(arg);
592  } else {
593  cloverGPU<Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
594  }
595  }
596 
597  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
598  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); } // Need to save the out field if it aliases the in field
599  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
600  };
601 
602 
603  template <typename Float, int nColor>
604  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
605  {
606  if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
607  constexpr int nSpin = 4;
608 
609  if (inverse) {
610 #ifdef DYNAMIC_CLOVER
611  constexpr bool dynamic_clover = true;
612 #else
613  constexpr bool dynamic_clover = false;
614 #endif
615  CloverArg<Float, nSpin, nColor, dynamic_clover> arg(out, in, clover, inverse, parity);
617  worker.apply(streams[Nstream - 1]);
618  } else {
619  CloverArg<Float, nSpin, nColor, false> arg(out, in, clover, inverse, parity);
621  worker.apply(streams[Nstream - 1]);
622  }
623 
624  checkCudaError();
625  }
626 
627  // template on the number of colors
628  template <typename Float>
629  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
630  {
631  if (in.Ncolor() == 3) {
632  ApplyClover<Float,3>(out, in, clover, inverse, parity);
633  } else {
634  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
635  }
636  }
637 
638  //Apply the clvoer matrix field to a colorspinor field
639  //out(x) = clover*in
640  void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
641  {
642  checkPrecision(out, clover, in); // check all precisions match
643  checkLocation(out, clover, in); // check all locations match
644 
645 #ifdef GPU_CLOVER_DIRAC
646  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
647  ApplyClover<double>(out, in, clover, inverse, parity);
648  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
649  ApplyClover<float>(out, in, clover, inverse, parity);
650  } else if (in.Precision() == QUDA_HALF_PRECISION) {
651  ApplyClover<short>(out, in, clover, inverse, parity);
652  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
653  ApplyClover<char>(out, in, clover, inverse, parity);
654  } else {
655  errorQuda("Unsupported precision %d\n", in.Precision());
656  }
657 #else
658  errorQuda("Clover dslash has not been built");
659 #endif // GPU_TWISTED_MASS_DIRAC
660  }
661 
662  // if (!inverse) apply (Clover + i*a*gamma_5) to the input spinor
663  // else apply (Clover + i*a*gamma_5)/(Clover^2 + a^2) to the input spinor
664  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
665  __device__ __host__ inline void twistCloverApply(Arg &arg, int x_cb, int parity) {
666  using namespace linalg; // for Cholesky
667  constexpr int N = nColor*nSpin/2;
668  typedef typename mapper<Float>::type RegType;
670  typedef ColorSpinor<RegType,nColor,nSpin/2> HalfSpinor;
671  typedef HMatrix<RegType,N> Mat;
672  int spinor_parity = arg.nParity == 2 ? parity : 0;
673  Spinor in = arg.in(x_cb, spinor_parity);
674  Spinor out;
675 
676  in.toRel(); // change to chiral basis here
677 
678 #pragma unroll
679  for (int chirality=0; chirality<2; chirality++) {
680  // factor of 2 comes from clover normalization we need to correct for
681  const complex<RegType> j(0.0, chirality == 0 ? static_cast<RegType>(0.5) : -static_cast<RegType>(0.5));
682 
683  Mat A = arg.clover(x_cb, parity, chirality);
684 
685  HalfSpinor in_chi = in.chiral_project(chirality);
686  HalfSpinor out_chi = A*in_chi + j*arg.a*in_chi;
687 
688  if (inverse) {
689  if (arg.dynamic_clover) {
690  Mat A2 = A.square();
691  A2 += arg.a*arg.a*static_cast<RegType>(0.25);
692  Cholesky<HMatrix,RegType,N> cholesky(A2);
693  out_chi = static_cast<RegType>(0.25)*cholesky.backward(cholesky.forward(out_chi));
694  } else {
695  Mat Ainv = arg.cloverInv(x_cb, parity, chirality);
696  out_chi = static_cast<RegType>(2.0)*(Ainv*out_chi);
697  }
698  }
699 
700  out += (out_chi).chiral_reconstruct(chirality);
701  }
702 
703  out.toNonRel(); // change basis back
704 
705  arg.out(x_cb, spinor_parity) = out;
706  }
707 
708  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
710  for (int parity=0; parity<arg.nParity; parity++) {
711  parity = (arg.nParity == 2) ? parity : arg.parity;
712  for (int x_cb=0; x_cb<arg.volumeCB; x_cb++) twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
713  }
714  }
715 
716  template <bool inverse, typename Float, int nSpin, int nColor, typename Arg>
717  __global__ void twistCloverGPU(Arg arg) {
718  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
719  int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
720  if (x_cb >= arg.volumeCB) return;
721  twistCloverApply<inverse,Float,nSpin,nColor>(arg, x_cb, parity);
722  }
723 
724  template <typename Float, int nSpin, int nColor, typename Arg>
725  class TwistClover : public TunableVectorY {
726 
727  protected:
730 
731  protected:
732  long long flops() const { return (arg.inverse ? 1056ll : 552ll) * arg.nParity*arg.volumeCB; }
733  long long bytes() const {
734  long long rtn = arg.out.Bytes() + arg.in.Bytes() + arg.nParity*arg.volumeCB*arg.clover.Bytes();
735  if (arg.twist == QUDA_TWIST_GAMMA5_INVERSE && !arg.dynamic_clover)
736  rtn += arg.nParity*arg.volumeCB*arg.cloverInv.Bytes();
737  return rtn;
738  }
739  bool tuneGridDim() const { return false; }
740  unsigned int minThreads() const { return arg.volumeCB; }
741 
742  public:
743  TwistClover(Arg &arg, const ColorSpinorField &meta) : TunableVectorY(arg.nParity), arg(arg), meta(meta)
744  {
745  strcpy(aux, meta.AuxString());
746  strcat(aux, arg.inverse ? ",inverse" : ",direct");
747  }
748  virtual ~TwistClover() { }
749 
750  void apply(const cudaStream_t &stream)
751  {
752  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
753  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
754  if (arg.inverse) twistCloverCPU<true,Float,nSpin,nColor>(arg);
755  else twistCloverCPU<false,Float,nSpin,nColor>(arg);
756  } else {
757  if (arg.inverse) twistCloverGPU<true,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
758  else twistCloverGPU<false,Float,nSpin,nColor> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
759  }
760  }
761 
762  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
763  void preTune() { if (arg.out.field == arg.in.field) arg.out.save(); } // Need to save the out field if it aliases the in field
764  void postTune() { if (arg.out.field == arg.in.field) arg.out.load(); } // Restore if the in and out fields alias
765  };
766 
767 
768  template <typename Float, int nColor>
769  void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
770  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
771  {
772  if (in.Nspin() != 4) errorQuda("Unsupported nSpin=%d", in.Nspin());
773  constexpr int nSpin = 4;
774  bool inverse = twist == QUDA_TWIST_GAMMA5_DIRECT ? false : true;
775 
776 #ifdef DYNAMIC_CLOVER
777  constexpr bool dynamic_clover = true;
778 #else
779  constexpr bool dynamic_clover = false;
780 #endif
781 
782  CloverArg<Float,nSpin,nColor,dynamic_clover> arg(out, in, clover, inverse, parity, kappa, mu, epsilon, dagger, twist);
784  worker.apply(streams[Nstream-1]);
785 
786  checkCudaError();
787  }
788 
789  // template on the number of colors
790  template <typename Float>
791  void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
792  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
793  {
794  if (in.Ncolor() == 3) {
795  ApplyTwistClover<Float,3>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
796  } else {
797  errorQuda("Unsupported number of colors %d\n", in.Ncolor());
798  }
799  }
800 
801  //Apply the twisted-clover matrix field to a colorspinor field
802  void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover,
803  double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
804  {
805  checkPrecision(out, clover, in); // check all precisions match
806  checkLocation(out, clover, in); // check all locations match
807 
808 #ifdef GPU_CLOVER_DIRAC
809  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
810  ApplyTwistClover<double>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
811  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
812  ApplyTwistClover<float>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
813  } else if (in.Precision() == QUDA_HALF_PRECISION) {
814  ApplyTwistClover<short>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
815  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
816  ApplyTwistClover<char>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
817  } else {
818  errorQuda("Unsupported precision %d\n", in.Precision());
819  }
820 #else
821  errorQuda("Clover dslash has not been built");
822 #endif // GPU_TWISTED_MASS_DIRAC
823  }
824 
825 } // namespace quda
bool dslash_exterior_compute
Definition: dslash_quda.cu:65
__global__ void gammaGPU(Arg arg)
Definition: dslash_quda.cu:240
colorspinor_mapper< Float, 4, nColor >::type F
Definition: dslash_quda.cu:173
bool dslash_interior_compute
Definition: dslash_quda.cu:64
cudaEvent_t scatterStart[Nstream]
Definition: dslash_quda.cu:58
cudaEvent_t gatherStart[Nstream]
Definition: dslash_quda.cu:56
double mu
Definition: test_util.cpp:1648
const int volumeCB
Definition: dslash_quda.cu:487
const ColorSpinorField & meta
Definition: dslash_quda.cu:376
const char * AuxString() const
bool getKernelPackT()
Definition: dslash_quda.cu:26
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
double kappa
Definition: test_util.cpp:1647
const ColorSpinorField & meta
Definition: dslash_quda.cu:258
mapper< Float >::type RegType
Definition: dslash_quda.cu:477
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:121
void ApplyGamma(ColorSpinorField &out, const ColorSpinorField &in, int d)
Definition: dslash_quda.cu:292
__device__ __host__ void twistCloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:665
#define host_free(ptr)
Definition: malloc_quda.h:71
unsigned int minThreads() const
Definition: dslash_quda.cu:381
double epsilon
Definition: test_util.cpp:1649
cudaStream_t * streams
cudaStream_t * stream
const int Nstream
Definition: quda_internal.h:83
__device__ __host__ complex< ValueType > apply(int row, const complex< ValueType > &a) const
Definition: gamma.cuh:221
const char * VolString() const
void postTune()
Definition: dslash_quda.cu:599
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
Main header file for host and device accessors to CloverFields.
unsigned int minThreads() const
Definition: dslash_quda.cu:740
long long bytes() const
Definition: dslash_quda.cu:733
int length[]
bool dslash_policy_init
Definition: dslash_quda.cu:70
cudaEvent_t dslashStart[2]
Definition: dslash_quda.cu:60
virtual ~Clover()
Definition: dslash_quda.cu:585
virtual ~TwistClover()
Definition: dslash_quda.cu:748
void postTune()
Definition: dslash_quda.cu:287
std::vector< QudaP2PPolicy > p2p_policies
Definition: dslash_quda.cu:80
char policy_string[TuneKey::aux_n]
Definition: dslash_quda.cu:83
void popKernelPackT()
Definition: dslash_quda.cu:42
bool tuneGridDim() const
Definition: dslash_quda.cu:380
const int nParity
Definition: dslash_quda.cu:179
std::vector< QudaDslashPolicy > policies
Definition: dslash_quda.cu:77
bool dslash_comms
Definition: dslash_quda.cu:66
bool dslash_copy
Definition: dslash_quda.cu:67
GammaArg(ColorSpinorField &out, const ColorSpinorField &in, int d, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:186
Clover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:581
mapper< Float >::type RegType
Definition: dslash_quda.cu:174
void ApplyTwistGamma(ColorSpinorField &out, const ColorSpinorField &in, int d, double kappa, double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
Apply the twisted-mass gamma operator to a color-spinor field.
Definition: dslash_quda.cu:416
void preTune()
Definition: dslash_quda.cu:286
Gamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:266
Worker * aux_worker
Definition: dslash_quda.cu:87
const int nColor
Definition: covdev_test.cpp:75
TuneKey tuneKey() const
Definition: dslash_quda.cu:409
cpuColorSpinorField * in
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:587
void createDslashEvents()
Definition: dslash_quda.cu:95
void twistCloverCPU(Arg &arg)
Definition: dslash_quda.cu:709
static std::stack< bool > kptstack
Definition: dslash_quda.cu:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
clover_mapper< Float, length >::type C
Definition: dslash_quda.cu:476
#define warningQuda(...)
Definition: util_quda.h:133
#define checkLocation(...)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:61
cudaEvent_t packEnd[2]
Definition: dslash_quda.cu:55
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:750
static bool kernelPackT
Definition: dslash_quda.cu:22
TwistClover(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:743
const ColorSpinorField & meta
Definition: dslash_quda.cu:729
long long flops() const
Definition: dslash_quda.cu:732
TuneKey tuneKey() const
Definition: dslash_quda.cu:762
const int volumeCB
Definition: dslash_quda.cu:181
unsigned int minThreads() const
Definition: dslash_quda.cu:263
int first_active_p2p_policy
Definition: dslash_quda.cu:74
long long bytes() const
Definition: dslash_quda.cu:261
void preTune()
Definition: dslash_quda.cu:598
const ColorSpinorField & meta
Definition: dslash_quda.cu:572
QudaFieldLocation Location() const
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:493
void cloverCPU(Arg &arg)
Definition: dslash_quda.cu:552
TwistGamma(Arg &arg, const ColorSpinorField &meta)
Definition: dslash_quda.cu:384
TuneKey tuneKey() const
Definition: dslash_quda.cu:284
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611
cpuColorSpinorField * out
const int nParity
Definition: spinor_noise.cu:25
long long flops() const
Definition: dslash_quda.cu:260
__device__ __host__ void cloverApply(Arg &arg, int x_cb, int parity)
Definition: dslash_quda.cu:519
void ApplyClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity)
Apply clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:604
Parameteter structure for driving the clover and twist-clover application kernels.
Definition: dslash_quda.cu:471
__global__ void twistCloverGPU(Arg arg)
Definition: dslash_quda.cu:717
colorspinor_mapper< Float, nSpin, nColor >::type F
Definition: dslash_quda.cu:475
unsigned int minThreads() const
Definition: dslash_quda.cu:578
static const int aux_n
Definition: tune_key.h:12
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:390
bool dslash_pack_compute
Definition: dslash_quda.cu:63
cudaEvent_t scatterEnd[Nstream]
Definition: dslash_quda.cu:59
QudaTwistFlavorType TwistFlavor() const
QudaTwistGamma5Type twist
Definition: dslash_quda.cu:491
const int parity
Definition: dslash_quda.cu:484
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
virtual ~TwistGamma()
Definition: dslash_quda.cu:388
const int volumeCB
Definition: spinor_noise.cu:26
bool tuneGridDim() const
Definition: dslash_quda.cu:739
void setKernelPackT(bool pack)
Definition: dslash_quda.cu:24
TuneKey tuneKey() const
Definition: dslash_quda.cu:597
void gammaCPU(Arg arg)
Definition: dslash_quda.cu:225
void twistGammaCPU(Arg arg)
Definition: dslash_quda.cu:332
void gamma5(ColorSpinorField &out, const ColorSpinorField &in)
Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
Definition: dslash_quda.cu:461
long long flops() const
Definition: dslash_quda.cu:378
const int nParity
Definition: dslash_quda.cu:483
enum QudaTwistGamma5Type_s QudaTwistGamma5Type
void pushKernelPackT(bool pack)
Definition: dslash_quda.cu:30
#define checkCudaError()
Definition: util_quda.h:161
#define mapped_malloc(size)
Definition: malloc_quda.h:68
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
virtual ~Gamma()
Definition: dslash_quda.cu:270
QudaDagType dagger
Definition: test_util.cpp:1620
bool tuneGridDim() const
Definition: dslash_quda.cu:577
QudaParity parity
Definition: covdev_test.cpp:54
__global__ void twistGammaGPU(Arg arg)
Definition: dslash_quda.cu:353
void apply(const cudaStream_t &stream)
Definition: dslash_quda.cu:272
__global__ void cloverGPU(Arg arg)
Definition: dslash_quda.cu:560
void destroyDslashEvents()
Definition: dslash_quda.cu:144
QudaFieldOrder FieldOrder() const
int first_active_policy
Definition: dslash_quda.cu:73
long long bytes() const
Definition: dslash_quda.cu:379
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:57
long long flops() const
Definition: dslash_quda.cu:575
void ApplyTwistClover(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger, QudaTwistGamma5Type twist)
Apply twisted clover-matrix field to a color-spinor field.
Definition: dslash_quda.cu:769
long long bytes() const
Definition: dslash_quda.cu:576
Parameter structure for driving the Gamma operator.
Definition: dslash_quda.cu:172
bool tuneGridDim() const
Definition: dslash_quda.cu:262