QUDA  v1.1.0
A library for QCD on GPUs
dslash_wilson_clover_preconditioned.cu
Go to the documentation of this file.
1 #include <gauge_field.h>
2 #include <color_spinor_field.h>
3 #include <clover_field.h>
4 #include <dslash.h>
5 #include <worker.h>
6 
7 #include <dslash_policy.cuh>
8 #include <kernels/dslash_wilson_clover_preconditioned.cuh>
9 
10 /**
11  This is the Wilson-clover preconditioned linear operator
12 */
13 
14 namespace quda
15 {
16 
17  template <typename Arg> class WilsonCloverPreconditioned : public Dslash<wilsonCloverPreconditioned, Arg>
18  {
19  using Dslash = Dslash<wilsonCloverPreconditioned, Arg>;
20  using Dslash::arg;
21  using Dslash::in;
22 
23  public:
24  WilsonCloverPreconditioned(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) : Dslash(arg, out, in)
25  {
26  }
27 
28  void apply(const qudaStream_t &stream)
29  {
30  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
31  Dslash::setParam(tp);
32  // specialize here to constrain the template instantiation
33  if (arg.nParity == 1) {
34  if (arg.xpay) {
35  if (arg.dagger) errorQuda("xpay operator only defined for not dagger");
36  Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
37  } else {
38  if (arg.dagger)
39  Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
40  else
41  Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
42  }
43  } else {
44  errorQuda("Preconditioned Wilson-clover operator not defined nParity=%d", arg.nParity);
45  }
46  }
47 
48  long long flops() const
49  {
50  int clover_flops = 504;
51  long long flops = Dslash::flops();
52  switch (arg.kernel_type) {
53  case EXTERIOR_KERNEL_X:
54  case EXTERIOR_KERNEL_Y:
55  case EXTERIOR_KERNEL_Z:
56  case EXTERIOR_KERNEL_T: flops += clover_flops * 2 * in.GhostFace()[arg.kernel_type]; break;
57  case EXTERIOR_KERNEL_ALL:
58  flops += clover_flops * 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
59  break;
60  case INTERIOR_KERNEL:
61  case UBER_KERNEL:
62  case KERNEL_POLICY:
63  flops += clover_flops * in.Volume();
64 
65  if (arg.kernel_type == KERNEL_POLICY) break;
66  // now correct for flops done by exterior kernel
67  long long ghost_sites = 0;
68  for (int d = 0; d < 4; d++)
69  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
70  flops -= clover_flops * ghost_sites;
71 
72  break;
73  }
74  return flops;
75  }
76 
77  long long bytes() const
78  {
79  int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
80 
81  long long bytes = Dslash::bytes();
82  switch (arg.kernel_type) {
83  case EXTERIOR_KERNEL_X:
84  case EXTERIOR_KERNEL_Y:
85  case EXTERIOR_KERNEL_Z:
86  case EXTERIOR_KERNEL_T: bytes += clover_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
87  case EXTERIOR_KERNEL_ALL:
88  bytes += clover_bytes * 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
89  break;
90  case INTERIOR_KERNEL:
91  case UBER_KERNEL:
92  case KERNEL_POLICY:
93  bytes += clover_bytes * in.Volume();
94 
95  if (arg.kernel_type == KERNEL_POLICY) break;
96  // now correct for bytes done by exterior kernel
97  long long ghost_sites = 0;
98  for (int d = 0; d < 4; d++)
99  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
100  bytes -= clover_bytes * ghost_sites;
101 
102  break;
103  }
104 
105  return bytes;
106  }
107 
108  };
109 
110  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverPreconditionedApply {
111 
112  inline WilsonCloverPreconditionedApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
113  const CloverField &A, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
114  TimeProfile &profile)
115  {
116  constexpr int nDim = 4;
117  WilsonCloverArg<Float, nColor, nDim, recon> arg(out, in, U, A, a, x, parity, dagger, comm_override);
118  WilsonCloverPreconditioned<decltype(arg)> wilson(arg, out, in);
119 
120  dslash::DslashPolicyTune<decltype(wilson)> policy(wilson,
121  const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
122  in.GhostFaceCB(), profile);
123  policy.apply(0);
124  }
125  };
126 
127  // Apply the preconditioned Wilson-clover operator
128  // out(x) = M*in = a * A(x)^{-1} (\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
129  // Uses the kappa normalization for the Wilson operator.
130  void ApplyWilsonCloverPreconditioned(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
131  const CloverField &A, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
132  TimeProfile &profile)
133  {
134 #ifdef GPU_CLOVER_DIRAC
135  instantiate<WilsonCloverPreconditionedApply>(out, in, U, A, a, x, parity, dagger, comm_override, profile);
136 #else
137  errorQuda("Clover dslash has not been built");
138 #endif
139  }
140 
141 } // namespace quda