QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
laplace.cu
Go to the documentation of this file.
1 #include <dslash.h>
2 #include <worker.h>
3 #include <dslash_helper.cuh>
5 #include <gauge_field_order.h>
6 #include <color_spinor.h>
7 #include <dslash_helper.cuh>
8 #include <index_helper.cuh>
9 #include <gauge_field.h>
10 #include <uint_to_char.h>
11 
12 #include <dslash_policy.cuh>
13 #include <kernels/laplace.cuh>
14 
19 namespace quda
20 {
21 
26  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
27  struct LaplaceLaunch {
28 
29  // kernel name for jit compilation
30  static constexpr const char *kernel = "quda::laplaceGPU";
31 
32  template <typename Dslash>
33  inline static void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
34  {
35  dslash.launch(laplaceGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>, tp, arg, stream);
36  }
37  };
38 
39  template <typename Float, int nDim, int nColor, typename Arg> class Laplace : public Dslash<Float>
40  {
41 
42 protected:
43  Arg &arg;
45 
46 public:
47  Laplace(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
48  Dslash<Float>(arg, out, in, "kernels/laplace.cuh"),
49  arg(arg),
50  in(in)
51  {
52  }
53 
54  virtual ~Laplace() {}
55 
56  void apply(const cudaStream_t &stream)
57  {
58  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
60  Dslash<Float>::template instantiate<LaplaceLaunch, nDim, nColor>(tp, arg, stream);
61  }
62 
63  long long flops() const
64  {
65  int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
66  int num_mv_multiply = in.Nspin() == 4 ? 2 : 1;
67  int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
68  int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
69  int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3); // 3D or 4D operator
70 
71  long long flops_ = 0;
72 
73  // FIXME - should we count the xpay flops in the derived kernels
74  // since some kernels require the xpay in the exterior (preconditiond clover)
75 
76  switch (arg.kernel_type) {
77  case EXTERIOR_KERNEL_X:
78  case EXTERIOR_KERNEL_Y:
79  case EXTERIOR_KERNEL_Z:
80  case EXTERIOR_KERNEL_T:
81  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * in.GhostFace()[arg.kernel_type];
82  break;
83  case EXTERIOR_KERNEL_ALL: {
84  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
85  flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
86  break;
87  }
88  case INTERIOR_KERNEL:
89  case KERNEL_POLICY: {
90  long long sites = in.Volume();
91  flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
92  num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies
93  ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
94  * sites; // accumulation
95  if (arg.xpay) flops_ += xpay_flops * sites;
96 
97  if (arg.kernel_type == KERNEL_POLICY) break;
98  // now correct for flops done by exterior kernel
99  long long ghost_sites = 0;
100  for (int d = 0; d < 4; d++)
101  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
102  flops_ -= ghost_flops * ghost_sites;
103 
104  break;
105  }
106  }
107 
108  return flops_;
109  }
110 
111  virtual long long bytes() const
112  {
113  int gauge_bytes = arg.reconstruct * in.Precision();
114  bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)) ? true : false;
115  int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
116  int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
117  int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
118  int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3); // 3D or 4D operator
119 
120  long long bytes_ = 0;
121 
122  switch (arg.kernel_type) {
123  case EXTERIOR_KERNEL_X:
124  case EXTERIOR_KERNEL_Y:
125  case EXTERIOR_KERNEL_Z:
126  case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
127  case EXTERIOR_KERNEL_ALL: {
128  long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
129  bytes_ = ghost_bytes * ghost_sites;
130  break;
131  }
132  case INTERIOR_KERNEL:
133  case KERNEL_POLICY: {
134  long long sites = in.Volume();
135  bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
136  if (arg.xpay) bytes_ += spinor_bytes;
137 
138  if (arg.kernel_type == KERNEL_POLICY) break;
139  // now correct for bytes done by exterior kernel
140  long long ghost_sites = 0;
141  for (int d = 0; d < 4; d++)
142  if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
143  bytes_ -= ghost_bytes * ghost_sites;
144 
145  break;
146  }
147  }
148  return bytes_;
149  }
150 
151  TuneKey tuneKey() const
152  {
153  // add laplace transverse dir to the key
154  char aux[TuneKey::aux_n];
155  strcpy(aux, Dslash<Float>::aux[arg.kernel_type]);
156  strcat(aux, ",laplace=");
157  char laplace[32];
158  u32toa(laplace, arg.dir);
159  strcat(aux, laplace);
160  return TuneKey(in.VolString(), typeid(*this).name(), aux);
161  }
162  };
163 
164  template <typename Float, int nColor, QudaReconstructType recon> struct LaplaceApply {
165 
166  inline LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a,
167  const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
168  TimeProfile &profile)
169  {
170 
171  constexpr int nDim = 4;
172  LaplaceArg<Float, nColor, recon> arg(out, in, U, dir, a, x, parity, dagger, comm_override);
174 
176  laplace, const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
177  in.GhostFaceCB(), profile);
178  policy.apply(0);
179 
180  checkCudaError();
181  }
182  };
183 
184  // Apply the Laplace operator
185  // out(x) = M*in = - kappa*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
186  // Uses the kappa normalization for the Wilson operator.
187  // Omits direction 'dir' from the operator.
188  void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double kappa,
189  const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
190  {
191 
192  if (in.V() == out.V()) errorQuda("Aliasing pointers");
193  if (in.FieldOrder() != out.FieldOrder())
194  errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());
195 
196  // check all precisions match
197  checkPrecision(out, in, U);
198 
199  // check all locations match
200  checkLocation(out, in, U);
201 
202  instantiate<LaplaceApply>(out, in, U, dir, kappa, x, parity, dagger, comm_override, profile);
203  }
204 } // namespace quda
void launch(T *f, const TuneParam &tp, Arg &arg, const cudaStream_t &stream)
Definition: dslash.h:101
LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Definition: laplace.cu:166
void setParam(Arg &arg)
Definition: dslash.h:66
void apply(const cudaStream_t &stream)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
double kappa
Definition: test_util.cpp:1647
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
const ColorSpinorField & in
Definition: laplace.cu:44
const char * VolString() const
void ApplyLaplace(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double kappa, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
Driver for applying the Laplace stencil.
Definition: laplace.cu:188
virtual ~Laplace()
Definition: laplace.cu:54
This is a helper class that is used to instantiate the correct templated kernel for the dslash...
Definition: laplace.cu:27
virtual long long bytes() const
Definition: laplace.cu:111
long long flops() const
Definition: laplace.cu:63
static void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
Definition: laplace.cu:33
Parameter structure for driving the covariatnt derivative operator.
Definition: laplace.cuh:16
TuneKey tuneKey() const
Definition: laplace.cu:151
cpuColorSpinorField * in
const int * GhostFaceCB() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
#define checkLocation(...)
Main header file for host and device accessors to GaugeFields.
Arg & arg
Definition: laplace.cu:43
Laplace(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in)
Definition: laplace.cu:47
cpuColorSpinorField * out
void u32toa(char *buffer, uint32_t value)
Definition: uint_to_char.h:45
static const int aux_n
Definition: tune_key.h:12
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
const int * GhostFace() const
static constexpr const char * kernel
Definition: laplace.cu:30
__device__ __host__ void laplace(Arg &arg, int idx, int parity)
Definition: laplace.cuh:132
#define checkCudaError()
Definition: util_quda.h:161
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
QudaDagType dagger
Definition: test_util.cpp:1620
void apply(const cudaStream_t &stream)
Definition: laplace.cu:56
QudaParity parity
Definition: covdev_test.cpp:54
QudaFieldOrder FieldOrder() const