dslash_improved_staggered.cu
#include <dslash.h>
#include <worker.h>
#include <dslash_helper.cuh>
#include <color_spinor_field_order.h>
#include <gauge_field_order.h>
#include <color_spinor.h>
#include <index_helper.cuh>
#include <gauge_field.h>

#include <dslash_policy.cuh>
#include <kernels/dslash_staggered.cuh>

namespace quda
{

  template <typename Float, int nDim, int nColor, int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
  struct StaggeredLaunch {
    static constexpr const char *kernel = "quda::staggeredGPU"; // kernel name for jit compilation
    template <typename Dslash>
    inline static void launch(Dslash &dslash, TuneParam &tp, Arg &arg, const cudaStream_t &stream)
    {
      dslash.launch(staggeredGPU<Float, nDim, nColor, nParity, dagger, xpay, kernel_type, Arg>, tp, arg, stream);
    }
  };
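  // Note (added commentary): StaggeredLaunch is the host-side dispatch hook
  // consumed by Dslash<Float>::instantiate in Staggered::apply below: the base
  // class maps the runtime parity / dagger / xpay / kernel-type values onto
  // this struct's template parameters and then calls launch() with the fully
  // specialized staggeredGPU kernel.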

  template <typename Float, int nDim, int nColor, typename Arg> class Staggered : public Dslash<Float>
  {

protected:
    Arg &arg;
    const ColorSpinorField &in;

public:
    Staggered(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
      Dslash<Float>(arg, out, in, "kernels/dslash_staggered.cuh"),
      arg(arg),
      in(in)
    {
    }

    virtual ~Staggered() {}

    void apply(const cudaStream_t &stream)
    {
      if (in.Location() == QUDA_CPU_FIELD_LOCATION) {
        errorQuda("Staggered Dslash not implemented on CPU");
      } else {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        Dslash<Float>::setParam(arg);
        Dslash<Float>::template instantiate<StaggeredLaunch, nDim, nColor>(tp, arg, stream);
      }
    }

    /*
      per direction / dimension flops
        SU(3) matrix-vector flops = (8 Nc - 2) * Nc
        xpay = 2 * 2 * Nc * Ns

      So for the full dslash we have
        flops = (2 * 2 * Nd * (8 * Nc - 2) * Nc) + ((2 * 2 * Nd - 1) * 2 * Nc * Ns)
        flops_xpay = flops + 2 * 2 * Nc * Ns

      For Asqtad this gives 1146 for Nc = 3, Ns = 1 (staggered fields carry a
      single spin component) and 1158 for the xpay equivalent.
    */
    long long flops() const
    {
      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
      int ghost_flops = (3 + 1) * (mv_flops + 2 * in.Ncolor() * in.Nspin()); // three long-link hops plus one fat-link hop per face site
      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
      int num_dir = 2 * 4; // hard code factor of 4 in direction since fields may be 5-d

      long long flops_ = 0;

      switch (arg.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T: flops_ = ghost_flops * 2 * in.GhostFace()[arg.kernel_type]; break;
      case EXTERIOR_KERNEL_ALL: {
        long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
        flops_ = ghost_flops * ghost_sites;
        break;
      }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY: {
        long long sites = in.Volume();
        flops_ = (2 * num_dir * mv_flops + // SU(3) matrix-vector multiplies
                  (2 * num_dir - 1) * 2 * in.Ncolor() * in.Nspin()) // accumulation
          * sites;
        if (arg.xpay) flops_ += xpay_flops * sites; // axpy is always on interior

        if (arg.kernel_type == KERNEL_POLICY) break;
        // now correct for flops done by exterior kernel
        long long ghost_sites = 0;
        for (int d = 0; d < 4; d++)
          if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
        flops_ -= ghost_flops * ghost_sites;

        break;
      }
      }
      return flops_;
    }
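    // Worked check (added commentary): for the interior kernel with Nc = 3,
    // Ns = 1 and num_dir = 8, the per-site count is
    //   2 * 8 * 66 + (2 * 8 - 1) * 2 * 3 * 1 = 1056 + 90 = 1146 flops,
    // matching the Asqtad figure quoted above; xpay adds 2 * 2 * 3 * 1 = 12
    // flops per site, for 1158 total.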

    long long bytes() const
    {
      int gauge_bytes_fat = QUDA_RECONSTRUCT_NO * in.Precision();
      int gauge_bytes_long = arg.reconstruct * in.Precision();
      bool isFixed = (in.Precision() == sizeof(short) || in.Precision() == sizeof(char)); // fixed-point fields carry a float norm
      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed ? sizeof(float) : 0);
      int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat)
        + 3 * 2 * spinor_bytes; // last term is the accumulator load/store through the face
      int num_dir = 2 * 4; // set to 4-d since we take care of 5-d fermions in derived classes where necessary

      long long bytes_ = 0;

      switch (arg.kernel_type) {
      case EXTERIOR_KERNEL_X:
      case EXTERIOR_KERNEL_Y:
      case EXTERIOR_KERNEL_Z:
      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * in.GhostFace()[arg.kernel_type]; break;
      case EXTERIOR_KERNEL_ALL: {
        long long ghost_sites = 2 * (in.GhostFace()[0] + in.GhostFace()[1] + in.GhostFace()[2] + in.GhostFace()[3]);
        bytes_ = ghost_bytes * ghost_sites;
        break;
      }
      case INTERIOR_KERNEL:
      case KERNEL_POLICY: {
        long long sites = in.Volume();
        bytes_ = (num_dir * (gauge_bytes_fat + gauge_bytes_long) + // gauge reads
                  num_dir * 2 * spinor_bytes + // spinor reads
                  spinor_bytes) // spinor write
          * sites;
        if (arg.xpay) bytes_ += spinor_bytes * sites; // xpay reads the accumulator once per interior site

        if (arg.kernel_type == KERNEL_POLICY) break;
        // now correct for bytes done by exterior kernel
        long long ghost_sites = 0;
        for (int d = 0; d < 4; d++)
          if (arg.commDim[d]) ghost_sites += 2 * in.GhostFace()[d];
        bytes_ -= ghost_bytes * ghost_sites;

        break;
      }
      }
      return bytes_;
    }
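    // Worked check (added commentary): in double precision (8 bytes per real)
    // with no long-link reconstruction (QUDA_RECONSTRUCT_NO = 18 reals),
    // gauge_bytes_fat = gauge_bytes_long = 18 * 8 = 144 and spinor_bytes =
    // 2 * 3 * 1 * 8 = 48, so each interior site moves
    //   8 * (144 + 144) + 8 * 2 * 48 + 48 = 3120 bytes
    // and each ghost site 3 * (48 + 144) + (48 + 144) + 3 * 2 * 48 = 1056 bytes.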

    TuneKey tuneKey() const
    {
      return TuneKey(in.VolString(), typeid(*this).name(), Dslash<Float>::aux[arg.kernel_type]);
    }
  };

  template <typename Float, int nColor, QudaReconstructType recon_l> struct ImprovedStaggeredApply {

    inline ImprovedStaggeredApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &L,
                                  const GaugeField &U, double a, const ColorSpinorField &x, int parity, bool dagger,
                                  const int *comm_override, TimeProfile &profile)
    {
      constexpr int nDim = 4; // MWTODO: this probably should be 5 for mrhs Dslash
      constexpr bool improved = true;
      constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO;
      StaggeredArg<Float, nColor, recon_u, recon_l, improved> arg(out, in, U, L, a, x, parity, dagger, comm_override);
      Staggered<Float, nDim, nColor, decltype(arg)> staggered(arg, out, in);

      dslash::DslashPolicyTune<decltype(staggered)> policy(
        staggered, const_cast<cudaColorSpinorField *>(static_cast<const cudaColorSpinorField *>(&in)), in.VolumeCB(),
        in.GhostFaceCB(), profile);
      policy.apply(0);

      checkCudaError();
    }
  };

  void ApplyImprovedStaggered(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U,
                              const GaugeField &L, double a, const ColorSpinorField &x, int parity, bool dagger,
                              const int *comm_override, TimeProfile &profile)
  {

#ifdef GPU_STAGGERED_DIRAC
    if (in.V() == out.V()) errorQuda("Aliasing pointers");
    if (in.FieldOrder() != out.FieldOrder())
      errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());

    // check all precisions match
    checkPrecision(out, in, U, L);

    // check all locations match
    checkLocation(out, in, U, L);

    // long links are three-hop, so a partitioned dimension needs a local extent
    // of at least 6 to keep the depth-3 forward and backward halos from overlapping
    for (int i = 0; i < 4; i++) {
      if (comm_dim_partitioned(i) && (U.X()[i] < 6)) {
        errorQuda("Partitioned dimension with local size less than 6 is not supported in improved staggered dslash");
      }
    }

    // L must be first gauge field argument since we template on long reconstruct
    instantiate<ImprovedStaggeredApply, StaggeredReconstruct>(out, in, L, U, a, x, parity, dagger, comm_override,
                                                              profile);
#else
    errorQuda("Staggered dslash has not been built");
#endif
  }
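  // Usage sketch (added commentary; the gauge-field and profile names are
  // hypothetical): the Dirac operator wrappers drive this entry point with the
  // fat and long links passed separately, and with a = 0.0 the xpay
  // accumulation is disabled, so x is conventionally just set to in:
  //
  //   ApplyImprovedStaggered(out, in, fatGauge, longGauge, 0.0, in, parity,
  //                          false /* dagger */, commDim, profile);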

} // namespace quda