QUDA  v1.1.0
A library for QCD on GPUs
dslash5_mobius_eofa.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
2 #include <color_spinor_field_order.h>
3 #include <index_helper.cuh>
4 #include <instantiate.h>
5 #include <instantiate_dslash.h>
6 
7 #include <kernels/dslash_mobius_eofa.cuh>
8 
9 namespace quda
10 {
11  namespace mobius_eofa
12  {
/**
   @brief Host-side tunable wrapper that launches the fifth-dimension
   Mobius EOFA kernels (Dslash5Type M5_EOFA and M5INV_EOFA).

   The constructor builds the kernel argument struct, composes the
   autotuning aux string and immediately launches the kernel, so
   constructing an instance is what applies the operator.

   @tparam storage_type Storage type forwarded to Dslash5Arg and to the kernel
   @tparam nColor Number of colors forwarded to Dslash5Arg and to the kernel
 */
template <typename storage_type, int nColor> class Dslash5 : public TunableVectorYZ
{
  Dslash5Arg<storage_type, nColor> arg; // kernel argument struct (passed to the kernel at launch)
  const ColorSpinorField &meta;         // field queried for volume / Ls / aux-string meta data
  static constexpr bool shared = true;  // whether to use shared memory cache blocking for M5inv

  /**
     @brief Flop estimate reported for this kernel.
     @return Estimated number of floating point operations
   */
  long long flops() const
  {
    // FIXME: Fix the flop count
    long long Ls = meta.X(4);
    long long bulk = (Ls - 2) * (meta.Volume() / Ls); // sites on the Ls - 2 interior slices
    long long wall = 2 * meta.Volume() / Ls;          // sites on the two boundary slices
    long long n = meta.Ncolor() * meta.Nspin();

    long long flops_ = 0;
    switch (arg.type) {
    case M5_EOFA:
    case M5INV_EOFA: flops_ = n * (8ll * bulk + 10ll * wall + (arg.xpay ? 4ll * meta.Volume() : 0)); break;
    default: errorQuda("Unknown Dslash5Type %d for EOFA", arg.type);
    }

    return flops_;
  }

  /**
     @brief Memory traffic estimate reported for this kernel: the output
     field, twice the input field, plus the x field when xpay is enabled.
     @return Estimated number of bytes moved
   */
  long long bytes() const
  {
    switch (arg.type) {
    case M5_EOFA:
    case M5INV_EOFA: return arg.out.Bytes() + 2 * arg.in.Bytes() + (arg.xpay ? arg.x.Bytes() : 0);
    default: errorQuda("Unknown Dslash5Type %d for EOFA", arg.type);
    }
    return 0ll;
  }

  // The grid dimension is not autotuned for this kernel
  bool tuneGridDim() const { return false; }

  // Minimum thread count reported to the tuner: the 4-d checkerboarded volume held in arg
  unsigned int minThreads() const { return arg.volume_4d_cb; }

  // Step size and lower bound reported to the tuner for the block size
  int blockStep() const { return 4; }
  int blockMin() const { return 4; }

  /**
     @brief Dynamic shared memory requested per thread.
     @return 2 * nSpin * nColor elements of mapper<storage_type>::type
     (NOTE(review): presumably one complex color-spinor per thread, the
     factor 2 being real + imaginary parts -- confirm against the kernel)
   */
  unsigned int sharedBytesPerThread() const
  {
    // spin components in shared depend on inversion algorithm
    int nSpin = meta.Nspin();
    return 2 * nSpin * nColor * sizeof(typename mapper<storage_type>::type);
  }

  // No per-block shared memory is requested on top of the per-thread amount
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  /**
     @brief Upper bound on the shared memory per block reported to the tuner.
     Overloaded to return the max dynamic shared memory when the
     shared-memory code path is enabled for a supported operator type;
     otherwise defers to TunableVectorYZ.
   */
  unsigned int maxSharedBytesPerBlock() const
  {
    if (shared && (arg.type == M5_EOFA || arg.type == M5INV_EOFA)) {
      return maxDynamicSharedBytesPerBlock();
    } else {
      return TunableVectorYZ::maxSharedBytesPerBlock();
    }
  }

  /**
     @brief Last dispatch level: turn the runtime dagger flag into the
     compile-time kernel template argument and launch the kernel.
   */
  template <Dslash5Type dslash_type, bool pm, bool xpay>
  void launchDagger(TuneParam &tp, const qudaStream_t &stream)
  {
    using Arg = decltype(arg);
    if (arg.dagger) {
      launch(dslash5GPU<storage_type, nColor, true, pm, xpay, dslash_type, Arg>, tp, arg, stream);
    } else {
      launch(dslash5GPU<storage_type, nColor, false, pm, xpay, dslash_type, Arg>, tp, arg, stream);
    }
  }

  /**
     @brief Second dispatch level: turn the runtime xpay flag into a
     compile-time template argument.
   */
  template <Dslash5Type dslash_type, bool pm> void launchXpay(TuneParam &tp, const qudaStream_t &stream)
  {
    if (arg.xpay) {
      launchDagger<dslash_type, pm, true>(tp, stream);
    } else {
      launchDagger<dslash_type, pm, false>(tp, stream);
    }
  }

  /**
     @brief First dispatch level: turn the runtime eofa_pm flag into a
     compile-time template argument.
   */
  template <Dslash5Type dslash_type> void launchPm(TuneParam &tp, const qudaStream_t &stream)
  {
    if (arg.eofa_pm) {
      launchXpay<dslash_type, true>(tp, stream);
    } else {
      launchXpay<dslash_type, false>(tp, stream);
    }
  }

public:
  /**
     @brief Build the kernel arguments, compose the tuning aux string and
     launch the kernel on streams[Nstream - 1].

     All operator parameters (m_f, m_5, b_5, c_5, a, eofa_pm, inv, kappa,
     eofa_u, eofa_x, eofa_y, sherman_morrison, dagger, type) are forwarded
     unchanged to the Dslash5Arg constructor.

     @param[out] out Output field
     @param[in] in Input field; also supplies the tuning meta data and in.X(4)
     @param[in] x Auxiliary field (read by the kernel when arg.xpay is set)
     @param[in] dagger Whether the daggered kernel is launched
     @param[in] type Operator type; anything other than M5_EOFA or
     M5INV_EOFA is reported through errorQuda
   */
  Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, const double m_f,
          const double m_5, const Complex *b_5, const Complex *c_5, double a, int eofa_pm, double inv,
          double kappa, const double *eofa_u, const double *eofa_x, const double *eofa_y,
          double sherman_morrison, bool dagger, Dslash5Type type) :
    TunableVectorYZ(in.X(4), in.SiteSubset()),
    arg(out, in, x, m_f, m_5, b_5, c_5, a, eofa_pm, inv, kappa, eofa_u, eofa_x, eofa_y, sherman_morrison, dagger, type),
    meta(in)
  {
    TunableVectorY::resizeStep(arg.Ls);

    // Every flag that selects a different kernel instantiation is encoded in
    // aux, which is part of the tune key (see tuneKey()).
    strcpy(aux, meta.AuxString());
    if (arg.dagger) strcat(aux, ",Dagger");
    if (arg.xpay) strcat(aux, ",xpay");
    if (arg.eofa_pm) {
      strcat(aux, ",eofa_plus");
    } else {
      strcat(aux, ",eofa_minus");
    }
    switch (arg.type) {
    case M5_EOFA: strcat(aux, ",mobius_M5_EOFA"); break;
    case M5INV_EOFA: strcat(aux, ",mobius_M5INV_EOFA"); break;
    default: errorQuda("Unknown Dslash5Type %d", arg.type);
    }

    apply(streams[Nstream - 1]);
  }

  /**
     @brief Launch kernel f with the given tuning parameters, first
     requesting the maximum shared memory carve-out when the
     shared-memory code path is enabled.
     @param[in] f Kernel to launch
     @param[in,out] tp Launch parameters; set_max_shared_bytes may be set
     @param[in] arg Kernel argument struct (note: shadows the class member)
     @param[in] stream Stream to launch on
   */
  template <typename T, typename Arg> inline void launch(T *f, TuneParam &tp, Arg &arg, const qudaStream_t &stream)
  {
    if (shared && (arg.type == M5_EOFA || arg.type == M5INV_EOFA)) {
      // if inverse kernel uses shared memory then maximize total shared memory
      tp.set_max_shared_bytes = true;
    }
    qudaLaunchKernel(f, tp, stream, arg);
  }

  /**
     @brief Obtain launch parameters from the tuner, then launch the kernel
     instantiation selected by (type, eofa_pm, xpay, dagger).
     @param[in] stream Stream to launch on
   */
  void apply(const qudaStream_t &stream)
  {
    // The tuner is consulted before the operator type is validated.
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    switch (arg.type) {
    case M5_EOFA: launchPm<M5_EOFA>(tp, stream); break;
    case M5INV_EOFA: launchPm<M5INV_EOFA>(tp, stream); break;
    default: errorQuda("Unknown Dslash5Type %d", arg.type);
    }
  }

  /**
     @brief Initial tuning parameters: start from the TunableVectorYZ
     defaults, then fix the y block size to Ls with a single y grid block
     and size the dynamic shared memory for the resulting block.
   */
  void initTuneParam(TuneParam &param) const
  {
    TunableVectorYZ::initTuneParam(param);
    param.block.y = arg.Ls; // Ls must be contained in the block
    param.grid.y = 1;
    param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
  }

  // The default (untuned) parameters are identical to the initial ones
  void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

  // Tune key: volume string, dynamic type name of this object, and the aux string built in the constructor
  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
};
168 
// Host entry point for the fifth-dimension Mobius EOFA operator acting on a
// colorspinor field (out = Dslash5 * in).
//
// When the library is built with GPU_DOMAIN_WALL_DIRAC, the locations of
// out, in and x are checked and every operator parameter is forwarded
// unchanged to the Dslash5 class through instantiate<>; constructing that
// object is what launches the kernel. Without GPU_DOMAIN_WALL_DIRAC the call
// only reports an error through errorQuda.
void apply_dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f,
                   double m_5, const Complex *b_5, const Complex *c_5, double a, int eofa_pm, double inv,
                   double kappa, const double *eofa_u, const double *eofa_x, const double *eofa_y,
                   double sherman_morrison, bool dagger, Dslash5Type type)
{
#ifndef GPU_DOMAIN_WALL_DIRAC
  errorQuda("Mobius EOFA dslash has not been built");
#else
  // all three fields must live in the same location before dispatching
  checkLocation(out, in, x);

  instantiate<Dslash5>(out, in, x, m_f, m_5, b_5, c_5, a, eofa_pm, inv, kappa, eofa_u, eofa_x, eofa_y,
                       sherman_morrison, dagger, type);
#endif
}
184  } // namespace mobius_eofa
185 } // namespace quda