multi_reduce_quda.cu
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <color_spinor_field_order.h>
4 #include <uint_to_char.h>
5 
6 #include <launch_kernel.cuh>
7 #include <jitify_helper.cuh>
8 #include <kernels/multi_reduce_core.cuh>
9 
10 namespace quda {
11 
12  namespace blas {
13 
14  qudaStream_t* getStream();
15 
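  // The two launch() overloads below form a compile-time recursion over block sizes: the
  // non-terminal overload instantiates multiReduceKernel for its block_size and, if that
  // does not match the tuned tp.block.x, recurses with block_size - 32; the block_size == 32
  // overload terminates the recursion. Every block size from max_block_size() down to 32
  // (in steps of 32) thus gets a concrete instantiation, selected at runtime by tp.block.x.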
16  template <int block_size, typename real, int len, int NXZ, typename Arg>
17  typename std::enable_if<block_size!=32, qudaError_t>::type launch(Arg &arg, const TuneParam &tp, const qudaStream_t &stream)
18  {
19  if (tp.block.x == block_size)
20  return qudaLaunchKernel(multiReduceKernel<block_size, real, len, NXZ, Arg>, tp, stream, arg);
21  else
22  return launch<block_size - 32, real, len, NXZ>(arg, tp, stream);
23  }
24 
25  template <int block_size, typename real, int len, int NXZ, typename Arg>
26  typename std::enable_if<block_size==32, qudaError_t>::type launch(Arg &arg, const TuneParam &tp, const qudaStream_t &stream)
27  {
28  if (block_size != tp.block.x) errorQuda("Unexpected block size %d\n", tp.block.x);
29  return qudaLaunchKernel(multiReduceKernel<block_size, real, len, NXZ, Arg>, tp, stream, arg);
30  }
31 
32 #ifdef QUDA_FAST_COMPILE_REDUCE
33  constexpr static unsigned int max_block_size() { return 32; }
34 #else
35  constexpr static unsigned int max_block_size() { return 128; }
36 #endif
37 
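  // multiReduceLaunch() launches the kernel (through jitify when enabled, otherwise through
  // the static launch() chain above), completes the reduction on the host unless asynchronous
  // reduction is requested, and transposes the NYW x NXZ partial results into NXZ x NYW
  // order for the caller.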
38  template <typename real, int len, int NXZ, typename Arg, typename T>
39  void multiReduceLaunch(T result[], Arg &arg, const TuneParam &tp, const qudaStream_t &stream, Tunable &tunable)
40  {
41  using reduce_t = typename Arg::Reducer::reduce_t;
42  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
43  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
44 
45 #ifdef JITIFY
46  using namespace jitify::reflection;
47  tunable.jitifyError() = program->kernel("quda::blas::multiReduceKernel")
48  .instantiate((int)tp.block.x, Type<real>(), len, NXZ, Type<Arg>())
49  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
50  .launch(arg);
51  arg.launch_error = tunable.jitifyError() == CUDA_SUCCESS ? QUDA_SUCCESS : QUDA_ERROR;
52 #else
53  arg.launch_error = launch<max_block_size(), real, len, NXZ>(arg, tp, stream);
54 #endif
55 
56  std::vector<T> result_(NXZ * arg.NYW);
57  if (!commAsyncReduction()) arg.complete(result_, stream);
58 
 59  // transpose so the ordering matches that produced by the vector-thread reduction
60  for (int i = 0; i < NXZ; i++) {
61  for (int j = 0; j < arg.NYW; j++) {
62  result[i * arg.NYW + j] = result_[j * NXZ + i];
63  }
64  }
65 
66  }
67 
68  template <template <typename ...> class Reducer, typename store_t, typename y_store_t, int nSpin, typename T>
69  class MultiReduce : public Tunable
70  {
71  using real = typename mapper<y_store_t>::type;
72  using host_reduce_t = typename Reducer<double, real>::reduce_t;
73  const int NXZ;
74  const int NYW;
75  Reducer<device_reduce_t, real> r;
76  const int nParity;
77  const T &a, &b, &c;
78  std::vector<ColorSpinorField *> &x, &y, &z, &w;
79  host_reduce_t *result;
80  QudaFieldLocation location;
81 
82  unsigned int sharedBytesPerThread() const { return 0; }
83  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
84 
85  virtual bool advanceSharedBytes(TuneParam &param) const
86  {
87  TuneParam next(param);
88  advanceBlockDim(next); // to get next blockDim
89  int nthreads = next.block.x * next.block.y * next.block.z;
90  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
91  sharedBytesPerThread() * nthreads : sharedBytesPerBlock(param);
92  return false;
93  }
94 
95  unsigned int maxBlockSize(const TuneParam &param) const { return max_block_size(); }
96 
97  public:
98  MultiReduce(const T &a, const T &b, const T &c, const ColorSpinorField &x_meta, const ColorSpinorField &y_meta,
99  std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y,
100  std::vector<ColorSpinorField *> &z, std::vector<ColorSpinorField *> &w,
101  host_reduce_t *result) :
102  NXZ(x.size()),
103  NYW(y.size()),
104  r(NXZ, NYW),
105  nParity(x[0]->SiteSubset()),
106  a(a),
107  b(b),
108  c(c),
109  x(x),
110  y(y),
111  z(z),
112  w(w),
113  result(result),
114  location(checkLocation(*x[0], *y[0], *z[0], *w[0]))
115  {
116  checkLength(*x[0], *y[0], *z[0], *w[0]);
117  auto x_prec = checkPrecision(*x[0], *z[0], *w[0]);
118  auto y_prec = y[0]->Precision();
119  auto x_order = checkOrder(*x[0], *z[0], *w[0]);
120  auto y_order = y[0]->FieldOrder();
121  if (sizeof(store_t) != x_prec) errorQuda("Expected precision %lu but received %d", sizeof(store_t), x_prec);
122  if (sizeof(y_store_t) != y_prec) errorQuda("Expected precision %lu but received %d", sizeof(y_store_t), y_prec);
123  if (x_prec == y_prec && x_order != y_order) errorQuda("Orders %d %d do not match", x_order, y_order);
124 
125  strcpy(aux, "policy_kernel,");
126  strcat(aux, x[0]->AuxString());
127  if (x_prec != y_prec) {
128  strcat(aux, ",");
129  strcat(aux, y[0]->AuxString());
130  }
131  strcat(aux, nParity == 2 ? ",nParity=2" : ",nParity=1");
132 
133  // since block dot product and block norm use the same functors, we need to distinguish them
134  bool is_norm = false;
135  if (NXZ == NYW) {
136  is_norm = true;
137  for (int i = 0; i < NXZ; i++) {
138  if (x[i]->V() != y[i]->V() || x[i]->V() != z[i]->V() || x[i]->V() != w[i]->V()) {
139  is_norm = false;
140  break;
141  }
142  }
143  }
144  if (is_norm) strcat(aux, ",norm");
145 
146 #ifdef JITIFY
147  ::quda::create_jitify_program("kernels/multi_reduce_core.cuh");
148 #endif
149 
150  apply(*blas::getStream());
151 
152  blas::bytes += bytes();
153  blas::flops += flops();
154  }
155 
156  TuneKey tuneKey() const
157  {
158  char name[TuneKey::name_n];
159  char NXZ_str[8];
160  char NYW_str[8];
161  u32toa(NXZ_str, NXZ);
162  u32toa(NYW_str, NYW);
163  strcpy(name, "Nxz");
164  strcat(name, NXZ_str);
165  strcat(name, "Nyw");
166  strcat(name, NYW_str);
167  strcat(name, typeid(r).name());
168  return TuneKey(x[0]->VolString(), name, aux);
169  }
170 
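  // set_param() stages the host-side coefficient matrix into a temporary coeff_t buffer
  // (bounded by MAX_MATRIX_SIZE) and copies it asynchronously into the device constant-memory
  // symbol (Amatrix_d, Bmatrix_d or Cmatrix_d) that the kernel reads.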
171  template <typename buffer_t>
172  void set_param(buffer_t &d, const T &h, const qudaStream_t &stream)
173  {
174  using coeff_t = typename decltype(r)::coeff_t;
175  constexpr size_t n_coeff = MAX_MATRIX_SIZE / sizeof(coeff_t);
176 
177  coeff_t tmp[n_coeff];
178  for (int i = 0; i < NXZ; i++)
179  for (int j = 0; j < NYW; j++) tmp[NYW * i + j] = coeff_t(h.data[NYW * i + j]);
180  cudaMemcpyToSymbolAsync(d, tmp, NXZ * NYW * sizeof(decltype(tmp[0])), 0, cudaMemcpyHostToDevice, stream);
181  //cuMemcpyHtoDAsync(d, tmp, NXZ * NYW * sizeof(decltype(tmp[0])), stream);
182  }
183 
184  template <int NXZ> void compute(const qudaStream_t &stream)
185  {
186  staticCheck<NXZ, store_t, y_store_t, decltype(r)>(r, x, y);
187 
188  constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value;
189  if (site_unroll_check && (x[0]->Ncolor() != 3 || x[0]->Nspin() == 2))
190  errorQuda("site unroll not supported for nSpin = %d nColor = %d", x[0]->Nspin(), x[0]->Ncolor());
191 
192  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
193 
194  if (location == QUDA_CUDA_FIELD_LOCATION) {
195  if (site_unroll_check) checkNative(*x[0], *y[0], *z[0], *w[0]); // require native order when using site_unroll
196  using device_store_t = typename device_type_mapper<store_t>::type;
197  using device_y_store_t = typename device_type_mapper<y_store_t>::type;
198  using device_real_t = typename mapper<device_y_store_t>::type;
199  Reducer<device_reduce_t, device_real_t> r_(NXZ, NYW);
200 
201  // redefine site_unroll with device_store types to ensure we have correct N/Ny/M values
202  constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value;
203  constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
204  constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
205  constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
206  const int length = x[0]->Length() / (nParity * M);
207 
208  MultiReduceArg<NXZ, device_store_t, N, device_y_store_t, Ny, decltype(r_)> arg(x, y, z, w, r_, NYW, length, nParity, tp);
209 
210 #ifdef JITIFY
211  // need to get constants pointer from jitify instance
212  if (a.data || b.data || c.data) errorQuda("Constant memory buffer support not enabled with jitify yet");
213 #else
214  if (a.data) set_param(Amatrix_d, a, stream);
215  if (b.data) set_param(Bmatrix_d, b, stream);
216  if (c.data) set_param(Cmatrix_d, c, stream);
217 #endif
218  multiReduceLaunch<device_real_t, M, NXZ>(result, arg, tp, stream, *this);
219  } else {
220  errorQuda("Only implemented for GPU fields");
221  }
222  }
223 
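  // compute() takes NXZ as a template parameter, so the runtime NXZ must be mapped onto a
  // compile-time constant. The instantiate helpers below do this by recursive template
  // dispatch: instantiatePow2 walks down through powers of two, while instantiateLinear
  // walks down one value at a time as the general fallback (see apply()).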
224  template <int n> typename std::enable_if<n!=1, void>::type instantiateLinear(const qudaStream_t &stream)
225  {
226  if (NXZ == n) compute<n>(stream);
227  else instantiateLinear<n-1>(stream);
228  }
229 
230  template <int n> typename std::enable_if<n==1, void>::type instantiateLinear(const qudaStream_t &stream)
231  {
232  compute<1>(stream);
233  }
234 
235  template <int n> typename std::enable_if<n!=1, void>::type instantiatePow2(const qudaStream_t &stream)
236  {
237  if (NXZ == n) compute<n>(stream);
238  else instantiatePow2<n/2>(stream);
239  }
240 
241  template <int n> typename std::enable_if<n==1, void>::type instantiatePow2(const qudaStream_t &stream)
242  {
243  compute<1>(stream);
244  }
245 
246  void apply(const qudaStream_t &stream)
247  {
248  constexpr int pow2_max = max_NXZ_power2<true, isFixed<store_t>::value>();
249  if (NXZ <= pow2_max && is_power2(NXZ)) instantiatePow2<pow2_max>(stream);
250  else if (NXZ <= MAX_MULTI_BLAS_N) instantiateLinear<MAX_MULTI_BLAS_N>(stream);
251  else errorQuda("x.size %lu greater than MAX_MULTI_BLAS_N %d", x.size(), MAX_MULTI_BLAS_N);
252  }
253 
254  bool advanceGridDim(TuneParam &param) const
255  {
256  bool rtn = Tunable::advanceGridDim(param);
 257  if (NYW > deviceProp.maxGridSize[1]) errorQuda("N=%d is greater than the maximum supported grid size", NYW);
258  return rtn;
259  }
260 
261  void initTuneParam(TuneParam &param) const
262  {
263  Tunable::initTuneParam(param);
264  param.block.y = 1;
265  param.grid.y = NYW;
266  }
267 
268  void defaultTuneParam(TuneParam &param) const
269  {
270  Tunable::defaultTuneParam(param);
271  param.block.y = 1;
272  param.grid.y = NYW;
273  }
274 
275  void preTune()
276  {
277  for (int i = 0; i < NYW; ++i) {
278  if (r.write.X) x[i]->backup();
279  if (r.write.Y) y[i]->backup();
280  if (r.write.Z) z[i]->backup();
281  if (r.write.W) w[i]->backup();
282  }
283  }
284 
285  void postTune()
286  {
287  for (int i = 0; i < NYW; ++i) {
288  if (r.write.X) x[i]->restore();
289  if (r.write.Y) y[i]->restore();
290  if (r.write.Z) z[i]->restore();
291  if (r.write.W) w[i]->restore();
292  }
293  }
294 
295  long long flops() const
296  {
297  return NYW * NXZ * r.flops() * x[0]->Length();
298  }
299 
300  long long bytes() const
301  {
302  // X and Z reads are repeated (and hopefully cached) across NYW
303  // each Y and W read/write is done once
304  return NYW * NXZ * (r.read.X + r.write.X) * x[0]->Bytes() +
305  NYW * (r.read.Y + r.write.Y) * y[0]->Bytes() +
306  NYW * NXZ * (r.read.Z + r.write.Z) * z[0]->Bytes() +
307  NYW * (r.read.W + r.write.W) * w[0]->Bytes();
308  }
309 
310  int tuningIter() const { return 3; }
311  };
312 
313  template <template <typename ...> class ReducerDiagonal, template <typename ...> class ReducerOffDiagonal, typename T>
314  void multiReduce(T result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
315  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
316  CompositeColorSpinorField &w, int i, int j)
317  {
318  if (i == j) { // we are on the diagonal so invoke the diagonal reducer
319  using host_reduce_t = typename ReducerDiagonal<double, double>::reduce_t;
320  instantiate<ReducerDiagonal, MultiReduce, true>(a, b, c, *x[0], *y[0], x, y, z, w, (host_reduce_t*)result);
 321  } else { // we are off the diagonal so invoke the off-diagonal reducer
322  using host_reduce_t = typename ReducerOffDiagonal<double, double>::reduce_t;
323  instantiate<ReducerOffDiagonal, MultiReduce, true>(a, b, c, *x[0], *y[0], x, y, z, w, (host_reduce_t*)result);
324  }
325  }
326 
327  // This function does the outer product of dot products... in column major.
328  // There's a function below called 'cDotProduct' that flips it to row major.
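  // Recursion strategy: if y is larger than the tile height, split y in half and recurse on
  // each half. Otherwise, if x fits within the tile width and the whole problem fits within
  // max_n_reduce(), run a single multi-reduction kernel; if not, split x (together with z
  // and w) in half, recurse on each half, and merge the two partial results back into the
  // output array.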
329  template <template <typename ...> class ReducerDiagonal,
330  template <typename ...> class ReducerOffDiagonal, typename T>
331  void multiReduce_recurse(T *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y,
332  std::vector<ColorSpinorField *> &z, std::vector<ColorSpinorField *> &w, int i_idx,
333  int j_idx, bool hermitian, uint2 tile_size)
334  {
335  if (y.size() > tile_size.y) { // if greater than max single-kernel size, split and recurse
336  // Do the recurse first.
337  T* result0 = &result[0];
338  T* result1 = &result[x.size()*(y.size()/2)];
339  std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
340  std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
341  multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result0, x, y0, z, w, i_idx, 2*j_idx+0, hermitian, tile_size);
342  multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result1, x, y1, z, w, i_idx, 2*j_idx+1, hermitian, tile_size);
343  } else {
344  T* tmp_dot = new T[x.size()*y.size()];
345 
 346  // if at the bottom of the recursion and on the strictly lower-left of a hermitian problem, skip the computation
 347  if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && hermitian) {
 348  if (j_idx < i_idx) { delete[] tmp_dot; return; } // free the temporary before the early return
 349  }
350 
351  coeff_array<T> a, b, c;
352 
353  if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && x.size() * y.size() <= (unsigned int)max_n_reduce()) {
354  // problem will fit, so do the computation
355  multiReduce<ReducerDiagonal, ReducerOffDiagonal>(tmp_dot, a, b, c, x, y, z, w, i_idx, j_idx);
356  } else {
357  // split the problem and recurse. Splitting in x requires
358  // memory reshuffling (unless y = 1).
359  // Use a few temporary variables.
360 
361  T* tmpmajor = new T[x.size()*y.size()];
362  T* result0 = &tmpmajor[0];
363  T* result1 = &tmpmajor[(x.size()/2)*y.size()];
364  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
365  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
366  std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
367  std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
368  std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
369  std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
370 
371  multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result0, x0, y, z0, w0, 2*i_idx+0, j_idx, hermitian, tile_size);
372  multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result1, x1, y, z1, w1, 2*i_idx+1, j_idx, hermitian, tile_size);
373 
374  const unsigned int xlen0 = x.size()/2;
375  const unsigned int xlen1 = x.size() - xlen0;
376  const unsigned int ylen = y.size();
377 
378  // Copy back into result.
379  int count = 0, count0 = 0, count1 = 0;
380  for (unsigned int i = 0; i < ylen; i++)
381  {
382  for (unsigned int j = 0; j < xlen0; j++)
383  result[count++] = result0[count0++];
384  for (unsigned int j = 0; j < xlen1; j++)
385  result[count++] = result1[count1++];
386  }
387 
388  delete[] tmpmajor;
389  }
390 
 391  // we are at the leaf of the binary tree (i.e., we ran the kernel): perform the row-to-column-major transpose here.
392  if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && x.size() * y.size() <= (unsigned int)max_n_reduce()) {
393  const unsigned int xlen = x.size();
394  const unsigned int ylen = y.size();
395  for (unsigned int j = 0; j < xlen; j++)
396  for (unsigned int i = 0; i < ylen; i++)
397  result[i*xlen+j] = tmp_dot[j*ylen + i];
398  }
399  delete[] tmp_dot;
400  }
401  }
402 
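  // TileSizeTune is a policy tuner: rather than tuning launch geometry, it tunes the tile
  // size handed to multiReduce_recurse. The candidate tile lives in the aux field of
  // TuneParam (aux.x, aux.y); advanceAux() steps through powers of two up to max_tile_size
  // and finally tries a single untiled kernel if the whole problem fits.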
403  template <template <typename ...> class ReducerDiagonal,
404  template <typename ...> class ReducerOffDiagonal, typename T>
405  class TileSizeTune : public Tunable
406  {
407  typedef std::vector<ColorSpinorField*> vec;
408  T *result;
409  vec &x, &y, &z, &w;
410  bool hermitian;
411  bool Anorm;
412 
413  unsigned int sharedBytesPerThread() const { return 0; }
414  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
415 
416  int NYW_max;
417  uint2 max_tile_size;
418 
419  public:
420  TileSizeTune(T *result, vec &x, vec &y, vec &z, vec &w, int coeff_width, bool hermitian, bool Anorm = false,
421  bool nested_policy = false) :
422  result(result),
423  x(x),
424  y(y),
425  z(z),
426  w(w),
427  hermitian(hermitian),
428  Anorm(Anorm)
429  {
430  NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
431  max_tile_size = make_uint2(1, 1);
432 
433  strcpy(aux, nested_policy ? "nested_policy," : "policy,");
434  strcat(aux, x[0]->AuxString());
435  strcat(aux, ",");
436  strcat(aux, y[0]->AuxString());
437  if (hermitian) strcat(aux, ",hermitian");
438  if (Anorm) strcat(aux, ",Anorm");
439  strcat(aux,",n=");
440  char size[8];
441  u64toa(size, x.size());
442  strcat(aux,size);
443  strcat(aux,",m=");
444  u64toa(size, y.size());
445  strcat(aux,size);
446  u64toa(size, MAX_MULTI_BLAS_N);
447  strcat(aux, ",multi-blas-n=");
448  strcat(aux, size);
449 
450  // before we do policy tuning we must ensure the kernel
451  // constituents have been tuned since we can't do nested tuning
452  // FIXME this will break if the kernels are destructive - which they aren't here
453  if (!tuned()) {
454  if (!nested_policy) disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
455 
 456  // note the 1-d tuning is all redundant now that we call
 457  // multiReduce_recurse directly for 1-d multi
 458  // reductions, but I'll keep this code here for now
459  if (x.size() == 1) { // 1-d reduction
460 
461  max_tile_size = make_uint2(1, std::min(NYW_max, (int)y.size()));
462  multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, max_tile_size);
463 
464  } else if (y.size() == 1) { // 1-d reduction
465 
466  max_tile_size = make_uint2(std::min((size_t)max_NXZ_power2(true), x.size()), 1);
467  multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, max_tile_size);
468 
469  } else { // 2-d reduction
470 
471  // max_tile_size should be set to the largest power of 2,
472  // since we have a requirement that the tile size is a
473  // power of 2.
474  // FIXME - we only do simple square tiling here
475  max_tile_size = make_uint2(max_NXZ_power2(true), max_NXZ_power2(true));
476 
477  // Make sure constituents are tuned.
478  for (unsigned int tile_size = 1;
479  tile_size <= max_tile_size.x && tile_size <= x.size() && (tile_size <= y.size() || y.size() == 1);
480  tile_size *= 2) {
481  multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(tile_size, tile_size));
482  }
483 
484  // also test case using a single kernel if both dimensions are less than max
485  if (is_valid_NXZ(x.size(), true) && y.size() <= (unsigned int)NYW_max) {
486  multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(x.size(), y.size()));
487  }
488  }
489 
490  if (!nested_policy) enableProfileCount();
491  setPolicyTuning(true);
492  }
493  }
494 
495  virtual ~TileSizeTune() { setPolicyTuning(false); }
496 
497  void apply(const qudaStream_t &stream) {
498  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
499 
500  // tp.aux.x is where the tile size is stored. "tp" is the tuning struct.
501  // it contains blocksize, grid size, etc. Since we're only tuning
502  // a policy, we don't care about those sizes. That's why we only
503  // tune "aux.x", which is the tile size.
504  multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(tp.aux.x, tp.aux.y));
505  }
506 
507  // aux.x is the tile size
508  bool advanceAux(TuneParam &param) const
509  {
510  // for 1-d reductions we don't do any tuning and just use the largest tile
511  if (x.size() == 1 || y.size() == 1) {
512  return false;
513  } else { // 2-d reduction
514 
515  if ((unsigned int)(2 * param.aux.x) <= max_tile_size.x && (unsigned int)(2 * param.aux.y) <= max_tile_size.y
516  && (unsigned int)(2 * param.aux.x) <= x.size() && (unsigned int)(2 * param.aux.y) <= y.size()) {
517  // only tune powers of two
518  param.aux.x *= 2;
519  param.aux.y *= 2;
520  return true;
521  } else if (is_valid_NXZ(x.size(), true) && y.size() <= (size_t)NYW_max
522  && ((size_t)param.aux.x != x.size() || (size_t)param.aux.y != y.size())) {
523  // we've run out of power of two tiles to try, but before
524  // we finish, try a single kernel if it fits
525  param.aux.x = x.size();
526  param.aux.y = y.size();
527  return true;
528  } else {
529  // reset to the beginning (which we'd need for multi-dimensional tuning)
530  param.aux.x = 1;
531  param.aux.y = 1;
532  return false;
533  }
534  }
535  }
536 
537  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
538 
539  void initTuneParam(TuneParam &param) const {
540  Tunable::initTuneParam(param);
541  if (x.size() == 1 || y.size() == 1) {
542  param.aux.x = max_tile_size.x;
543  param.aux.y = max_tile_size.y;
544  } else { // only do non-trivial tuning for 2-d reductions
545  param.aux.x = 1;
546  param.aux.y = 1;
547  }
548  param.aux.z = 0;
549  param.aux.w = 0;
550  }
551 
552  void defaultTuneParam(TuneParam &param) const {
553  Tunable::defaultTuneParam(param); // default is max tile size
554  param.aux.x = max_tile_size.x;
555  param.aux.y = max_tile_size.y;
556  param.aux.z = 0;
557  param.aux.w = 0;
558  }
559 
560  TuneKey tuneKey() const {
561  return TuneKey(x[0]->VolString(), typeid(*this).name(), aux);
562  }
563 
564  long long flops() const { return 0; } // FIXME
565  long long bytes() const { return 0; } // FIXME
566 
567  void preTune() { } // FIXME - use write to determine what needs to be saved
568  void postTune() { } // FIXME - use write to determine what needs to be saved
569  };
570 
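  // TransposeTune is a second-level policy tuner used when x and y have the same precision:
  // aux.x selects between computing the inner products directly (aux.x == 0) or computing
  // the transposed problem with x and y swapped and conjugate-transposing the result
  // afterwards (aux.x == 1), keeping whichever ordering tunes faster.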
571  template <template <typename ...> class ReducerDiagonal,
572  template <typename ...> class ReducerOffDiagonal, typename T>
573  class TransposeTune : public Tunable
574  {
575  using TileTuner = TileSizeTune<ReducerDiagonal, ReducerOffDiagonal, T>;
576  using vec = std::vector<ColorSpinorField *>;
577  T *result;
578  vec &x, &y;
579  int coeff_width;
580  bool hermitian;
581  bool Anorm;
582 
583  unsigned int sharedBytesPerThread() const { return 0; }
584  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
585 
586  public:
587  TransposeTune(T *result, vec &x, vec &y, int coeff_width, bool hermitian, bool Anorm = false) :
588  result(result),
589  x(x),
590  y(y),
591  coeff_width(coeff_width),
592  hermitian(hermitian),
593  Anorm(Anorm)
594  {
595  strcpy(aux, "policy,");
596  strcat(aux, x[0]->AuxString());
597  strcat(aux, ",");
598  strcat(aux, y[0]->AuxString());
599  if (hermitian) strcat(aux, ",hermitian");
600  if (Anorm) strcat(aux, ",Anorm");
601  strcat(aux, ",n=");
602  char size[8];
603  u64toa(size, x.size());
604  strcat(aux, size);
605  strcat(aux, ",m=");
606  u64toa(size, y.size());
607  strcat(aux, size);
608  u64toa(size, MAX_MULTI_BLAS_N);
609  strcat(aux, ",multi-blas-n=");
610  strcat(aux, size);
611 
612  // before we do policy tuning we must ensure the kernel
613  // constituents have been tuned since we can't do nested tuning
614  if (!tuned()) {
615  disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
616 
 617  // note the 1-d tuning is all redundant now that we call
 618  // multiReduce_recurse directly for 1-d multi
 619  // reductions, but I'll keep this code here for now
620  if (x.size() == 1) {
621  TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
622  tile.apply(0);
623  } else if (y.size() == 1) {
624  TileTuner tile(result, y, x, y, y, coeff_width, hermitian, Anorm, true);
625  tile.apply(0);
626  } else {
627 
628  { // tune regular inner product
629  TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
630  tile.apply(0);
631  }
632 
633  { // tune transpose inner product
634  TileTuner tile(result, y, x, y, y, coeff_width, hermitian, Anorm, true);
635  tile.apply(0);
636  }
637  }
638 
639  enableProfileCount();
640  setPolicyTuning(true);
641  }
642  }
643 
644  virtual ~TransposeTune() { setPolicyTuning(false); }
645 
646  void apply(const qudaStream_t &stream)
647  {
648  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
649 
650  if (tp.aux.x == 0) {
651  TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
652  tile.apply(stream);
653  } else if (tp.aux.x == 1) {
654  T *result_trans = new T[x.size() * y.size()];
655 
 656  // swap x<->y and w<->z when doing the transpose calculation
657  TileTuner tile(result_trans, y, x, y, y, coeff_width, hermitian, Anorm, true);
658  tile.apply(stream);
659 
 660  // transpose the result if we are doing the transpose calculation
661  const auto xlen = x.size();
662  const auto ylen = y.size();
663  for (unsigned int j = 0; j < xlen; j++)
664  for (unsigned int i = 0; i < ylen; i++) result[i * xlen + j] = conj(result_trans[j * ylen + i]);
665 
666  delete[] result_trans;
667  } else {
668  errorQuda("Unexpected transpose parameter %d", tp.aux.x);
669  }
670  }
671 
672  bool advanceAux(TuneParam &param) const
673  {
674  if (x.size() == 1 || y.size() == 1) {
675  return false;
676  } else {
677  if (param.aux.x == 0) {
678  param.aux.x = 1;
679  return true;
680  } else {
681  param.aux.x = 0;
682  return false;
683  }
684  }
685  }
686 
687  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
688 
689  void initTuneParam(TuneParam &param) const
690  {
691  Tunable::initTuneParam(param);
692  if (x.size() == 1)
693  param.aux = make_int4(0, 0, 0, 0);
694  else if (y.size() == 1)
695  param.aux = make_int4(1, 0, 0, 0);
696  else
697  param.aux = make_int4(0, 0, 0, 0); // default is not to transpose
698  }
699 
700  void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }
701 
702  TuneKey tuneKey() const { return TuneKey(x[0]->VolString(), typeid(*this).name(), aux); }
703 
704  long long flops() const { return 0; } // FIXME
705  long long bytes() const { return 0; } // FIXME
706 
707  void preTune() {} // FIXME - use write to determine what needs to be saved
708  void postTune() {} // FIXME - use write to determine what needs to be saved
709  };
710 
711  void reDotProduct(double *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
712  {
713  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
714  double *result_tmp = new double[x.size() * y.size()];
715  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
716  int coeff_width = 0;
717 
718  if (x.size() == 1) {
719  int NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
720  // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
721  uint2 max_tile_size = make_uint2(1, std::min( {NYW_max, (int)y.size(), x[0]->Ncolor() == 3 ? 32 : NYW_max} ));
722  multiReduce_recurse<multiDot, multiDot>(result_tmp, x, y, x, x, 0, 0, false, max_tile_size);
723  } else if (y.size() == 1 && x[0]->Precision() == y[0]->Precision()) {
724 
725  double *result_trans = new double[x.size() * y.size()];
726 
 727  // swap x<->y and w<->z when doing the transpose calculation
728  int NXZ_max = max_YW_size(y.size(), y[0]->Precision(), x[0]->Precision(), false, false, coeff_width, true);
729  // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
730  uint2 max_tile_size = make_uint2(1, std::min( {NXZ_max, (int)x.size(), x[0]->Ncolor() == 3 ? 32 : NXZ_max} ));
731  multiReduce_recurse<multiDot, multiDot>(result_trans, y, x, y, y, 0, 0, false, max_tile_size);
732 
733  // transpose the result if we are doing the transpose calculation
734  const auto xlen = x.size();
735  const auto ylen = y.size();
736  for (unsigned int j = 0; j < xlen; j++)
737  for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = result_trans[j * ylen + i];
738 
739  delete[] result_trans;
740 
741  } else if (x[0]->Precision() == y[0]->Precision()) {
742  TransposeTune<multiDot, multiDot, double> trans(result_tmp, x, y, coeff_width, false);
743  trans.apply(0);
744  } else {
745  TileSizeTune<multiDot, multiDot, double> tile(result_tmp, x, y, x, x, coeff_width, false);
746  tile.apply(0);
747  }
748 
749  // do a single multi-node reduction only once we have computed all local dot products
750  const int Nreduce = x.size() * y.size();
751  reduceDoubleArray(result_tmp, Nreduce);
752 
753  // multiReduce_recurse returns a column-major matrix.
754  // To be consistent with the multi-blas functions, we should
755  // switch this to row-major.
756  const unsigned int xlen = x.size();
757  const unsigned int ylen = y.size();
758  for (unsigned int j = 0; j < xlen; j++)
759  for (unsigned int i = 0; i < ylen; i++) result[j * ylen + i] = result_tmp[i * xlen + j];
760 
761  delete[] result_tmp;
762  }
763 
764  void cDotProduct(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
765  {
766  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
767  Complex *result_tmp = new Complex[x.size() * y.size()];
768  for (unsigned int i = 0; i < x.size() * y.size(); i++) result_tmp[i] = 0.0;
769  int coeff_width = 0;
770 
771  if (x.size() == 1) {
772  int NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
773  // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
774  uint2 max_tile_size = make_uint2(1, std::min( {NYW_max, (int)y.size(), x[0]->Ncolor() == 3 ? 32 : NYW_max} ));
775  multiReduce_recurse<multiCdot, multiCdot>(result_tmp, x, y, x, x, 0, 0, false, max_tile_size);
776  } else if (y.size() == 1 && x[0]->Precision() == y[0]->Precision()) {
777 
778  Complex *result_trans = new Complex[x.size() * y.size()];
779 
 780  // swap x<->y and w<->z when doing the transpose calculation
781  int NXZ_max = max_YW_size(y.size(), y[0]->Precision(), x[0]->Precision(), false, false, coeff_width, true);
782  // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
783  uint2 max_tile_size = make_uint2(1, std::min( {NXZ_max, (int)x.size(), x[0]->Ncolor() == 3 ? 32 : NXZ_max} ));
784  multiReduce_recurse<multiCdot, multiCdot>(result_trans, y, x, y, y, 0, 0, false, max_tile_size);
785 
786  // transpose the result if we are doing the transpose calculation
787  const auto xlen = x.size();
788  const auto ylen = y.size();
789  for (unsigned int j = 0; j < xlen; j++)
790  for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = conj(result_trans[j * ylen + i]);
791 
792  delete[] result_trans;
793 
794  } else if (x[0]->Precision() == y[0]->Precision()) {
795  TransposeTune<multiCdot, multiCdot, Complex> trans(result_tmp, x, y, coeff_width, false);
796  trans.apply(0);
797  } else {
798  TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, false);
799  tile.apply(0);
800  }
801 
802  // do a single multi-node reduction only once we have computed all local dot products
803  const int Nreduce = 2*x.size()*y.size();
804  reduceDoubleArray((double*)result_tmp, Nreduce);
805 
806  // multiReduce_recurse returns a column-major matrix.
807  // To be consistent with the multi-blas functions, we should
808  // switch this to row-major.
809  const unsigned int xlen = x.size();
810  const unsigned int ylen = y.size();
811  for (unsigned int j = 0; j < xlen; j++)
812  for (unsigned int i = 0; i < ylen; i++)
813  result[j*ylen+i] = result_tmp[i*xlen + j];
814 
815  delete[] result_tmp;
816  }
817 
818  void hDotProduct(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
819  {
820  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
821  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs");
822 
823  Complex* result_tmp = new Complex[x.size()*y.size()];
824  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
825 
826  int coeff_width = 0;
827  TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, true, false); // last false is b/c L2 norm
828  tile.apply(0);
829 
830  // do a single multi-node reduction only once we have computed all local dot products
831  const int Nreduce = 2*x.size()*y.size();
832  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
833 
834  // Switch from col-major to row-major
835  const unsigned int xlen = x.size();
836  const unsigned int ylen = y.size();
837  for (unsigned int j = 0; j < xlen; j++)
838  for (unsigned int i = j; i < ylen; i++) {
839  result[j*ylen+i] = result_tmp[i*xlen + j];
840  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
841  }
842 
843  delete[] result_tmp;
844  }
845 
846  // for (p, Ap) norms in CG which are Hermitian.
847  void hDotProduct_Anorm(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
848  {
849  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
850  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs");
851 
852  Complex* result_tmp = new Complex[x.size()*y.size()];
853  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
854 
855  int coeff_width = 0;
856  TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, true, true); // last true is b/c A norm
857  tile.apply(0);
858 
859  // do a single multi-node reduction only once we have computed all local dot products
860  const int Nreduce = 2*x.size()*y.size();
861  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
862 
863  // Switch from col-major to row-major
864  const unsigned int xlen = x.size();
865  const unsigned int ylen = y.size();
866  for (unsigned int j = 0; j < xlen; j++)
867  for (unsigned int i = j; i < ylen; i++) {
868  result[j*ylen+i] = result_tmp[i*xlen + j];
869  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
870  }
871 
872  delete[] result_tmp;
873  }
874 
 875  // takes the outer product of inner products between x and y and copies y into z
876  void cDotProductCopy(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
877  std::vector<ColorSpinorField*>&z){
878 
879 #if 0
880  // FIXME - if this is enabled we need to ensure that use_w is
881  // enabled above. Also, I think this might break if the diagonal
882  // write is different from the off-diagonal write
883  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
884  if (y.size() != z.size()) errorQuda("Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());
885 
886  Complex* result_tmp = new Complex[x.size()*y.size()];
887  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
888 
889  int coeff_width = 0;
890  // When recursing, only the diagonal tiles will do the copy, the rest just do the outer product
891  TileSizeTune<double2, typename vector<device_reduce_t,2>::type,multiCdotCopy,multiCdot,Complex> tile(result_tmp, x, y, x, y, coeff_width, true);
892  tile.apply(0);
893 
894  // do a single multi-node reduction only once we have computed all local dot products
895  const int Nreduce = 2*x.size()*y.size();
896  reduceDoubleArray((double*)result_tmp, Nreduce);
897 
898  // Switch from col-major to row-major.
899  const unsigned int xlen = x.size();
900  const unsigned int ylen = y.size();
901  for (unsigned int j = 0; j < xlen; j++)
902  for (unsigned int i = 0; i < ylen; i++)
903  result[j*ylen+i] = result_tmp[i*xlen + j];
904 
905  delete[] result_tmp;
906 #else
907  errorQuda("cDotProductCopy not enabled");
908 #endif
909  }
910 
911  } // namespace blas
912 
913 } // namespace quda
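
Example usage (an illustrative sketch, not part of this file; the field setup and the names basis and rhs are assumptions, while the blas entry points are the ones defined above):

 // Sketch: compute block inner products with the entry points defined in this file.
 // Assumes the relevant QUDA headers are included and the fields are already constructed.
 #include <vector>

 void example_block_dot(std::vector<quda::ColorSpinorField *> &basis,
                        std::vector<quda::ColorSpinorField *> &rhs)
 {
   // row-major output: result[i * rhs.size() + j] = (basis[i], rhs[j])
   std::vector<quda::Complex> result(basis.size() * rhs.size());
   quda::blas::cDotProduct(result.data(), basis, rhs);

   // Hermitian Gram matrix of the basis, filled using (x_i, x_j) = conj((x_j, x_i))
   std::vector<quda::Complex> gram(basis.size() * basis.size());
   quda::blas::hDotProduct(gram.data(), basis, basis);
 }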