QUDA  1.0.0
multi_reduce_quda.cu
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
5 #include <uint_to_char.h>
6 
7 #include <launch_kernel.cuh>
8 #include <jitify_helper.cuh>
9 #include <kernels/multi_reduce_core.cuh>
10 
11 // workaround for Fermi
12 #if (__COMPUTE_CAPABILITY__ < 300)
13 #undef MAX_MULTI_BLAS_N
14 #define MAX_MULTI_BLAS_N 2
15 #endif
16 
17 namespace quda {
18 
19  namespace blas {
20 
21  cudaStream_t* getStream();
22  cudaEvent_t* getReduceEvent();
23  bool getFastReduce();
24  void initFastReduce(int words);
25  void completeFastReduce(int32_t words);
26 
27  template <int writeX, int writeY, int writeZ, int writeW>
28  struct write {
29  static constexpr int X = writeX;
30  static constexpr int Y = writeY;
31  static constexpr int Z = writeZ;
32  static constexpr int W = writeW;
33  };
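 // The write<> policy packs four compile-time flags saying which of the X/Y/Z/W spinor
 // arguments a kernel may write back.  Illustrative instantiations, matching the uses
 // further down in this file:
 //   write<0, 0, 0, 0>  -> pure reduction, nothing is written (block dot products)
 //   write<0, 0, 0, 1>  -> only W is written (the dot-product-with-copy variant)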
34 
35  template <typename doubleN, typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
36  void multiReduceLaunch(doubleN result[], Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
37  {
38 
39  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
40  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
41 
42  const int32_t words = tp.grid.z * NXZ * arg.NYW * sizeof(ReduceType) / sizeof(int32_t);
43  if (getFastReduce() && !commAsyncReduction()) initFastReduce(words);
44 
45 #ifdef WARP_MULTI_REDUCE
46 #error "Untested - should be reverified"
47  // multiReduceKernel<ReduceType,FloatN,M,NXZ><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
48 #else
49 #ifdef JITIFY
50  using namespace jitify::reflection;
51  tunable.jitifyError() = program->kernel("quda::blas::multiReduceKernel")
52  .instantiate((int)tp.block.x, Type<ReduceType>(), Type<FloatN>(), M, NXZ, Type<Arg>())
53  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
54  .launch(arg);
55 #else
56 #if CUDA_VERSION < 9000
57  cudaMemcpyToSymbolAsync(arg_buffer, reinterpret_cast<char *>(&arg), sizeof(arg), 0, cudaMemcpyHostToDevice,
58  *getStream());
59 #endif
60  LAUNCH_KERNEL_LOCAL_PARITY(multiReduceKernel, tp, stream, arg, ReduceType, FloatN, M, NXZ);
61 #endif
62 #endif
63 
64  if (!commAsyncReduction()) {
65 #if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__))
66  if (deviceProp.canMapHostMemory) {
67  if (getFastReduce()) {
68  completeFastReduce(words);
69  } else {
70  qudaEventRecord(*getReduceEvent(), stream);
71  while (cudaSuccess != qudaEventQuery(*getReduceEvent())) {}
72  }
73  } else
74 #endif
75  {
76  qudaMemcpy(getHostReduceBuffer(), getMappedHostReduceBuffer(), tp.grid.z * sizeof(ReduceType) * NXZ * arg.NYW,
77  cudaMemcpyDeviceToHost);
78  }
79  }
80 
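 // The host reduce buffer holds the partial results laid out as
 // [parity (tp.grid.z)][j < NYW][i < NXZ]; the loop below transposes this into
 // result[i * NYW + j] and, when the two parities were reduced separately
 // (tp.grid.z == 2), accumulates the second parity onto the first.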
81  // need to transpose for same order with vector thread reduction
82  for (int i = 0; i < NXZ; i++) {
83  for (int j = 0; j < arg.NYW; j++) {
84  result[i * arg.NYW + j] = set(((ReduceType *)getHostReduceBuffer())[j * NXZ + i]);
85  if (tp.grid.z == 2)
86  sum(result[i * arg.NYW + j], ((ReduceType *)getHostReduceBuffer())[NXZ * arg.NYW + j * NXZ + i]);
87  }
88  }
89  }
90 
91  namespace detail
92  {
93  template <unsigned... digits> struct to_chars {
94  static const char value[];
95  };
96 
97  template <unsigned... digits> const char to_chars<digits...>::value[] = {('0' + digits)..., 0};
98 
99  template <unsigned rem, unsigned... digits> struct explode : explode<rem / 10, rem % 10, digits...> {
100  };
101 
102  template <unsigned... digits> struct explode<0, digits...> : to_chars<digits...> {
103  };
104  } // namespace detail
105 
106  template <unsigned num> struct num_to_string : detail::explode<num / 10, num % 10> {
107  };
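 // num_to_string<N>::value is the decimal string for N built at compile time,
 // e.g. num_to_string<12>::value is the character array "12".  It is used below to
 // embed NXZ into the tune key without a run-time conversion.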
108 
109  template <int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX,
110  typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
111  class MultiReduceCuda : public Tunable
112  {
113 
114  private:
115  const int NYW;
116  int nParity;
117  MultiReduceArg<NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer> arg;
118  doubleN *result;
119 
120  std::vector<ColorSpinorField *> &x, &y, &z, &w;
121 
122  // don't curry into the Spinors to minimize parameter size
123  char *Y_h[MAX_MULTI_BLAS_N], *W_h[MAX_MULTI_BLAS_N], *Ynorm_h[MAX_MULTI_BLAS_N], *Wnorm_h[MAX_MULTI_BLAS_N];
124 
125  unsigned int sharedBytesPerThread() const { return 0; }
126  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
127 
128  virtual bool advanceSharedBytes(TuneParam &param) const
129  {
130  TuneParam next(param);
131  advanceBlockDim(next); // to get next blockDim
132  int nthreads = next.block.x * next.block.y * next.block.z;
133  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
134  sharedBytesPerThread() * nthreads :
135  sharedBytesPerBlock(param);
136  return false;
137  }
138 
139  // we only launch thread blocks up to size 512 since the autotuner
140  // favours smaller blocks and this helps with compile time
141  unsigned int maxBlockSize(const TuneParam &param) const { return deviceProp.maxThreadsPerBlock / 2; }
142 
143  public:
144  MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r,
145  std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
146  std::vector<ColorSpinorField *> &w, int NYW, int length) :
147  NYW(NYW),
148  nParity(x[0]->SiteSubset()),
149  arg(X, Y, Z, W, r, NYW, length / nParity),
150  x(x),
151  y(y),
152  z(z),
153  w(w),
154  result(result),
155  Y_h(),
156  W_h(),
157  Ynorm_h(),
158  Wnorm_h()
159  {
160  strcpy(aux, "policy_kernel,");
161  strcat(aux, x[0]->AuxString());
162  if (getFastReduce()) strcat(aux, ",fast_reduce");
163 
164  // since block dot product and block norm use the same functors, we need to distinguish them
165  bool is_norm = false;
166  if (NXZ == NYW) {
167  is_norm = true;
168  for (int i = 0; i < NXZ; i++) {
169  if (x[i]->V() != y[i]->V() || x[i]->V() != z[i]->V() || x[i]->V() != w[i]->V()) {
170  is_norm = false;
171  break;
172  }
173  }
174  }
175  if (is_norm) strcat(aux, ",norm");
176 
177 #ifdef JITIFY
178  ::quda::create_jitify_program("kernels/multi_reduce_core.cuh");
179 #endif
180  }
181 
182  inline TuneKey tuneKey() const
183  {
184  char name[TuneKey::name_n];
185  strcpy(name, num_to_string<NXZ>::value);
186  strcat(name, std::to_string(NYW).c_str());
187  strcat(name, typeid(arg.r).name());
188  return TuneKey(x[0]->VolString(), name, aux);
189  }
190 
191  void apply(const cudaStream_t &stream)
192  {
193  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
194  multiReduceLaunch<doubleN, ReduceType, FloatN, M, NXZ>(result, arg, tp, stream, *this);
195  }
196 
197  // Should these be NYW?
198 #ifdef WARP_MULTI_REDUCE
199 
206  bool advanceBlockDim(TuneParam &param) const
207  {
208  if (param.block.y < NYW) {
209  param.block.y++;
210  param.grid.y = (NYW + param.block.y - 1) / param.block.y;
211  return true;
212  } else {
213  param.block.y = 1;
214  param.grid.y = NYW;
215  return false;
216  }
217  }
218 #endif
219 
220  bool advanceGridDim(TuneParam &param) const
221  {
222  bool rtn = Tunable::advanceGridDim(param);
223  if (NYW > deviceProp.maxGridSize[1]) errorQuda("N=%d is greater than the maximum supported grid size", NYW);
224  return rtn;
225  }
226 
227  void initTuneParam(TuneParam &param) const
228  {
229  Tunable::initTuneParam(param);
230  param.block.y = 1;
231  param.grid.y = NYW;
232  param.grid.z = nParity;
233  }
234 
235  void defaultTuneParam(TuneParam &param) const
236  {
237  Tunable::defaultTuneParam(param);
238  param.block.y = 1;
239  param.grid.y = NYW;
240  param.grid.z = nParity;
241  }
242 
243  void preTune()
244  {
245  for (int i = 0; i < NYW; ++i) {
246  arg.Y[i].backup(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
247  arg.W[i].backup(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
248  }
249  }
250 
251  void postTune()
252  {
253  for (int i = 0; i < NYW; ++i) {
254  arg.Y[i].restore(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
255  arg.W[i].restore(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
256  }
257  }
258 
259  long long flops() const
260  {
261  return NYW * NXZ * arg.r.flops() * vec_length<FloatN>::value * (long long)arg.length * nParity * M;
262  }
263 
264  long long bytes() const
265  {
266  // this will be wrong when mixed precision is added
267  return NYW * NXZ * arg.r.streams() * x[0]->Bytes();
268  }
269 
270  int tuningIter() const { return 3; }
271  };
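 // MultiReduceCuda wraps one batched reduction launch as a Tunable: the autotuner sweeps
 // block sizes up to maxBlockSize(), preTune()/postTune() back up and restore the
 // writable Y and W fields, and tuningIter() == 3 means each candidate launch
 // configuration is timed over three iterations.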
272 
273  template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename yType, int M, int NXZ,
274  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write, typename T>
275  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
276  std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y, std::vector<ColorSpinorField *> &z,
277  std::vector<ColorSpinorField *> &w, int length)
278  {
279 
280  const int NYW = y.size();
281 
282  memset(result, 0, NXZ * NYW * sizeof(doubleN));
283 
284  const int N_MAX = NXZ > NYW ? NXZ : NYW;
285  const int N_MIN = NXZ < NYW ? NXZ : NYW;
286 
287  static_assert(MAX_MULTI_BLAS_N * MAX_MULTI_BLAS_N <= QUDA_MAX_MULTI_REDUCE,
288  "MAX_MULTI_BLAS_N^2 exceeds maximum number of reductions");
289  static_assert(MAX_MULTI_BLAS_N <= 16, "MAX_MULTI_BLAS_N exceeds maximum size 16");
290  if (N_MAX > MAX_MULTI_BLAS_N)
291  errorQuda("Spinor vector length exceeds max size (%d > %d)", N_MAX, MAX_MULTI_BLAS_N);
292 
293  if (NXZ * NYW * sizeof(Complex) > MAX_MATRIX_SIZE)
294  errorQuda("A matrix exceeds max size (%lu > %d)", NXZ * NYW * sizeof(Complex), MAX_MATRIX_SIZE);
295 
296  for (int i = 0; i < N_MIN; i++) {
297  checkSpinor(*x[i], *y[i]);
298  checkSpinor(*x[i], *z[i]);
299  checkSpinor(*x[i], *w[i]);
300  if (!x[i]->isNative()) {
301  warningQuda("Reductions on non-native fields are not supported\n");
302  return;
303  }
304  }
305 
306  typedef typename scalar<RegType>::type Float;
307  typedef typename vector<Float, 2>::type Float2;
308  typedef vector<Float, 2> vec2;
309 
310 #ifdef JITIFY
311  // need to get constants pointer from jitify instance
312  if (a.use_const || b.use_const || c.use_const)
313  errorQuda("Constant memory buffer support not enabled with jitify yet");
314 #endif
315 
316  // FIXME - if NXZ=1 no need to copy entire array
317  // FIXME - do we really need strided access here?
318  if (a.data && a.use_const) {
319  Float2 A[MAX_MATRIX_SIZE / sizeof(Float2)];
320  // since the kernel doesn't know the width of the matrix at compile
321  // time we stride it and copy the padded matrix to the GPU
322  for (int i = 0; i < NXZ; i++)
323  for (int j = 0; j < NYW; j++) A[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(a.data[NYW * i + j]));
324 
325  cudaMemcpyToSymbolAsync(Amatrix_d, A, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
326  Amatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(a.data));
327  }
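 // Note on the padding above: the host coefficient matrix is NXZ x NYW in row-major
 // order (a.data[NYW * i + j]), while the constant-memory buffer is always strided by
 // MAX_MULTI_BLAS_N (A[MAX_MULTI_BLAS_N * i + j]) so the kernel can index it without
 // knowing NYW at compile time.  The B and C matrices below are treated the same way.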
328 
329  if (b.data && b.use_const) {
330  Float2 B[MAX_MATRIX_SIZE / sizeof(Float2)];
331  // since the kernel doesn't know the width of the matrix at compile
332  // time we stride it and copy the padded matrix to the GPU
333  for (int i = 0; i < NXZ; i++)
334  for (int j = 0; j < NYW; j++) B[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(b.data[NYW * i + j]));
335 
336  cudaMemcpyToSymbolAsync(Bmatrix_d, B, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
337  Bmatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(b.data));
338  }
339 
340  if (c.data && c.use_const) {
341  Float2 C[MAX_MATRIX_SIZE / sizeof(Float2)];
342  // since the kernel doesn't know the width of the matrix at compile
343  // time we stride it and copy the padded matrix to the GPU
344  for (int i = 0; i < NXZ; i++)
345  for (int j = 0; j < NYW; j++) C[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(c.data[NYW * i + j]));
346 
347  cudaMemcpyToSymbolAsync(Cmatrix_d, C, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
348  Cmatrix_h = reinterpret_cast<signed char *>(const_cast<T *>(c.data));
349  }
350 
355 
356  for (int i = 0; i < NXZ; i++) {
357  X[i].set(*dynamic_cast<cudaColorSpinorField *>(x[i]));
358  Z[i].set(*dynamic_cast<cudaColorSpinorField *>(z[i]));
359  }
360  for (int i = 0; i < NYW; i++) {
361  Y[i].set(*dynamic_cast<cudaColorSpinorField *>(y[i]));
362  W[i].set(*dynamic_cast<cudaColorSpinorField *>(w[i]));
363  }
364 
365  Reducer<NXZ, ReduceType, Float2, RegType> r(a, b, c, NYW);
366 
370  reduce(result, X, Y, Z, W, r, x, y, z, w, NYW, length);
371  reduce.apply(*blas::getStream());
372 
373  blas::bytes += reduce.bytes();
374  blas::flops += reduce.flops();
375 
376  checkCudaError();
377  }
378 
382  template <int NXZ, typename doubleN, typename ReduceType,
383  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write,
384  bool siteUnroll, typename T>
385  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
386  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
387  CompositeColorSpinorField &w)
388  {
389  const int NYW = y.size();
390 
391  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
392 
393  QudaPrecision precision = checkPrecision(*x[0], *y[0], *z[0], *w[0]);
394 
395  if (precision == QUDA_DOUBLE_PRECISION) {
396 
397 #if QUDA_PRECISION & 8
398  if (x[0]->Nspin() == 4 || x[0]->Nspin() == 2) { // wilson
399 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID)
400  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
401  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
402  multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
403  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
404 #else
405  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
406 #endif
407  } else if (x[0]->Nspin() == 1) {
408 #ifdef GPU_STAGGERED_DIRAC
409  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
410  multiReduce<doubleN, ReduceType, double2, double2, double2, M, NXZ, Reducer, write>(
411  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
412 #else
413  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
414 #endif
415  } else {
416  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
417  }
418 #else
419  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
420 #endif
421 
422  } else if (precision == QUDA_SINGLE_PRECISION) {
423 
424 #if QUDA_PRECISION & 4
425  if (x[0]->Nspin() == 4) { // wilson
426 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
427  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
428  multiReduce<doubleN, ReduceType, float4, float4, float4, M, NXZ, Reducer, write>(
429  result, a, b, c, x, y, z, w, reduce_length / (4 * M));
430 #else
431  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
432 #endif
433  } else if (x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
434 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
435  const int M = siteUnroll ? 3 : 1;
436  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
437  multiReduce<doubleN, ReduceType, float2, float2, float2, M, NXZ, Reducer, write>(
438  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
439 #else
440  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
441 #endif
442  } else {
443  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
444  }
445 #else
446  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
447 #endif
448 
449  } else if (precision == QUDA_HALF_PRECISION) { // half precision
450 
451 #if QUDA_PRECISION & 2
452  if (x[0]->Nspin() == 4) { // wilson
453 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
454  const int M = 6;
455  multiReduce<doubleN, ReduceType, float4, short4, short4, M, NXZ, Reducer, write>(
456  result, a, b, c, x, y, z, w, x[0]->Volume());
457 #else
458  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
459 #endif
460  } else if (x[0]->Nspin() == 1) { // staggered
461 #ifdef GPU_STAGGERED_DIRAC
462  const int M = 3;
463  multiReduce<doubleN, ReduceType, float2, short2, short2, M, NXZ, Reducer, write>(
464  result, a, b, c, x, y, z, w, x[0]->Volume());
465 #else
466  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
467 #endif
468  } else {
469  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
470  }
471 #else
472  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
473 #endif
474 
475  } else if (precision == QUDA_QUARTER_PRECISION) { // quarter precision
476 
477 #if QUDA_PRECISION & 1
478  if (x[0]->Nspin() == 4) { // wilson
479 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
480  const int M = 6;
481  multiReduce<doubleN, ReduceType, float4, char4, char4, M, NXZ, Reducer, write>(
482  result, a, b, c, x, y, z, w, x[0]->Volume());
483 #else
484  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
485 #endif
486  } else if (x[0]->Nspin() == 1) { // staggered
487 #ifdef GPU_STAGGERED_DIRAC
488  const int M = 3;
489  multiReduce<doubleN, ReduceType, float2, char2, char2, M, NXZ, Reducer, write>(
490  result, a, b, c, x, y, z, w, x[0]->Volume());
491 #else
492  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
493 #endif
494  } else {
495  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
496  }
497 #else
498  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, precision);
499 #endif
500  } else {
501  errorQuda("Precision %d not supported\n", precision);
502  }
503  }
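 // In the dispatch above, M sets how many FloatN-sized chunks each thread processes per
 // site (e.g. 12 double2 chunks for an unrolled Wilson spinor, 3 for staggered), so the
 // length handed down is reduce_length divided by the vector width times M; the
 // fixed-point paths pass the site volume instead.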
504 
508  template <int NXZ, typename doubleN, typename ReduceType,
509  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write,
510  bool siteUnroll, typename T>
511  void mixedMultiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
512  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
513  CompositeColorSpinorField &w)
514  {
515  const int NYW = y.size();
516 
517  checkPrecision(*x[0], *z[0]);
518  checkPrecision(*y[0], *w[0]);
519 
520  assert(siteUnroll == true);
521  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
522 
523  if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_SINGLE_PRECISION) {
524 
525  if (x[0]->Nspin() == 4) { // wilson
526 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
527  const int M = 12; // determines how much work per thread to do
528  multiReduce<doubleN, ReduceType, double2, float4, double2, M, NXZ, Reducer, write>(
529  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
530 #else
531  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
532 #endif
533  } else if (x[0]->Nspin() == 1) {
534 #ifdef GPU_STAGGERED_DIRAC
535  const int M = 3; // determines how much work per thread to do
536  multiReduce<doubleN, ReduceType, double2, float2, double2, M, NXZ, Reducer, write>(
537  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
538 #else
539  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
540 #endif
541  } else {
542  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
543  }
544 
545  } else if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
546 
547  if (x[0]->Nspin() == 4) { // wilson
548 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
549  const int M = 6; // determines how much work per thread to do
550  multiReduce<doubleN, ReduceType, double2, short4, double2, M, NXZ, Reducer, write>(
551  result, a, b, c, x, y, z, w, reduce_length / (4 * M));
552 #else
553  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
554 #endif
555  } else if (x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
556 #if defined(GPU_STAGGERED_DIRAC)
557  const int M = 3;
558  multiReduce<doubleN, ReduceType, double2, short2, double2, M, NXZ, Reducer, write>(
559  result, a, b, c, x, y, z, w, reduce_length / (2 * M));
560 #else
561  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
562 #endif
563  } else {
564  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
565  }
566 
567  } else if (y[0]->Precision() == QUDA_SINGLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
568 
569  if (x[0]->Nspin() == 4) { // wilson
570 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
571  const int M = 6;
572  multiReduce<doubleN, ReduceType, float4, short4, float4, M, NXZ, Reducer, write>(
573  result, a, b, c, x, y, z, w, x[0]->Volume());
574 #else
575  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
576 #endif
577  } else if (x[0]->Nspin() == 1) { // staggered
578 #ifdef GPU_STAGGERED_DIRAC
579  const int M = 3;
580  multiReduce<doubleN, ReduceType, float2, short2, float2, M, NXZ, Reducer, write>(
581  result, a, b, c, x, y, z, w, x[0]->Volume());
582 #else
583  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
584 #endif
585  } else {
586  errorQuda("nSpin=%d is not supported\n", x[0]->Nspin());
587  }
588 
589  } else {
590  errorQuda("Precision combination x=%d y=%d not supported\n", x[0]->Precision(), y[0]->Precision());
591  }
592  }
593 
594  template <int NXZ, typename doubleN, typename ReduceType,
595  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
596  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal,
597  typename writeOffDiagonal, bool siteUnroll, typename T>
598  void multiReduce(doubleN result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
599  CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
600  CompositeColorSpinorField &w, int i, int j)
601  {
602 
603  if (x[0]->Precision() == y[0]->Precision()) {
604  if (i == j) { // we are on the diagonal so invoke the diagonal reducer
605  multiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, siteUnroll, T>(
606  result, a, b, c, x, y, z, w);
607  } else { // we are off the diagonal so invoke the off-diagonal reducer
608  multiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, siteUnroll, T>(
609  result, a, b, c, x, y, z, w);
610  }
611  } else {
612  if (i == j) { // we are on the diagonal so invoke the diagonal reducer
613  mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerDiagonal, writeDiagonal, true, T>(
614  result, a, b, c, x, y, z, w);
615  } else { // we are off the diagonal so invoke the off-diagonal reducer
616  mixedMultiReduce<NXZ, doubleN, ReduceType, ReducerOffDiagonal, writeOffDiagonal, true, T>(
617  result, a, b, c, x, y, z, w);
618  }
619  }
620  }
621 
622  void reDotProduct(double* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
623 #ifndef SSTEP
624  errorQuda("S-step code not built\n");
625 #else
626  switch(x.size()){
627  case 1:
628  multiReduce<1, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
629  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
630  break;
631  case 2:
632  multiReduce<2, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
633  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
634  break;
635  case 3:
636  multiReduce<3, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
637  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
638  break;
639  case 4:
640  multiReduce<4, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
641  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
642  break;
643  case 5:
644  multiReduce<5, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
645  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
646  break;
647  case 6:
648  multiReduce<6, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
649  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
650  break;
651  case 7:
652  multiReduce<7, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
653  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
654  break;
655  case 8:
656  multiReduce<8, double, QudaSumFloat, Dot, 0, 0, 0, 0, false>(
657  result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
658  break;
659  /*case 9:
660  multiReduce<9,double,QudaSumFloat,Dot,0,0,0,0,false>
661  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
662  break;
663  case 10:
664  multiReduce<10,double,QudaSumFloat,Dot,0,0,0,0,false>
665  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
666  break;
667  case 11:
668  multiReduce<11,double,QudaSumFloat,Dot,0,0,0,0,false>
669  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
670  break;
671  case 12:
672  multiReduce<12,double,QudaSumFloat,Dot,0,0,0,0,false>
673  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
674  break;
675  case 13:
676  multiReduce<13,double,QudaSumFloat,Dot,0,0,0,0,false>
677  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
678  break;
679  case 14:
680  multiReduce<14,double,QudaSumFloat,Dot,0,0,0,0,false>
681  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
682  break;
683  case 15:
684  multiReduce<15,double,QudaSumFloat,Dot,0,0,0,0,false>
685  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
686  break;
687  case 16:
688  multiReduce<16,double,QudaSumFloat,Dot,0,0,0,0,false>
689  (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
690  break;*/
691  default:
692  errorQuda("Unsupported vector size");
693  break;
694  }
695 #endif // SSTEP
696  // do a single multi-node reduction only once we have computed all local dot products
697  const int Nreduce = x.size()*y.size();
698  reduceDoubleArray((double*)result, Nreduce);
699  }
700 
701 
702  // This function does the outer product of dot products... in column major.
703  // There's a function below called 'cDotProduct' that flips it to row major.
704  template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
705  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal, typename writeOffDiagonal>
706  void multiReduce_recurse(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
707  std::vector<ColorSpinorField*>&z, std::vector<ColorSpinorField*>&w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size) {
708 
709  if (y.size() > tile_size) // if greater than max single-kernel size, split and recurse
710  {
711  // Do the recurse first.
712  Complex* result0 = &result[0];
713  Complex* result1 = &result[x.size()*(y.size()/2)];
714  std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
715  std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
716  std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
717  std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
718  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x, y0, z, w0, i_idx, 2*j_idx+0, hermitian, tile_size);
719  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x, y1, z, w1, i_idx, 2*j_idx+1, hermitian, tile_size);
720  }
721  else
722  {
723  double2* cdot = new double2[x.size()*y.size()];
724 
725  // if at bottom of recursion, return if on lower left
726  if (x.size() <= tile_size && hermitian) {
727  if (j_idx < i_idx) { delete[] cdot; return; } // free the tile buffer before the early return
728  }
729 
730  coeff_array<Complex> a, b, c;
731 
732  if (x.size() <= tile_size) {
733  switch(x.size()){ // COMMENT HERE FOR COMPILE TIME
734  case 1:
735  multiReduce<1, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
736  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
737  break;
738 #if MAX_MULTI_BLAS_N >= 2
739  case 2:
740  multiReduce<2, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
741  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
742  break;
743 #if MAX_MULTI_BLAS_N >= 3
744  case 3:
745  multiReduce<3, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
746  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
747  break;
748 #if MAX_MULTI_BLAS_N >= 4
749  case 4:
750  multiReduce<4, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
751  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
752  break;
753 #if MAX_MULTI_BLAS_N >= 5
754  case 5:
755  multiReduce<5, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
756  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
757  break;
758 #if MAX_MULTI_BLAS_N >= 6
759  case 6:
760  multiReduce<6, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
761  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
762  break;
763 #if MAX_MULTI_BLAS_N >= 7
764  case 7:
765  multiReduce<7, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
766  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
767  break;
768 #if MAX_MULTI_BLAS_N >= 8
769  case 8:
770  multiReduce<8, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
771  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
772  break;
773 #if MAX_MULTI_BLAS_N >= 9
774  case 9:
775  multiReduce<9, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
776  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
777  break;
778 #if MAX_MULTI_BLAS_N >= 10
779  case 10:
780  multiReduce<10, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
781  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
782  break;
783 #if MAX_MULTI_BLAS_N >= 11
784  case 11:
785  multiReduce<11, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
786  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
787  break;
788 #if MAX_MULTI_BLAS_N >= 12
789  case 12:
790  multiReduce<12, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
791  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
792  break;
793 #if MAX_MULTI_BLAS_N >= 13
794  case 13:
795  multiReduce<13, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
796  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
797  break;
798 #if MAX_MULTI_BLAS_N >= 14
799  case 14:
800  multiReduce<14, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
801  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
802  break;
803 #if MAX_MULTI_BLAS_N >= 15
804  case 15:
805  multiReduce<15, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
806  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
807  break;
808 #if MAX_MULTI_BLAS_N >= 16
809  case 16:
810  multiReduce<16, double2, QudaSumFloat2, ReducerDiagonal, writeDiagonal, ReducerOffDiagonal, writeOffDiagonal, false>(
811  cdot, a, b, c, x, y, z, w, i_idx, j_idx);
812  break;
813 #endif //16
814 #endif //15
815 #endif //14
816 #endif //13
817 #endif //12
818 #endif //11
819 #endif //10
820 #endif // 9
821 #endif // 8
822 #endif // 7
823 #endif // 6
824 #endif // 5
825 #endif // 4
826 #endif // 3
827 #endif // 2
828  }
829  } else {
830  // split the problem and recurse. Splitting in x requires
831  // memory reshuffling (unless y = 1).
832  // Use a few temporary variables.
833 
834  Complex* tmpmajor = new Complex[x.size()*y.size()];
835  Complex* result0 = &tmpmajor[0];
836  Complex* result1 = &tmpmajor[(x.size()/2)*y.size()];
837  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
838  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
839  std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
840  std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
841 
842  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x0, y, z0, w, 2*i_idx+0, j_idx, hermitian, tile_size);
843  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x1, y, z1, w, 2*i_idx+1, j_idx, hermitian, tile_size);
844 
845  const unsigned int xlen0 = x.size()/2;
846  const unsigned int xlen1 = x.size() - xlen0;
847  const unsigned int ylen = y.size();
848 
849  // Copy back into result.
850  int count = 0, count0 = 0, count1 = 0;
851  for (unsigned int i = 0; i < ylen; i++)
852  {
853  for (unsigned int j = 0; j < xlen0; j++)
854  result[count++] = result0[count0++];
855  for (unsigned int j = 0; j < xlen1; j++)
856  result[count++] = result1[count1++];
857  }
858 
859  delete[] tmpmajor;
860  }
861 
862  // we are at the leaf of the binary tree (i.e., we ran the kernel): perform the row-to-column-major transpose here.
863  if (x.size() <= tile_size)
864  {
865  const unsigned int xlen = x.size();
866  const unsigned int ylen = y.size();
867  for (unsigned int j = 0; j < xlen; j++)
868  for (unsigned int i = 0; i < ylen; i++)
869  result[i*xlen+j] = Complex(cdot[j*ylen + i].x, cdot[j*ylen+i].y);
870  }
871  delete[] cdot;
872  }
873  }
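 // Illustrative (not compiled) trace of the recursion above for a 4 x 4 problem with
 // tile_size = 2: the y set is split first, then x, so four 2 x 2 tiles are launched
 // and their column-major partial results are stitched back into the caller's buffer.
 //   multiReduce_recurse(result, x[0..3], y[0..3], ..., tile_size = 2)
 //     -> split y into y[0..1], y[2..3]
 //        -> split x into x[0..1], x[2..3]   (each leaf runs one 2 x 2 kernel)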
874 
875 
876  template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal,
877  typename writeDiagonal,
878  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal,
879  typename writeOffDiagonal>
880  class TileSizeTune : public Tunable {
881  typedef std::vector<ColorSpinorField*> vec;
882  Complex *result;
883  vec &x, &y, &z, &w;
884  bool hermitian;
885  bool Anorm;
886 
887  unsigned int sharedBytesPerThread() const { return 0; }
888  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
889 
890  unsigned int max_tile_size;
891 
892  public:
893  TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm = false)
894  : result(result), x(x), y(y), z(z), w(w), hermitian(hermitian), Anorm(Anorm), max_tile_size(1)
895  {
896  strcpy(aux, "policy,");
897  strcat(aux, x[0]->AuxString());
898  strcat(aux, ",");
899  strcat(aux, y[0]->AuxString());
900  if (hermitian) strcat(aux, ",hermitian");
901  if (Anorm) strcat(aux, ",Anorm");
902  strcat(aux,",n=");
903  char size[8];
904  u64toa(size, x.size());
905  strcat(aux,size);
906  strcat(aux,",m=");
907  u64toa(size, y.size());
908  strcat(aux,size);
909  u64toa(size, MAX_MULTI_BLAS_N);
910  strcat(aux, ",multi-blas-n=");
911  strcat(aux, size);
912 
913  // before we do policy tuning we must ensure the kernel
914  // constituents have been tuned since we can't do nested tuning
915  // FIXME this will break if the kernels are destructive - which they aren't here
916  if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
917  disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
918 
919  if (x.size() == 1 || y.size() == 1) { // 1-d reduction
920 
921  max_tile_size = std::min(MAX_MULTI_BLAS_N, (int)std::max(x.size(), y.size()));
922 
923  // Make sure constituents are tuned.
924  for ( unsigned int tile_size=1; tile_size <= max_tile_size; tile_size++) {
925  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
926  (result, x, y, z, w, 0, 0, hermitian, tile_size);
927  }
928 
929  } else { // 2-d reduction
930 
931  // max_tile_size should be set to the largest power of 2 not exceeding
932  // MAX_MULTI_BLAS_N, since we have a requirement that the
933  // tile size is a power of 2.
934  unsigned int max_count = 0;
935  unsigned int tile_size_tmp = MAX_MULTI_BLAS_N;
936  while (tile_size_tmp != 1) { tile_size_tmp = tile_size_tmp >> 1; max_count++; }
937  tile_size_tmp = 1;
938  for (unsigned int i = 0; i < max_count; i++) { tile_size_tmp = tile_size_tmp << 1; }
939  max_tile_size = tile_size_tmp;
940 
941  // Make sure constituents are tuned.
942  for ( unsigned int tile_size=1; tile_size <= max_tile_size && tile_size <= x.size() &&
943  (tile_size <= y.size() || y.size()==1) ; tile_size*=2) {
944  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
945  (result, x, y, z, w, 0, 0, hermitian, tile_size);
946  }
947 
948  // also test case using a single kernel if both dimensions
949  // are less than MAX_MULTI_BLAS_N
950  if (x.size() <= MAX_MULTI_BLAS_N && y.size() <= MAX_MULTI_BLAS_N) {
951  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
952  (result, x, y, z, w, 0, 0, hermitian, MAX_MULTI_BLAS_N);
953  }
954  }
955 
956  enableProfileCount();
957  setPolicyTuning(true);
958  }
959  }
960 
961  virtual ~TileSizeTune() { setPolicyTuning(false); }
962 
963  void apply(const cudaStream_t &stream) {
964  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
965 
966  // tp.aux.x is where the tile size is stored. "tp" is the tuning struct.
967  // it contains blocksize, grid size, etc. Since we're only tuning
968  // a policy, we don't care about those sizes. That's why we only
969  // tune "aux.x", which is the tile size.
970  multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
971  (result, x, y, z, w, 0, 0, hermitian, tp.aux.x);
972  }
973 
974  // aux.x is the tile size
975  bool advanceAux(TuneParam &param) const
976  {
977  if ( x.size()==1 || y.size()==1 ) { // 1-d reduction
978 
979  param.aux.x++;
980  if ( (unsigned int)param.aux.x <= max_tile_size ) {
981  return true;
982  } else {
983  param.aux.x = 1;
984  return false;
985  }
986 
987  } else { // 2-d reduction
988 
989  if ( (unsigned int)(2*param.aux.x) <= max_tile_size &&
990  (unsigned int)(2*param.aux.x) <= x.size() &&
991  (unsigned int)(2*param.aux.x) <= y.size() ) {
992  param.aux.x *= 2; // only tune powers of two
993  return true;
994  } else if (x.size() <= MAX_MULTI_BLAS_N && y.size() <= MAX_MULTI_BLAS_N && param.aux.x < MAX_MULTI_BLAS_N) {
995  // we've run out of power of two tiles to try, but before
996  // we finish, try a single kernel if it fits
997  param.aux.x = MAX_MULTI_BLAS_N;
998  return true;
999  } else {
1000  param.aux.x = 1; // reset to the beginning (which we'd need for multi-dimensional tuning)
1001  return false;
1002  }
1003 
1004  }
1005  }
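 // Example of the tile sizes advanceAux steps through, assuming MAX_MULTI_BLAS_N = 8 and
 // a 16 x 16 problem: 1, 2, 4, 8 (powers of two up to max_tile_size).  The 1-d case
 // instead counts 1, 2, ..., max_tile_size.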
1006 
1007  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
1008 
1009  void initTuneParam(TuneParam &param) const {
1010  Tunable::initTuneParam(param);
1011  param.aux.x = 1; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
1012  }
1013 
1014  void defaultTuneParam(TuneParam &param) const {
1015  Tunable::defaultTuneParam(param); // default is max tile size
1016  // max_tile_size is MAX_MULTI_BLAS_N rounded down to the nearest power of 2.
1017  param.aux.x = max_tile_size; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
1018  }
1019 
1020  TuneKey tuneKey() const {
1021  return TuneKey(x[0]->VolString(), typeid(*this).name(), aux);
1022  }
1023 
1024  long long flops() const { return 0; } // FIXME
1025  long long bytes() const { return 0; } // FIXME
1026 
1027  void preTune() { } // FIXME - use write to determine what needs to be saved
1028  void postTune() { } // FIXME - use write to determine what needs to be saved
1029  };
1030 
1031  void cDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1032  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1033  Complex* result_tmp = new Complex[x.size()*y.size()];
1034  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1035 
1036  // cDotProduct_recurse returns a column-major matrix.
1037  // To be consistent with the multi-blas functions, we should
1038  // switch this to row-major.
1039  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, false);
1040  tile.apply(0);
1041 
1042  // do a single multi-node reduction only once we have computed all local dot products
1043  const int Nreduce = 2*x.size()*y.size();
1044  reduceDoubleArray((double*)result_tmp, Nreduce);
1045 
1046  // Switch from col-major to row-major
1047  const unsigned int xlen = x.size();
1048  const unsigned int ylen = y.size();
1049  for (unsigned int j = 0; j < xlen; j++)
1050  for (unsigned int i = 0; i < ylen; i++)
1051  result[j*ylen+i] = result_tmp[i*xlen + j];
1052 
1053  delete[] result_tmp;
1054  }
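 // After the transpose above, result is row-major with x indexing the rows, i.e.
 // result[j * y.size() + i] is the inner product between x[j] and y[i].  A hypothetical
 // usage sketch (the field vectors are placeholders):
 //   std::vector<ColorSpinorField *> x = ..., y = ...;
 //   std::vector<Complex> dots(x.size() * y.size());
 //   blas::cDotProduct(dots.data(), x, y);  // dots[j * y.size() + i] = (x_j, y_i)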
1055 
1056  void hDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1057  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1058  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs");
1059 
1060  Complex* result_tmp = new Complex[x.size()*y.size()];
1061  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1062 
1063  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true, false); // last false is b/c L2 norm
1064  tile.apply(0);
1065 
1066  // do a single multi-node reduction only once we have computed all local dot products
1067  const int Nreduce = 2*x.size()*y.size();
1068  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
1069 
1070  // Switch from col-major to row-major
1071  const unsigned int xlen = x.size();
1072  const unsigned int ylen = y.size();
1073  for (unsigned int j = 0; j < xlen; j++)
1074  for (unsigned int i = j; i < ylen; i++) {
1075  result[j*ylen+i] = result_tmp[i*xlen + j];
1076  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
1077  }
1078 
1079  delete[] result_tmp;
1080  }
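 // hDotProduct computes only the upper triangle of the tile grid (hermitian = true makes
 // multiReduce_recurse skip tiles with j_idx < i_idx) and the copy-out loop above
 // reconstructs the lower triangle by complex conjugation.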
1081 
1082  // for (p, Ap) norms in CG which are Hermitian.
1083  void hDotProduct_Anorm(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
1084  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1085  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs");
1086 
1087  Complex* result_tmp = new Complex[x.size()*y.size()];
1088  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1089 
1090  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true, true); // last true is b/c A norm
1091  tile.apply(0);
1092 
1093  // do a single multi-node reduction only once we have computed all local dot products
1094  const int Nreduce = 2*x.size()*y.size();
1095  reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
1096 
1097  // Switch from col-major to row-major
1098  const unsigned int xlen = x.size();
1099  const unsigned int ylen = y.size();
1100  for (unsigned int j = 0; j < xlen; j++)
1101  for (unsigned int i = j; i < ylen; i++) {
1102  result[j*ylen+i] = result_tmp[i*xlen + j];
1103  result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
1104  }
1105 
1106  delete[] result_tmp;
1107  }
1108 
1109  // takes the outer product of inner products between x and y and copies y into z
1110  void cDotProductCopy(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
1111  std::vector<ColorSpinorField*>&z){
1112 
1113 #if 0
1114  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
1115  if (y.size() != z.size()) errorQuda("Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());
1116 
1117  Complex* result_tmp = new Complex[x.size()*y.size()];
1118  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
1119 
1120  // When recursing, only the diagonal tiles will do the copy, the rest just do the outer product
1121  TileSizeTune<CdotCopy,write<0,0,0,1>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true);
1122  tile.apply(0);
1123 
1124  // do a single multi-node reduction only once we have computed all local dot products
1125  const int Nreduce = 2*x.size()*y.size();
1126  reduceDoubleArray((double*)result_tmp, Nreduce);
1127 
1128  // Switch from col-major to row-major.
1129  const unsigned int xlen = x.size();
1130  const unsigned int ylen = y.size();
1131  for (unsigned int j = 0; j < xlen; j++)
1132  for (unsigned int i = 0; i < ylen; i++)
1133  result[j*ylen+i] = result_tmp[i*xlen + j];
1134 
1135  delete[] result_tmp;
1136 #else
1137  errorQuda("cDotProductCopy not enabled");
1138 #endif
1139  }
1140 
1141  } // namespace blas
1142 
1143 } // namespace quda