QUDA  1.0.0
reduce_quda.cu
1 #include <atomic>
2 #include <blas_quda.h>
3 #include <tune_quda.h>
4 #include <float_vector.h>
6 
7 #include <launch_kernel.cuh>
8 #include <jitify_helper.cuh>
10 
11 // These are used for reduction kernels
12 static QudaSumFloat *d_reduce;
13 static QudaSumFloat *h_reduce;
14 static QudaSumFloat *hd_reduce;
15 static cudaEvent_t reduceEnd;
16 static bool fast_reduce_enabled = false;
17 
18 namespace quda {
19 
20  namespace blas {
21 
22 #include <generic_reduce.cuh>
23 
24  cudaStream_t* getStream();
25 
26  void* getDeviceReduceBuffer() { return d_reduce; }
27  void* getMappedHostReduceBuffer() { return hd_reduce; }
28  void* getHostReduceBuffer() { return h_reduce; }
29  cudaEvent_t* getReduceEvent() { return &reduceEnd; }
30  bool getFastReduce() { return fast_reduce_enabled; }
31 
32  void initFastReduce(int32_t words)
33  {
34  // initialize the reduction values in 32-bit increments to INT_MIN
35  for (int32_t i = 0; i < words; i++) {
36  reinterpret_cast<int32_t *>(h_reduce)[i] = std::numeric_limits<int32_t>::min();
37  }
38 
39  // ensure that the host memory write is complete before we launch the kernel
40  atomic_thread_fence(std::memory_order_release);
41  }
42 
43  void completeFastReduce(int32_t words)
44  {
45  volatile int32_t *check = reinterpret_cast<int32_t *>(h_reduce);
46  int count = 0;
47  int complete = 0;
48  while (complete < words) {
49  // ensure visibility of any changes in memory
50  atomic_thread_fence(std::memory_order_acquire);
51 
52  complete = 0;
53  for (int32_t i = 0; i < words; i++) {
54  // spin-wait until all values have been updated
55  if (check[i] != std::numeric_limits<int32_t>::min()) complete++;
56  }
57  if (count++ % 10000 == 0) { // check error every 10000 iterations
58  // if there is an error in the kernel then we need to exit the spin-wait
59  if (cudaSuccess != cudaPeekAtLastError()) break;
60  }
61  }
62  }
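The two helpers above form a host/device handshake: initFastReduce() seeds every 32-bit word of the mapped reduction buffer with the sentinel INT_MIN and issues a release fence, and completeFastReduce() spin-waits until the kernel has overwritten every word, breaking out if a launch error is detected so the poll cannot hang. A minimal self-contained sketch of the same pattern (illustrative only, not QUDA code; the kernel, buffer name and sizes here are invented):

#include <cuda_runtime.h>
#include <atomic>
#include <climits>
#include <cstdint>
#include <cstdio>

__global__ void fill(int32_t *out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = i + 1; // write anything other than the sentinel
}

int main()
{
  const int n = 256;
  int32_t *h_buf = nullptr, *d_buf = nullptr;
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaHostAlloc((void **)&h_buf, n * sizeof(int32_t), cudaHostAllocMapped);
  cudaHostGetDevicePointer((void **)&d_buf, h_buf, 0);

  for (int i = 0; i < n; i++) h_buf[i] = INT_MIN;       // seed the sentinel
  std::atomic_thread_fence(std::memory_order_release);  // publish before launching

  fill<<<(n + 127) / 128, 128>>>(d_buf, n);

  volatile int32_t *check = h_buf;
  int complete = 0, count = 0;
  while (complete < n) {                                 // spin until every word is updated
    std::atomic_thread_fence(std::memory_order_acquire);
    complete = 0;
    for (int i = 0; i < n; i++)
      if (check[i] != INT_MIN) complete++;
    if (count++ % 10000 == 0 && cudaPeekAtLastError() != cudaSuccess) break;
  }
  printf("completed %d of %d words\n", complete, n);
  cudaFreeHost(h_buf);
  return 0;
}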
63 
64  void initReduce()
65  {
66  /* we have these different reductions to cater for:
67 
68  - regular reductions (reduce_quda.cu) where we are reducing to a
69  single vector type (max length 4 presently), with a possible
70  parity dimension, and a grid-stride loop with max number of
71  blocks = 2 x SM count
72 
73  - multi-reductions where we are reducing to a matrix of size
74  MAX_MULTI_BLAS_N^2 of vectors (max length 4), with a
75  possible parity dimension, and a grid-stride loop with
76  maximum number of blocks = 2 x SM count
77  */
78 
79  const int reduce_size = 4 * sizeof(QudaSumFloat);
80  const int max_reduce_blocks = 2*deviceProp.multiProcessorCount;
81 
82  const int max_reduce = 2 * max_reduce_blocks * reduce_size;
83  const int max_multi_reduce = 2 * MAX_MULTI_BLAS_N * MAX_MULTI_BLAS_N * max_reduce_blocks * reduce_size;
84 
85  // reduction buffer size
86  size_t bytes = max_reduce > max_multi_reduce ? max_reduce : max_multi_reduce;
87 
88  if (!d_reduce) d_reduce = (QudaSumFloat *) device_malloc(bytes);
89 
90  // these arrays are actually oversized currently (they only need to be QudaSumFloat3)
91 
92  // if the device supports host-mapped memory then use a host-mapped array for the reduction
93  if (!h_reduce) {
94  // only use zero copy reductions when using 64-bit
95 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
96  if(deviceProp.canMapHostMemory) {
97  h_reduce = (QudaSumFloat *) mapped_malloc(bytes);
98  cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
99  } else
100 #endif
101  {
102  h_reduce = (QudaSumFloat *) pinned_malloc(bytes);
103  hd_reduce = d_reduce;
104  }
105  memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report h_reduce is uninitialised
106  }
107 
108  cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
109 
110  // enable fast reductions with CPU spin waiting as opposed to using CUDA events
111  char *fast_reduce_env = getenv("QUDA_ENABLE_FAST_REDUCE");
112  if (fast_reduce_env && strcmp(fast_reduce_env,"1") == 0) {
113  warningQuda("Experimental fast reductions enabled");
114  fast_reduce_enabled = true;
115  }
116 
117  checkCudaError();
118  }
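To make the sizing above concrete, here is the same arithmetic with assumed numbers (QudaSumFloat taken to be double, MAX_MULTI_BLAS_N taken to be 4, and an 80-SM device; the actual values depend on the build options and hardware):

#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t sum_float = sizeof(double); // assuming QudaSumFloat == double
  const int max_multi_blas_n  = 4;              // assumed MAX_MULTI_BLAS_N
  const int sm_count          = 80;             // assumed multiProcessorCount

  const std::size_t reduce_size       = 4 * sum_float; // one length-4 vector per block
  const int         max_reduce_blocks = 2 * sm_count;

  const std::size_t max_reduce       = 2 * max_reduce_blocks * reduce_size;
  const std::size_t max_multi_reduce = 2 * max_multi_blas_n * max_multi_blas_n
                                         * max_reduce_blocks * reduce_size;
  const std::size_t bytes = max_reduce > max_multi_reduce ? max_reduce : max_multi_reduce;

  // prints: reduce 10240 B, multi-reduce 163840 B, allocated 163840 B
  printf("reduce %zu B, multi-reduce %zu B, allocated %zu B\n", max_reduce, max_multi_reduce, bytes);
  return 0;
}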
119 
120  void endReduce(void)
121  {
122  if (d_reduce) {
123  device_free(d_reduce);
124  d_reduce = 0;
125  }
126  if (h_reduce) {
127  host_free(h_reduce);
128  h_reduce = 0;
129  }
130  hd_reduce = 0;
131 
132  cudaEventDestroy(reduceEnd);
133  }
134 
138  template <typename doubleN, typename ReduceType, typename FloatN, int M, typename Arg>
139  doubleN reduceLaunch(Arg &arg, const TuneParam &tp, const cudaStream_t &stream, Tunable &tunable)
140  {
141  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
142  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
143 
144  const int32_t words = tp.grid.y * sizeof(ReduceType) / sizeof(int32_t);
145  if (getFastReduce() && !commAsyncReduction()) initFastReduce(words);
146 
147 #ifdef JITIFY
148  using namespace jitify::reflection;
149  tunable.jitifyError() = program->kernel("quda::blas::reduceKernel")
150  .instantiate((int)tp.block.x, Type<ReduceType>(), Type<FloatN>(), M, Type<Arg>())
151  .configure(tp.grid, tp.block, tp.shared_bytes, stream)
152  .launch(arg);
153 #else
154  LAUNCH_KERNEL(reduceKernel, tp, stream, arg, ReduceType, FloatN, M);
155 #endif
156 
157  if (!commAsyncReduction()) {
158 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
159  if (deviceProp.canMapHostMemory) {
160  if (getFastReduce()) {
161  completeFastReduce(words);
162  } else {
163  qudaEventRecord(reduceEnd, stream);
164  while (cudaSuccess != qudaEventQuery(reduceEnd)) { ; }
165  }
166  } else
167 #endif
168  {
169  qudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost);
170  }
171  }
172  doubleN cpu_sum = set(((ReduceType *)h_reduce)[0]);
173  if (tp.grid.y == 2) sum(cpu_sum, ((ReduceType *)h_reduce)[1]); // add other parity if needed
174  return cpu_sum;
175  }
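For the fast-reduce path above, words is just the per-launch partial-result buffer measured in 32-bit units: e.g. a double3 reduction run over both parities has tp.grid.y = 2, giving 2 * 24 / 4 = 12 sentinel words to poll. A trivial check of that arithmetic (using a stand-in for the CUDA double3 type and an assumed parity count):

#include <cstdint>
#include <cstdio>

struct Double3 { double x, y, z; }; // stand-in for the double3 reduction type

int main()
{
  const int parities = 2; // tp.grid.y
  const int words = int(parities * sizeof(Double3) / sizeof(int32_t));
  printf("words = %d\n", words); // prints 12
  return 0;
}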
176 
177  template <typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX, typename SpinorY,
178  typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
179  class ReduceCuda : public Tunable
180  {
181 
182  private:
183  const int nParity; // for composite fields this includes the number of composites
184  ReductionArg<ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer> arg;
185  doubleN &result;
186 
187  const ColorSpinorField &x, &y, &z, &w, &v;
188 
189  // host pointers used for backing up fields when tuning
190  // these can't be curried into the Spinors because of Tesla argument length restriction
191  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
192  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
193 
194  unsigned int sharedBytesPerThread() const { return 0; }
195  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
196 
197  virtual bool advanceSharedBytes(TuneParam &param) const
198  {
199  TuneParam next(param);
200  advanceBlockDim(next); // to get next blockDim
201  int nthreads = next.block.x * next.block.y * next.block.z;
202  param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
203  sharedBytesPerThread() * nthreads :
204  sharedBytesPerBlock(param);
205  return false;
206  }
207 
208  public:
209  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer &r,
210  ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v,
211  int length) :
212  nParity((x.IsComposite() ? x.CompositeDim() : 1) * (x.SiteSubset())),
213  arg(X, Y, Z, W, V, r, length / nParity),
214  x(x),
215  y(y),
216  z(z),
217  w(w),
218  v(v),
219  result(result),
220  X_h(0),
221  Y_h(0),
222  Z_h(0),
223  W_h(0),
224  V_h(0),
225  Xnorm_h(0),
226  Ynorm_h(0),
227  Znorm_h(0),
228  Wnorm_h(0),
229  Vnorm_h(0)
230  {
231  strcpy(aux, x.AuxString());
232  if (x.Precision() != z.Precision()) {
233  strcat(aux, ",");
234  strcat(aux, z.AuxString());
235  }
236  if (getFastReduce()) strcat(aux, ",fast_reduce");
237 
238 #ifdef JITIFY
239  ::quda::create_jitify_program("kernels/reduce_core.cuh");
240 #endif
241  }
242  virtual ~ReduceCuda() {}
243 
244  inline TuneKey tuneKey() const { return TuneKey(x.VolString(), typeid(arg.r).name(), aux); }
245 
246  void apply(const cudaStream_t &stream)
247  {
248  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
249  result = reduceLaunch<doubleN, ReduceType, FloatN, M>(arg, tp, stream, *this);
250  }
251 
252  void preTune()
253  {
254  arg.X.backup(&X_h, &Xnorm_h, x.Bytes(), x.NormBytes());
255  arg.Y.backup(&Y_h, &Ynorm_h, y.Bytes(), y.NormBytes());
256  arg.Z.backup(&Z_h, &Znorm_h, z.Bytes(), z.NormBytes());
257  arg.W.backup(&W_h, &Wnorm_h, w.Bytes(), w.NormBytes());
258  arg.V.backup(&V_h, &Vnorm_h, v.Bytes(), v.NormBytes());
259  }
260 
261  void postTune()
262  {
263  arg.X.restore(&X_h, &Xnorm_h, x.Bytes(), x.NormBytes());
264  arg.Y.restore(&Y_h, &Ynorm_h, y.Bytes(), y.NormBytes());
265  arg.Z.restore(&Z_h, &Znorm_h, z.Bytes(), z.NormBytes());
266  arg.W.restore(&W_h, &Wnorm_h, w.Bytes(), w.NormBytes());
267  arg.V.restore(&V_h, &Vnorm_h, v.Bytes(), v.NormBytes());
268  }
269 
270  void initTuneParam(TuneParam &param) const
271  {
272  Tunable::initTuneParam(param);
273  param.grid.y = nParity;
274  }
275 
276  void defaultTuneParam(TuneParam &param) const
277  {
278  Tunable::defaultTuneParam(param);
279  param.grid.y = nParity;
280  }
281 
282  long long flops() const { return arg.r.flops() * vec_length<FloatN>::value * arg.length * nParity * M; }
283 
284  long long bytes() const
285  {
286  // the factor two here assumes we are reading and writing to the high precision vector
287  // this will evaluate correctly for non-mixed kernels since the +2/-2 will cancel out
288  return (arg.r.streams() - 2) * x.Bytes() + 2 * z.Bytes();
289  }
290 
291  int tuningIter() const { return 3; }
292  };
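The preTune()/postTune() hooks above exist because autotuning launches the kernel many times, and reductions with non-zero write flags modify their input fields; saving the device data to host buffers before the trial launches and restoring it afterwards keeps tuning side-effect free. A generic sketch of that backup/restore idea (not the QUDA Spinor API; the Field struct here is invented for illustration):

#include <cuda_runtime.h>
#include <cstdlib>

// Minimal stand-in for a device-resident field whose contents must survive tuning trials.
struct Field {
  void *dev = nullptr;   // device allocation
  size_t bytes = 0;
  char *mirror = nullptr;

  void backup()
  {
    mirror = static_cast<char *>(malloc(bytes));
    cudaMemcpy(mirror, dev, bytes, cudaMemcpyDeviceToHost);
  }
  void restore()
  {
    cudaMemcpy(dev, mirror, bytes, cudaMemcpyHostToDevice);
    free(mirror);
    mirror = nullptr;
  }
};

int main()
{
  Field f;
  f.bytes = 1024;
  cudaMalloc(&f.dev, f.bytes);
  f.backup();
  // ... launch trial kernels that may overwrite f.dev ...
  f.restore(); // field contents are as they were before tuning
  cudaFree(f.dev);
  return 0;
}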
293 
294  template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename zType, int M,
295  template <typename ReducerType, typename Float, typename FloatN> class Reducer, int writeX, int writeY,
296  int writeZ, int writeW, int writeV>
297  doubleN nativeReduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
298  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length)
299  {
300 
301  checkLength(x, y);
302  checkLength(x, z);
303  checkLength(x, w);
304  checkLength(x, v);
305 
311 
312  doubleN value;
313  typedef typename scalar<RegType>::type Float;
314  typedef typename vector<Float, 2>::type Float2;
315  typedef vector<Float, 2> vec2;
316 
317  Reducer<ReduceType, Float2, RegType> r((Float2)vec2(a), (Float2)vec2(b));
318  ReduceCuda<doubleN, ReduceType, RegType, M, decltype(X), decltype(Y), decltype(Z), decltype(W), decltype(V),
319  Reducer<ReduceType, Float2, RegType>>
320  reduce(value, X, Y, Z, W, V, r, x, y, z, w, v, length);
321  reduce.apply(*(blas::getStream()));
322 
323  blas::bytes += reduce.bytes();
324  blas::flops += reduce.flops();
325 
326  checkCudaError();
327  return value;
328  }
329 
330  /*
331  Wilson
332  double double2 M = 1/12
333  single float4 M = 1/6
334  half short4 M = 6/6
335 
336  Staggered
337  double double2 M = 1/3
338  single float2 M = 1/3
339  half short2 M = 3/3
340  */
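Reading the table above: a Wilson spinor carries 4 spins x 3 colors x 2 reals = 24 reals per site, so in double precision the data streams as double2 and siteUnroll sets M = 12 two-component loads per site, which makes reduce_length / (2 * M) in uni_reduce below equal to the local site count. A quick arithmetic check (the volume here is an arbitrary assumed value):

#include <cassert>
#include <cstdio>

int main()
{
  const int spins = 4, colors = 3, reals_per_complex = 2;
  const int reals_per_site = spins * colors * reals_per_complex; // 24 for a Wilson spinor
  const int M = 12;                                              // double2 loads per site when site-unrolled
  const int volume = 1000;                                       // assumed local volume
  const int reduce_length = volume * reals_per_site;

  assert(reduce_length / (2 * M) == volume); // the grid-stride loop covers one site per element
  printf("grid-stride elements: %d\n", reduce_length / (2 * M));
  return 0;
}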
341 
347  template <typename doubleN, typename ReduceType, template <typename ReducerType, typename Float, typename FloatN> class Reducer,
348  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
349  doubleN uni_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
350  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
351  {
352 
353  checkPrecision(x, y, z, w, v);
354 
355  doubleN value;
356  if (checkLocation(x, y, z, w, v) == QUDA_CUDA_FIELD_LOCATION) {
357 
358  if (!x.isNative()
359  && !((x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_SINGLE_PRECISION)
360  || x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_HALF_PRECISION)) {
361  warningQuda("Device reductions on non-native fields are not supported\n");
362  doubleN value;
363  ::quda::zero(value);
364  return value;
365  }
366 
367  // cannot do site unrolling for arbitrary color (needs JIT)
368  if (siteUnroll && x.Ncolor() != 3) errorQuda("Not supported");
369 
370  int reduce_length = siteUnroll ? x.RealLength() : x.Length();
371 
372  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
373 
374 #if QUDA_PRECISION & 8
375  if (x.Nspin() == 4 || x.Nspin() == 2) { // wilson
376 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID) || defined(GPU_COVDEV)
377  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
378  if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
379  value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
380  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
381 #else
382  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
383 #endif
384  } else if (x.Nspin() == 1) { // staggered
385 #ifdef GPU_STAGGERED_DIRAC
386  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
387  value = nativeReduce<doubleN, ReduceType, double2, double2, double2, M, Reducer, writeX, writeY, writeZ,
388  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
389 #else
390  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
391 #endif
392  } else {
393  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
394  }
395 #else
396  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
397 #endif
398 
399  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
400 
401 #if QUDA_PRECISION & 4
402  if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) { // wilson
403 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
404  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
405  value = nativeReduce<doubleN, ReduceType, float4, float4, float4, M, Reducer, writeX, writeY, writeZ,
406  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (4 * M));
407 #else
408  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
409 #endif
410  } else if (x.Nspin() == 1 || x.Nspin() == 2 || (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER)) {
411 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
412  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
413  if (x.Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
414  value = nativeReduce<doubleN, ReduceType, float2, float2, float2, M, Reducer, writeX, writeY, writeZ,
415  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
416 #else
417  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
418 #endif
419  } else {
420  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
421  }
422 #else
423  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
424 #endif
425 
426  } else if (x.Precision() == QUDA_HALF_PRECISION) { // half precision
427 
428 #if QUDA_PRECISION & 2
429  if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) { // wilson
430 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
431  const int M = 6; // determines how much work per thread to do
432  value = nativeReduce<doubleN, ReduceType, float4, short4, short4, M, Reducer, writeX, writeY, writeZ,
433  writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
434 #else
435  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
436 #endif
437  } else if (x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) { // wilson
438 #if defined(GPU_MULTIGRID)
439  const int M = 12; // determines how much work per thread to do
440  value
441  = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
442  a, b, x, y, z, w, v, y.Volume());
443 #else
444  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
445 #endif
446  } else if (x.Nspin() == 1) { // staggered
447 #ifdef GPU_STAGGERED_DIRAC
448  const int M = 3; // determines how much work per thread to do
449  value = nativeReduce<doubleN, ReduceType, float2, short2, short2, M, Reducer, writeX, writeY, writeZ,
450  writeW, writeV>(a, b, x, y, z, w, v, y.Volume());
451 #else
452  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
453 #endif
454  } else {
455  errorQuda("nSpin=%d is not supported\n", x.Nspin());
456  }
457 #else
458  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
459 #endif
460 
461  } else if (x.Precision() == QUDA_QUARTER_PRECISION) { // quarter precision
462 
463 #if QUDA_PRECISION & 1
464  if (x.Nspin() == 4) { // wilson
465 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV)
466  const int M = 6; // determines how much work per thread to do
467  value
468  = nativeReduce<doubleN, ReduceType, float4, char4, char4, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
469  a, b, x, y, z, w, v, y.Volume());
470 #else
471  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
472 #endif
473  } else if (x.Nspin() == 1) { // staggered
474 #ifdef GPU_STAGGERED_DIRAC
475  const int M = 3; // determines how much work per thread to do
476  value
477  = nativeReduce<doubleN, ReduceType, float2, char2, char2, M, Reducer, writeX, writeY, writeZ, writeW, writeV>(
478  a, b, x, y, z, w, v, y.Volume());
479 #else
480  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
481 #endif
482  } else {
483  errorQuda("nSpin=%d is not supported\n", x.Nspin());
484  }
485 #else
486  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
487 #endif
488 
489  } else {
490  errorQuda("precision=%d is not supported\n", x.Precision());
491  }
492  } else { // fields are on the CPU
493  // we don't have quad precision support on the GPU so use doubleN instead of ReduceType
494  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
495  Reducer<doubleN, double2, double2> r(a, b);
496  value = genericReduce<doubleN, doubleN, double, double, writeX, writeY, writeZ, writeW, writeV,
497  Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
498  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
499  Reducer<doubleN, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
500  value = genericReduce<doubleN, doubleN, float, float, writeX, writeY, writeZ, writeW, writeV,
501  Reducer<doubleN, float2, float2>>(x, y, z, w, v, r);
502  } else {
503  errorQuda("Precision %d not implemented", x.Precision());
504  }
505  }
506 
507  const int Nreduce = sizeof(doubleN) / sizeof(double);
508  reduceDoubleArray((double *)&value, Nreduce);
509 
510  return value;
511  }
512 
518  template <typename doubleN, typename ReduceType, template <typename ReducerType, typename Float, typename FloatN> class Reducer,
519  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
520  doubleN mixed_reduce(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y,
521  ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v)
522  {
523 
524  checkPrecision(x, y, w, v);
525 
526  doubleN value;
527  if (checkLocation(x, y, z, w, v) == QUDA_CUDA_FIELD_LOCATION) {
528 
529  if (!x.isNative()
530  && !(x.Nspin() == 4 && x.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && x.Precision() == QUDA_SINGLE_PRECISION)) {
531  warningQuda("Device reductions on non-native fields are not supported\n");
532  doubleN value;
533  ::quda::zero(value);
534  return value;
535  }
536 
537  // cannot do site unrolling for arbitrary color (needs JIT)
538  if (x.Ncolor() != 3) errorQuda("Not supported");
539 
540  if (z.Precision() == QUDA_DOUBLE_PRECISION) {
541 
542 #if QUDA_PRECISION & 8
543  if (x.Precision() == QUDA_SINGLE_PRECISION) {
544 
545 #if QUDA_PRECISION & 4
546  if (x.Nspin() == 4) { // wilson
547 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
548  const int M = 12; // determines how much work per thread to do
549  value = nativeReduce<doubleN, ReduceType, double2, float4, double2, M, Reducer, writeX, writeY, writeZ,
550  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
551 #else
552  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
553 #endif
554  } else if (x.Nspin() == 1) { // staggered
555 #ifdef GPU_STAGGERED_DIRAC
556  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
557  const int reduce_length = siteUnroll ? x.RealLength() : x.Length();
558  value = nativeReduce<doubleN, ReduceType, double2, float2, double2, M, Reducer, writeX, writeY, writeZ,
559  writeW, writeV>(a, b, x, y, z, w, v, reduce_length / (2 * M));
560 #else
561  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
562 #endif
563  } else {
564  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
565  }
566 #else
567  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
568 #endif
569 
570  } else if (x.Precision() == QUDA_HALF_PRECISION) {
571 
572 #if QUDA_PRECISION & 2
573  if (x.Nspin() == 4) { // wilson
574 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
575  const int M = 12; // determines how much work per thread to do
576  value = nativeReduce<doubleN, ReduceType, double2, short4, double2, M, Reducer, writeX, writeY, writeZ,
577  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
578 #else
579  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
580 #endif
581  } else if (x.Nspin() == 1) { // staggered
582 #ifdef GPU_STAGGERED_DIRAC
583  const int M = 3; // determines how much work per thread to do
584  value = nativeReduce<doubleN, ReduceType, double2, short2, double2, M, Reducer, writeX, writeY, writeZ,
585  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
586 #else
587  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
588 #endif
589  } else {
590  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
591  }
592 #else
593  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
594 #endif
595 
596  } else if (x.Precision() == QUDA_QUARTER_PRECISION) {
597 
598 #if QUDA_PRECISION & 1
599  if (x.Nspin() == 4) { // wilson
600 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
601  const int M = 12; // determines how much work per thread to do
602  value = nativeReduce<doubleN, ReduceType, double2, char4, double2, M, Reducer, writeX, writeY, writeZ,
603  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
604 #else
605  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
606 #endif
607  } else if (x.Nspin() == 1) { // staggered
608 #ifdef GPU_STAGGERED_DIRAC
609  const int M = 3; // determines how much work per thread to do
610  value = nativeReduce<doubleN, ReduceType, double2, char2, double2, M, Reducer, writeX, writeY, writeZ,
611  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
612 #else
613  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
614 #endif
615  } else {
616  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
617  }
618 #else
619  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
620 #endif
621 
622  } else {
623  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
624  }
625 #else
626  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, z.Precision());
627 #endif
628 
629  } else if (z.Precision() == QUDA_SINGLE_PRECISION) {
630 
631 #if QUDA_PRECISION & 4
632  if (x.Precision() == QUDA_HALF_PRECISION) {
633 
634 #if QUDA_PRECISION & 2
635  if (x.Nspin() == 4) { // wilson
636 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
637  const int M = 6;
638  value = nativeReduce<doubleN, ReduceType, float4, short4, float4, M, Reducer, writeX, writeY, writeZ,
639  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
640 #else
641  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
642 #endif
643  } else if (x.Nspin() == 1) { // staggered
644 #ifdef GPU_STAGGERED_DIRAC
645  const int M = 3;
646  value = nativeReduce<doubleN, ReduceType, float2, short2, float2, M, Reducer, writeX, writeY, writeZ,
647  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
648 #else
649  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
650 #endif
651  } else {
652  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
653  }
654  blas::bytes
655  += Reducer<ReduceType, double2, double2>::streams() * (unsigned long long)x.Volume() * sizeof(float);
656 #else
657  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
658 #endif
659 
660  } else if (x.Precision() == QUDA_QUARTER_PRECISION) {
661 #if QUDA_PRECISION & 1
662  if (x.Nspin() == 4) { // wilson
663 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
664  const int M = 6;
665  value = nativeReduce<doubleN, ReduceType, float4, char4, float4, M, Reducer, writeX, writeY, writeZ,
666  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
667 #else
668  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
669 #endif
670  } else if (x.Nspin() == 1) { // staggered
671 #ifdef GPU_STAGGERED_DIRAC
672  const int M = 3;
673  value = nativeReduce<doubleN, ReduceType, float2, char2, float2, M, Reducer, writeX, writeY, writeZ,
674  writeW, writeV>(a, b, x, y, z, w, v, x.Volume());
675 #else
676  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
677 #endif
678  } else {
679  errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
680  }
681  blas::bytes
682  += Reducer<ReduceType, double2, double2>::streams() * (unsigned long long)x.Volume() * sizeof(float);
683 #else
684  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
685 #endif
686  } else {
687  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
688  }
689 #else
690  errorQuda("QUDA_PRECISION=%d does not enable precision %d", QUDA_PRECISION, x.Precision());
691 #endif
692 
693  } else {
694  errorQuda("Not implemented for this precision combination %d %d", x.Precision(), z.Precision());
695  }
696 
697  } else {
698  // we don't have quad precision support on the GPU so use doubleN instead of ReduceType
699  if (x.Precision() == QUDA_SINGLE_PRECISION && z.Precision() == QUDA_DOUBLE_PRECISION) {
700  Reducer<doubleN, double2, double2> r(a, b);
701  value = genericReduce<doubleN, doubleN, float, double, writeX, writeY, writeZ, writeW, writeV,
702  Reducer<doubleN, double2, double2>>(x, y, z, w, v, r);
703  } else {
704  errorQuda("Precision %d not implemented", x.Precision());
705  }
706  }
707 
708  const int Nreduce = sizeof(doubleN) / sizeof(double);
709  reduceDoubleArray((double *)&value, Nreduce);
710 
711  return value;
712  }
713 
714  double norm1(const ColorSpinorField &x)
715  {
716  ColorSpinorField &y = const_cast<ColorSpinorField &>(x); // FIXME
717  return uni_reduce<double, QudaSumFloat, Norm1, 0, 0, 0, 0, 0, false>(
718  make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
719  }
720 
721  double norm2(const ColorSpinorField &x)
722  {
723  ColorSpinorField &y = const_cast<ColorSpinorField &>(x);
724  return uni_reduce<double, QudaSumFloat, Norm2, 0, 0, 0, 0, 0, false>(
725  make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
726  }
727 
728  double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
729  {
730  return uni_reduce<double, QudaSumFloat, Dot, 0, 0, 0, 0, 0, false>(
731  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
732  }
733 
734  double axpbyzNorm(double a, ColorSpinorField &x, double b, ColorSpinorField &y, ColorSpinorField &z)
735  {
736  return uni_reduce<double, QudaSumFloat, axpbyzNorm2, 0, 0, 1, 0, 0, false>(
737  make_double2(a, 0.0), make_double2(b, 0.0), x, y, z, x, x);
738  }
739 
740  double axpyReDot(double a, ColorSpinorField &x, ColorSpinorField &y)
741  {
742  return uni_reduce<double, QudaSumFloat, AxpyReDot, 0, 1, 0, 0, 0, false>(
743  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
744  }
745 
746  double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
747  {
748  return uni_reduce<double, QudaSumFloat, caxpyNorm2, 0, 1, 0, 0, 0, false>(
749  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);
750  }
751 
752  double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
753  {
754  return uni_reduce<double, QudaSumFloat, caxpyxmaznormx, 1, 1, 0, 0, 0, false>(
755  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
756  }
757 
758  double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
759  {
760  return uni_reduce<double, QudaSumFloat, cabxpyzaxnorm, 1, 0, 1, 0, 0, false>(
761  make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, z, x, x);
762  }
763 
764  Complex cDotProduct(ColorSpinorField &x, ColorSpinorField &y)
765  {
766  double2 cdot = uni_reduce<double2, QudaSumFloat2, Cdot, 0, 0, 0, 0, 0, false>(
767  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
768  return Complex(cdot.x, cdot.y);
769  }
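For reference, the wrappers above compute the usual globally-summed inner products over the field elements. A plain host-side analogue over std::complex data shows the quantities involved (purely illustrative; these functions only mirror the names above and are not how QUDA stores or traverses fields):

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

using cplx = std::complex<double>;

double norm2(const std::vector<cplx> &x) // sum_i |x_i|^2
{
  double s = 0.0;
  for (const auto &e : x) s += std::norm(e);
  return s;
}

double reDotProduct(const std::vector<cplx> &x, const std::vector<cplx> &y) // Re <x,y>
{
  double s = 0.0;
  for (std::size_t i = 0; i < x.size(); i++) s += std::real(std::conj(x[i]) * y[i]);
  return s;
}

cplx cDotProduct(const std::vector<cplx> &x, const std::vector<cplx> &y) // <x,y>
{
  cplx s = 0.0;
  for (std::size_t i = 0; i < x.size(); i++) s += std::conj(x[i]) * y[i];
  return s;
}

int main()
{
  std::vector<cplx> x = {{1, 2}, {0, -1}}, y = {{3, 0}, {1, 1}};
  const cplx d = cDotProduct(x, y);
  printf("norm2(x) = %g\n", norm2(x));                         // 6
  printf("reDotProduct(x,y) = %g\n", reDotProduct(x, y));      // 2
  printf("cDotProduct(x,y) = (%g, %g)\n", d.real(), d.imag()); // (2, -5)
  return 0;
}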
770 
771  Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
772  {
773  double2 cdot = uni_reduce<double2, QudaSumFloat2, caxpydotzy, 0, 1, 0, 0, 0, false>(
774  make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
775  return Complex(cdot.x, cdot.y);
776  }
777 
778  double3 cDotProductNormA(ColorSpinorField &x, ColorSpinorField &y) {
779  return uni_reduce<double3, QudaSumFloat3, CdotNormA, 0, 0, 0, 0, 0, false>(
780  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
781  }
782 
783  double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x,
784  const Complex &b, ColorSpinorField &y,
785  ColorSpinorField &z, ColorSpinorField &w,
786  ColorSpinorField &u) {
787  if (x.Precision() != z.Precision()) {
788  return mixed_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
789  make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
790  } else {
791  return uni_reduce<double3, QudaSumFloat3, caxpbypzYmbwcDotProductUYNormY_, 0, 1, 1, 0, 0, false>(
792  make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
793  }
794  }
795 
796  Complex axpyCGNorm(double a, ColorSpinorField &x, ColorSpinorField &y) {
797  // swizzle since mixed is on z
798  double2 cg_norm;
799  if (x.Precision() != y.Precision()) {
800  cg_norm = mixed_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
801  make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
802  } else {
803  cg_norm = uni_reduce<double2, QudaSumFloat2, axpyCGNorm2, 0, 0, 1, 0, 0, false>(
804  make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
805  }
806  return Complex(cg_norm.x, cg_norm.y);
807  }
808 
809  double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r) {
810  // in case of x.Ncolor()!=3 (MG mainly) reduce_core does not support this function.
811  if (x.Ncolor()!=3) return make_double3(0.0, 0.0, 0.0);
812  double3 rtn = uni_reduce<double3, QudaSumFloat3, HeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
813  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
814  rtn.z /= (x.Volume()*comm_size());
815  return rtn;
816  }
817 
818  double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y,
819  ColorSpinorField &r) {
820  // in case of x.Ncolor()!=3 (MG mainly) reduce_core does not support this function.
821  if (x.Ncolor()!=3) return make_double3(0.0, 0.0, 0.0);
822  double3 rtn = uni_reduce<double3, QudaSumFloat3, xpyHeavyQuarkResidualNorm_, 0, 0, 0, 0, 0, true>(
823  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
824  rtn.z /= (x.Volume()*comm_size());
825  return rtn;
826  }
827 
828  double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
829  return uni_reduce<double3, QudaSumFloat3, tripleCGReduction_, 0, 0, 0, 0, 0, false>(
830  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
831  }
832 
833  double4 quadrupleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
834  return uni_reduce<double4, QudaSumFloat4, quadrupleCGReduction_, 0, 0, 0, 0, 0, false>(
835  make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
836  }
837 
838  double quadrupleCG3InitNorm(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v) {
839  return uni_reduce<double, QudaSumFloat, quadrupleCG3InitNorm_, 1, 1, 1, 1, 0, false>(
840  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, w, v);
841  }
842 
843  double quadrupleCG3UpdateNorm(double a, double b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v) {
844  return uni_reduce<double, QudaSumFloat, quadrupleCG3UpdateNorm_, 1, 1, 1, 1, 0, false>(
845  make_double2(a, 0.0), make_double2(b, 1. - b), x, y, z, w, v);
846  }
847 
848  double doubleCG3InitNorm(double a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
849  return uni_reduce<double, QudaSumFloat, doubleCG3InitNorm_, 1, 1, 0, 0, 0, false>(
850  make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, z, z);
851  }
852 
853  double doubleCG3UpdateNorm(double a, double b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) {
854  return uni_reduce<double, QudaSumFloat, doubleCG3UpdateNorm_, 1, 1, 0, 0, 0, false>(
855  make_double2(a, 0.0), make_double2(b, 1.0 - b), x, y, z, z, z);
856  }
857 
858  } // namespace blas
859 
860 } // namespace quda