QUDA  0.9.0
reduce_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
5 
6 //#define QUAD_SUM
7 #ifdef QUAD_SUM
8 #include <dbldbl.h>
9 #endif
10 
11 #include <cub_helper.cuh>
12 #include <algorithm>
13 
// Trait mapping a (possibly vector) reduction type to its underlying
// scalar type, e.g. double2/double3/double4 -> double.
template<typename> struct ScalarType { };
template<> struct ScalarType<double> { typedef double type; };
template<> struct ScalarType<double2> { typedef double type; };
template<> struct ScalarType<double3> { typedef double type; };
template<> struct ScalarType<double4> { typedef double type; };

// Trait mapping a scalar type to its two-component vector counterpart.
template<typename> struct Vec2Type { };
template<> struct Vec2Type<double> { typedef double2 type; };
22 
#ifdef QUAD_SUM
// Use double-double ("quad") arithmetic for the reduction accumulators
// to gain extra precision in the summations (types from dbldbl.h).
// NOTE(review): no QudaSumFloat4 define appears in this branch of the
// listing — confirm against the original source.
#define QudaSumFloat doubledouble
#define QudaSumFloat2 doubledouble2
#define QudaSumFloat3 doubledouble3
template<> struct ScalarType<doubledouble> { typedef doubledouble type; };
template<> struct ScalarType<doubledouble2> { typedef doubledouble type; };
template<> struct ScalarType<doubledouble3> { typedef doubledouble type; };
template<> struct ScalarType<doubledouble4> { typedef doubledouble type; };
template<> struct Vec2Type<doubledouble> { typedef doubledouble2 type; };
#else
// Default: plain double accumulators for all reductions.
#define QudaSumFloat double
#define QudaSumFloat2 double2
#define QudaSumFloat3 double3
#define QudaSumFloat4 double4
#endif
38 
39 
41  if (a.Precision() != b.Precision())
42  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision());
43  if (a.Length() != b.Length())
44  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length());
45  if (a.Stride() != b.Stride())
46  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride());
47 }
48 
50  if (a.Length() != b.Length())
51  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length());
52  if (a.Stride() != b.Stride())
53  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride());
54 }
55 
// Strings describing the current fields, presumably used to build the
// autotuning key for the reduction kernels (volume and auxiliary info).
// NOTE(review): the cross-references mention a `char aux_tmp[...]` member
// that is absent from this listing — confirm against the original source.
static struct {
  const char *vol_str;
  const char *aux_str;
} blasStrings;
61 
// These are used for reduction kernels
// Event created in initReduce() and destroyed in endReduce().
static cudaEvent_t reduceEnd;
67 
68 namespace quda {
69  namespace blas {
70 
    // defined in blas_quda.cu
    cudaStream_t* getStream();

    // Accessors for the shared reduction resources; the buffers and the
    // event are created in initReduce() and released in endReduce().
    void* getDeviceReduceBuffer() { return d_reduce; }
    void* getHostReduceBuffer() { return h_reduce; }
    cudaEvent_t* getReduceEvent() { return &reduceEnd; }
77 
    /**
       Allocate the host/device buffers and the completion event shared by
       all reduction kernels.  Idempotent with respect to h_reduce: the
       host-side allocation only happens on first call.
    */
    void initReduce()
    {
      /* we have these different reductions to cater for:

         - regular reductions (reduce_quda.cu) where are reducing to a
           single vector type (max length 4 presently), with possibly
           parity dimension, and a grid-stride loop with max number of
           blocks = 2 x SM count

         - multi-reductions where we are reducing to a matrix of size
           of size MAX_MULTI_BLAS_N^2 of vectors (max length 4), with
           possible parity dimension, and a grid-stride loop with
           maximum number of blocks = 2 x SM count

         - inline reductions in kernels where we cannot assume a grid
           stride loop - hence max blocks is given by the architecture
           limit

      */

      const int max_reduce_blocks = 2*deviceProp.multiProcessorCount; // FIXME - should set this according to what's used in tune_quda.h

      const int max_reduce = 2 * max_reduce_blocks * 4 * sizeof(QudaSumFloat);
      const int max_multi_reduce = 2 * MAX_MULTI_BLAS_N * MAX_MULTI_BLAS_N * max_reduce_blocks * 4 * sizeof(QudaSumFloat);

      const int max_generic_blocks = 65336; // FIXME - this isn't quite right
      const int max_generic_reduce = 2 * MAX_MULTI_BLAS_N * max_generic_blocks * 4 * sizeof(QudaSumFloat);

      // reduction buffer size
      size_t bytes = std::max(std::max(max_reduce, max_multi_reduce), max_generic_reduce);

      // NOTE(review): the allocation of the device buffer (d_reduce) appears
      // to have been lost from this listing — confirm against the original.

      // these arrays are actually oversized currently (only needs to be QudaSumFloat3)

      // if the device supports host-mapped memory then use a host-mapped array for the reduction
      if (!h_reduce) {
        // only use zero copy reductions when using 64-bit
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
        if(deviceProp.canMapHostMemory) {
          // NOTE(review): the mapped allocation of h_reduce is missing from
          // this listing; the call below presupposes it succeeded
          cudaHostGetDevicePointer(&hd_reduce, h_reduce, 0); // set the matching device pointer
        } else
#endif
        {
          // NOTE(review): the fallback (pinned) allocation of h_reduce and
          // the corresponding hd_reduce assignment are missing from this listing
        }
        memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report h_reduce is uninitialised
      }

      cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);

      checkCudaError();
    }
133 
    /**
       Release the reduction resources allocated in initReduce() and
       destroy the completion event.
    */
    void endReduce(void)
    {
      if (d_reduce) {
        // NOTE(review): the device-free call for d_reduce appears to be
        // missing from this listing — confirm against the original
        d_reduce = 0;
      }
      if (h_reduce) {
        // NOTE(review): the host-free call for h_reduce appears to be
        // missing from this listing — confirm against the original
        h_reduce = 0;
      }
      hd_reduce = 0;

      cudaEventDestroy(reduceEnd);
    }
148 
    namespace reduce {

    // The reduction kernels, launch wrappers and tuning machinery are
    // textually included here so they are instantiated inside this namespace.
#include <texture.h>
#include <reduce_core.cuh>
#include <reduce_core.h>
#include <reduce_mixed_core.h>

    } // namespace reduce
157 
161  template <typename ReduceType, typename Float2, typename FloatN>
162  struct ReduceFunctor {
163 
165  virtual __device__ __host__ void pre() { ; }
166 
168  virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
169  FloatN &z, FloatN &w, FloatN &v) = 0;
170 
172  virtual __device__ __host__ void post(ReduceType &sum) { ; }
173 
174  };
175 
179  template<typename ReduceType> __device__ __host__ ReduceType norm1_(const double2 &a) {
180  return (ReduceType)fabs(a.x) + (ReduceType)fabs(a.y);
181  }
182 
183  template<typename ReduceType> __device__ __host__ ReduceType norm1_(const float2 &a) {
184  return (ReduceType)fabs(a.x) + (ReduceType)fabs(a.y);
185  }
186 
187  template<typename ReduceType> __device__ __host__ ReduceType norm1_(const float4 &a) {
188  return (ReduceType)fabs(a.x) + (ReduceType)fabs(a.y) + (ReduceType)fabs(a.z) + (ReduceType)fabs(a.w);
189  }
190 
191  template <typename ReduceType, typename Float2, typename FloatN>
192  struct Norm1 : public ReduceFunctor<ReduceType, Float2, FloatN> {
193  Norm1(const Float2 &a, const Float2 &b) { ; }
194  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,FloatN &w, FloatN &v)
195  { sum += norm1_<ReduceType>(x); }
196  static int streams() { return 1; }
197  static int flops() { return 2; }
198  };
199 
    /**
       Compute the L1 norm of x.  Only available in HOST_DEBUG builds;
       otherwise reports an error and returns 0.
    */
    double norm1(const ColorSpinorField &x) {
#ifdef HOST_DEBUG
      // reduceCuda takes non-const references, hence the cast
      ColorSpinorField &y = const_cast<ColorSpinorField&>(x); // FIXME
      return reduce::reduceCuda<double,QudaSumFloat,Norm1,0,0,0,0,0,false>
        (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
#else
      errorQuda("L1 norm kernel only built when HOST_DEBUG is enabled");
      return 0.0;
#endif
    }
210 
214  template<typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const double2 &a) {
215  sum += (ReduceType)a.x*(ReduceType)a.x;
216  sum += (ReduceType)a.y*(ReduceType)a.y;
217  }
218 
219  template<typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const float2 &a) {
220  sum += (ReduceType)a.x*(ReduceType)a.x;
221  sum += (ReduceType)a.y*(ReduceType)a.y;
222  }
223 
224  template<typename ReduceType> __device__ __host__ void norm2_(ReduceType &sum, const float4 &a) {
225  sum += (ReduceType)a.x*(ReduceType)a.x;
226  sum += (ReduceType)a.y*(ReduceType)a.y;
227  sum += (ReduceType)a.z*(ReduceType)a.z;
228  sum += (ReduceType)a.w*(ReduceType)a.w;
229  }
230 
231 
232  template <typename ReduceType, typename Float2, typename FloatN>
233  struct Norm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
234  Norm2(const Float2 &a, const Float2 &b) { ; }
235  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z,FloatN &w, FloatN &v)
236  { norm2_<ReduceType>(sum,x); }
237  static int streams() { return 1; }
238  static int flops() { return 2; }
239  };
240 
241  double norm2(const ColorSpinorField &x) {
242  ColorSpinorField &y = const_cast<ColorSpinorField&>(x);
243  return reduce::reduceCuda<double,QudaSumFloat,Norm2,0,0,0,0,0,false>
244  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), y, y, y, y, y);
245  }
246 
247 
251  template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b) {
252  sum += (ReduceType)a.x*(ReduceType)b.x;
253  sum += (ReduceType)a.y*(ReduceType)b.y;
254  }
255 
256  template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b) {
257  sum += (ReduceType)a.x*(ReduceType)b.x;
258  sum += (ReduceType)a.y*(ReduceType)b.y;
259  }
260 
261  template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b) {
262  sum += (ReduceType)a.x*(ReduceType)b.x;
263  sum += (ReduceType)a.y*(ReduceType)b.y;
264  sum += (ReduceType)a.z*(ReduceType)b.z;
265  sum += (ReduceType)a.w*(ReduceType)b.w;
266  }
267 
268  template <typename ReduceType, typename Float2, typename FloatN>
269  struct Dot : public ReduceFunctor<ReduceType, Float2, FloatN> {
270  Dot(const Float2 &a, const Float2 &b) { ; }
271  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
272  { dot_<ReduceType>(sum,x,y); }
273  static int streams() { return 2; }
274  static int flops() { return 2; }
275  };
276 
278  return reduce::reduceCuda<double,QudaSumFloat,Dot,0,0,0,0,0,false>
279  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
280  }
281 
282 
287  template<typename ReduceType, typename InputType>
288  __device__ __host__ ReduceType dotNormA_(const InputType &a, const InputType &b) {
289  typedef typename ScalarType<ReduceType>::type scalar;
290  ReduceType c;
291  dot_<scalar>(c.x,a,b);
292  norm2_<scalar>(c.y,a);
293  return c;
294  }
295 
296  template <typename ReduceType, typename Float2, typename FloatN>
297  struct DotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
298  DotNormA(const Float2 &a, const Float2 &b){}
299  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
300  {sum += dotNormA_<ReduceType,FloatN>(x,y);}
301  static int streams() { return 2; }
302  static int flops() { return 4; }
303  };
304 
306  return reduce::reduceCuda<double2,QudaSumFloat2,DotNormA,0,0,0,0,0,false>
307  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
308  }
309 
310 
315  template <typename ReduceType, typename Float2, typename FloatN>
316  struct axpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
317  Float2 a;
318  axpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
319  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
320  y += a.x*x; norm2_<ReduceType>(sum,y); }
321  static int streams() { return 3; }
322  static int flops() { return 4; }
323  };
324 
325  double axpyNorm(const double &a, ColorSpinorField &x, ColorSpinorField &y) {
326  return reduce::reduceCuda<double,QudaSumFloat,axpyNorm2,0,1,0,0,0,false>
327  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
328  }
329 
330 
335  template <typename ReduceType, typename Float2, typename FloatN>
336  struct AxpyReDot : public ReduceFunctor<ReduceType, Float2, FloatN> {
337  Float2 a;
338  AxpyReDot(const Float2 &a, const Float2 &b) : a(a) { ; }
339  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
340  y += a.x*x; dot_<ReduceType>(sum,x,y); }
341  static int streams() { return 3; }
342  static int flops() { return 4; }
343  };
344 
345  double axpyReDot(const double &a, ColorSpinorField &x, ColorSpinorField &y) {
346  return reduce::reduceCuda<double,QudaSumFloat,AxpyReDot,0,1,0,0,0,false>
347  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
348  }
349 
350 
355  template <typename ReduceType, typename Float2, typename FloatN>
356  struct xmyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
357  xmyNorm2(const Float2 &a, const Float2 &b) { ; }
358  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
359  y = x - y; norm2_<ReduceType>(sum,y); }
360  static int streams() { return 3; }
361  static int flops() { return 3; }
362  };
363 
365  return reduce::reduceCuda<double,QudaSumFloat,xmyNorm2,0,1,0,0,0,false>
366  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
367  }
368 
369 
    /**
       Complex "axpy" helper: y += a*x on packed complex data, i.e.
         y.re += a.re*x.re - a.im*x.im
         y.im += a.im*x.re + a.re*x.im
    */
    __device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y) {
      y.x += a.x*x.x; y.x -= a.y*x.y;
      y.y += a.y*x.x; y.y += a.x*x.y;
    }
    __device__ __host__ void Caxpy_(const float2 &a, const float2 &x, float2 &y) {
      y.x += a.x*x.x; y.x -= a.y*x.y;
      y.y += a.y*x.x; y.y += a.x*x.y;
    }
    // float4 variant: (x,y) and (z,w) hold two packed complex numbers,
    // both scaled by the same complex coefficient a
    __device__ __host__ void Caxpy_(const float2 &a, const float4 &x, float4 &y) {
      y.x += a.x*x.x; y.x -= a.y*x.y;
      y.y += a.y*x.x; y.y += a.x*x.y;
      y.z += a.x*x.z; y.z -= a.y*x.w;
      y.w += a.y*x.z; y.w += a.x*x.w;
    }
387 
392  template <typename ReduceType, typename Float2, typename FloatN>
393  struct caxpyNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
394  Float2 a;
395  caxpyNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
396  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
397  Caxpy_(a, x, y); norm2_<ReduceType>(sum,y); }
398  static int streams() { return 3; }
399  static int flops() { return 6; }
400  };
401 
403  return reduce::reduceCuda<double,QudaSumFloat,caxpyNorm2,0,1,0,0,0,false>
404  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, x, x, x);
405  }
406 
407 
    /**
       Functor performing, in one pass:
         y += a*x   (complex axpy)
         x -= a*z   (via Caxpy_ with -a; relies on unary minus for Float2,
                     presumably defined in float_vector.h — confirm)
       and accumulating the squared norm of the updated x into sum.
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct caxpyxmaznormx : public ReduceFunctor<ReduceType, Float2, FloatN> {
      Float2 a; // complex coefficient (a.x = real, a.y = imaginary)
      caxpyxmaznormx(const Float2 &a, const Float2 &b) : a(a) { ; }
      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
      { Caxpy_(a, x, y); Caxpy_(-a,z,x); norm2_<ReduceType>(sum,x); }
      static int streams() { return 5; }
      static int flops() { return 10; }
    };
423 
426  return reduce::reduceCuda<double,QudaSumFloat,caxpyxmaznormx,1,1,0,0,0,false>
427  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
428  }
429 
430 
437  template <typename ReduceType, typename Float2, typename FloatN>
438  struct cabxpyaxnorm : public ReduceFunctor<ReduceType, Float2, FloatN> {
439  Float2 a;
440  Float2 b;
441  cabxpyaxnorm(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
442  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
443  { x *= a.x; Caxpy_(b, x, y); norm2_<ReduceType>(sum,y); }
444  static int streams() { return 4; }
445  static int flops() { return 10; }
446  };
447 
448 
449  double cabxpyAxNorm(const double &a, const Complex &b,
451  return reduce::reduceCuda<double,QudaSumFloat,cabxpyaxnorm,1,1,0,0,0,false>
452  (make_double2(a, 0.0), make_double2(REAL(b), IMAG(b)), x, y, x, x, x);
453  }
454 
455 
    /**
       Accumulate the complex inner product with the first argument
       conjugated:
         sum.x += Re(conj(a) b) = a.re*b.re + a.im*b.im
         sum.y += Im(conj(a) b) = a.re*b.im - a.im*b.re
    */
    template<typename ReduceType>
    __device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b) {
      typedef typename ScalarType<ReduceType>::type scalar;
      sum.x += (scalar)a.x*(scalar)b.x;
      sum.x += (scalar)a.y*(scalar)b.y;
      sum.y += (scalar)a.x*(scalar)b.y;
      sum.y -= (scalar)a.y*(scalar)b.x;
    }

    template<typename ReduceType>
    __device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b) {
      typedef typename ScalarType<ReduceType>::type scalar;
      sum.x += (scalar)a.x*(scalar)b.x;
      sum.x += (scalar)a.y*(scalar)b.y;
      sum.y += (scalar)a.x*(scalar)b.y;
      sum.y -= (scalar)a.y*(scalar)b.x;
    }

    // float4 variant: (x,y) and (z,w) hold two packed complex numbers,
    // whose contributions are both accumulated
    template<typename ReduceType>
    __device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b) {
      typedef typename ScalarType<ReduceType>::type scalar;
      sum.x += (scalar)a.x*(scalar)b.x;
      sum.x += (scalar)a.y*(scalar)b.y;
      sum.x += (scalar)a.z*(scalar)b.z;
      sum.x += (scalar)a.w*(scalar)b.w;
      sum.y += (scalar)a.x*(scalar)b.y;
      sum.y -= (scalar)a.y*(scalar)b.x;
      sum.y += (scalar)a.z*(scalar)b.w;
      sum.y -= (scalar)a.w*(scalar)b.z;
    }
489 
490  template <typename ReduceType, typename Float2, typename FloatN>
491  struct Cdot : public ReduceFunctor<ReduceType, Float2, FloatN> {
492  Cdot(const Float2 &a, const Float2 &b) { ; }
493  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
494  { cdot_<ReduceType>(sum,x,y); }
495  static int streams() { return 2; }
496  static int flops() { return 4; }
497  };
498 
499 
501  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,Cdot,0,0,0,0,0,false>
502  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
503  return Complex(cdot.x, cdot.y);
504  }
505 
511  template <typename ReduceType, typename Float2, typename FloatN>
512  struct xpaycdotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
513  Float2 a;
514  xpaycdotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
515  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
516  { y = x + a.x*y; cdot_<ReduceType>(sum,z,y); }
517  static int streams() { return 4; }
518  static int flops() { return 6; }
519  };
520 
522  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,xpaycdotzy,0,1,0,0,0,false>
523  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
524  return Complex(cdot.x, cdot.y);
525  }
526 
527 
533  template <typename ReduceType, typename Float2, typename FloatN>
534  struct caxpydotzy : public ReduceFunctor<ReduceType, Float2, FloatN> {
535  Float2 a;
536  caxpydotzy(const Float2 &a, const Float2 &b) : a(a) { ; }
537  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
538  { Caxpy_(a, x, y); cdot_<ReduceType>(sum,z,y); }
539  static int streams() { return 4; }
540  static int flops() { return 8; }
541  };
542 
543 
545  double2 cdot = reduce::reduceCuda<double2,QudaSumFloat2,caxpydotzy,0,1,0,0,0,false>
546  (make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0), x, y, z, x, x);
547  return Complex(cdot.x, cdot.y);
548  }
549 
550 
555  template<typename ReduceType, typename InputType>
556  __device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b) {
557  typedef typename ScalarType<ReduceType>::type scalar;
558  typedef typename Vec2Type<scalar>::type vec2;
559  cdot_<ReduceType>(sum,a,b);
560  norm2_<scalar>(sum.z,a);
561  }
562 
563  template <typename ReduceType, typename Float2, typename FloatN>
564  struct CdotNormA : public ReduceFunctor<ReduceType, Float2, FloatN> {
565  CdotNormA(const Float2 &a, const Float2 &b) { ; }
566  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
567  { cdotNormA_<ReduceType>(sum,x,y); }
568  static int streams() { return 2; }
569  static int flops() { return 6; }
570  };
571 
573  return reduce::reduceCuda<double3,QudaSumFloat3,CdotNormA,0,0,0,0,0,false>
574  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
575  }
576 
577 
582  template<typename ReduceType, typename InputType>
583  __device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b) {
584  typedef typename ScalarType<ReduceType>::type scalar;
585  typedef typename Vec2Type<scalar>::type vec2;
586  cdot_<ReduceType>(sum,a,b);
587  norm2_<scalar>(sum.z,b);
588  }
589 
590  template <typename ReduceType, typename Float2, typename FloatN>
591  struct CdotNormB : public ReduceFunctor<ReduceType, Float2, FloatN> {
592  CdotNormB(const Float2 &a, const Float2 &b) { ; }
593  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
594  { cdotNormB_<ReduceType>(sum,x,y); }
595  static int streams() { return 2; }
596  static int flops() { return 6; }
597  };
598 
600  return reduce::reduceCuda<double3,QudaSumFloat3,CdotNormB,0,0,0,0,0,false>
601  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, x, x);
602  }
603 
604 
    /**
       Single-pass functor performing
         z += a*x,  z += b*y,  y -= b*w
       (the last via Caxpy_ with -b) and then accumulating
       conj(v).y into (sum.x, sum.y) and ||y||^2 into sum.z via cdotNormB_.
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct caxpbypzYmbwcDotProductUYNormY_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
      Float2 a; // complex coefficient a (a.x = real, a.y = imaginary)
      Float2 b; // complex coefficient b (b.x = real, b.y = imaginary)
      caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b) : a(a), b(b) { ; }
      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) { Caxpy_(a, x, z); Caxpy_(b, y, z); Caxpy_(-b, w, y); cdotNormB_<ReduceType>(sum,v,y); }
      static int streams() { return 7; }
      static int flops() { return 18; }
    };
618 
620  const Complex &b, ColorSpinorField &y,
622  ColorSpinorField &u) {
623  if (x.Precision() != z.Precision()) {
624  return reduce::mixed::reduceCuda<double3,QudaSumFloat3,caxpbypzYmbwcDotProductUYNormY_,0,1,1,0,0,false>
625  (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
626  } else {
627  return reduce::reduceCuda<double3,QudaSumFloat3,caxpbypzYmbwcDotProductUYNormY_,0,1,1,0,0,false>
628  (make_double2(REAL(a), IMAG(a)), make_double2(REAL(b), IMAG(b)), x, y, z, w, u);
629  }
630  }
631 
632 
    /**
       Functor performing z += a.x*x while accumulating, into the two
       components of sum:
         sum.x += ||z_new||^2
         sum.y += Re( z_new . (z_new - z) )
       where z_new = z + a.x*x, with z overwritten by z_new at the end.
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct axpyCGNorm2 : public ReduceFunctor<ReduceType, Float2, FloatN> {
      Float2 a; // real coefficient held in a.x
      axpyCGNorm2(const Float2 &a, const Float2 &b) : a(a) { ; }
      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
        typedef typename ScalarType<ReduceType>::type scalar;
        FloatN z_new = z + a.x*x;
        norm2_<scalar>(sum.x,z_new);
        dot_<scalar>(sum.y,z_new,z_new-z);
        z = z_new;
      }
      static int streams() { return 3; }
      static int flops() { return 6; }
    };
653 
655  // swizzle since mixed is on z
656  double2 cg_norm ;
657  if (x.Precision() != y.Precision()) {
658  cg_norm = reduce::mixed::reduceCuda<double2,QudaSumFloat2,axpyCGNorm2,0,0,1,0,0,false>
659  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
660  } else {
661  cg_norm = reduce::reduceCuda<double2,QudaSumFloat2,axpyCGNorm2,0,0,1,0,0,false>
662  (make_double2(a, 0.0), make_double2(0.0, 0.0), x, x, y, x, x);
663  }
664  return Complex(cg_norm.x, cg_norm.y);
665  }
666 
667 
    /**
       Functor for the heavy-quark residual norm.  Per site (bracketed by
       pre()/post()): aux.x accumulates ||x||^2 and aux.y accumulates
       ||y||^2; post() then adds the site totals into sum.x/sum.y and the
       per-site ratio aux.y/aux.x (or 1 when aux.x == 0) into sum.z.
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct HeavyQuarkResidualNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
      // NOTE(review): `scalar` trait here differs from the local ScalarType
      // above — presumably provided by cub_helper.cuh; confirm.
      typedef typename scalar<ReduceType>::type real;
      Float2 a; // unused; kept for the common functor interface
      Float2 b; // unused; kept for the common functor interface
      ReduceType aux; // per-site accumulator (only .x and .y are used)
      HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b) : a(a), b(b), aux{ } { ; }

      //! reset the per-site accumulators before each site is processed
      __device__ __host__ void pre() { aux.x = 0; aux.y = 0; }

      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
        norm2_<real>(aux.x,x); norm2_<real>(aux.y,y);
      }

      //! fold the per-site totals and ratio into the global sum
      __device__ __host__ void post(ReduceType &sum)
      {
        sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
      }

      static int streams() { return 2; }
      static int flops() { return 4; }
    };
702 
704  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,HeavyQuarkResidualNorm_,0,0,0,0,0,true>
705  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, r, r, r, r);
706  rtn.z /= (x.Volume()*comm_size());
707  return rtn;
708  }
709 
710 
    /**
       Variant of HeavyQuarkResidualNorm_ that forms the solution on the
       fly: per site, aux.x accumulates ||x + y||^2 and aux.y accumulates
       ||z||^2; post() adds the totals into sum.x/sum.y and the per-site
       ratio aux.y/aux.x (or 1 when aux.x == 0) into sum.z.
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct xpyHeavyQuarkResidualNorm_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
      // NOTE(review): `scalar` trait presumably from cub_helper.cuh — confirm.
      typedef typename scalar<ReduceType>::type real;
      Float2 a; // unused; kept for the common functor interface
      Float2 b; // unused; kept for the common functor interface
      ReduceType aux; // per-site accumulator (only .x and .y are used)
      xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b) : a(a), b(b), aux{ } { ; }

      //! reset the per-site accumulators before each site is processed
      __device__ __host__ void pre() { aux.x = 0; aux.y = 0; }

      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
        norm2_<real>(aux.x,x + y); norm2_<real>(aux.y,z);
      }

      //! fold the per-site totals and ratio into the global sum
      __device__ __host__ void post(ReduceType &sum)
      {
        sum.x += aux.x; sum.y += aux.y; sum.z += (aux.x > 0.0) ? (aux.y / aux.x) : static_cast<real>(1.0);
      }

      static int streams() { return 3; }
      static int flops() { return 5; }
    };
741 
743  ColorSpinorField &r) {
744  double3 rtn = reduce::reduceCuda<double3,QudaSumFloat3,xpyHeavyQuarkResidualNorm_,0,0,0,0,0,true>
745  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, r, r, r);
746  rtn.z /= (x.Volume()*comm_size());
747  return rtn;
748  }
749 
    /**
       Single-pass functor accumulating three CG quantities:
         sum.x += ||x||^2,  sum.y += ||y||^2,  sum.z += Re(y . z)
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct tripleCGReduction_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
      tripleCGReduction_(const Float2 &a, const Float2 &b) { ; } // coefficients unused
      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
        typedef typename ScalarType<ReduceType>::type scalar;
        norm2_<scalar>(sum.x,x); norm2_<scalar>(sum.y,y); dot_<scalar>(sum.z,y,z);
      }
      static int streams() { return 3; }
      static int flops() { return 6; }
    };
766 
768  return reduce::reduceCuda<double3,QudaSumFloat3,tripleCGReduction_,0,0,0,0,0,false>
769  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
770  }
771 
772 
773 #ifdef ALTRELIABLE
774 
    /**
       Single-pass functor accumulating four CG quantities:
         sum.x += ||x||^2,  sum.y += ||y||^2,
         sum.z += Re(y . z), sum.w += ||w||^2
    */
    template <typename ReduceType, typename Float2, typename FloatN>
    struct quadrupleCGReduction_ : public ReduceFunctor<ReduceType, Float2, FloatN> {
      quadrupleCGReduction_(const Float2 &a, const Float2 &b) { ; } // coefficients unused
      __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v) {
        typedef typename ScalarType<ReduceType>::type scalar;
        norm2_<scalar>(sum.x,x); norm2_<scalar>(sum.y,y); dot_<scalar>(sum.z,y,z); norm2_<scalar>(sum.w,w);
      }
      static int streams() { return 3; }
      static int flops() { return 8; }
    };
791 
793  return reduce::reduceCuda<double4,QudaSumFloat4,quadrupleCGReduction_,0,0,0,0,0,false>
794  (make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, z, x, x);
795  }
796 
797 #endif
798 
799  } // namespace blas
800 
801 } // namespace quda
AxpyReDot(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:338
scalar< ReduceType >::type real
Definition: reduce_quda.cu:720
__device__ __host__ ReduceType dotNormA_(const InputType &a, const InputType &b)
Definition: reduce_quda.cu:288
static struct @8 blasStrings
__device__ __host__ void cdotNormA_(ReduceType &sum, const InputType &a, const InputType &b)
Definition: reduce_quda.cu:556
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:319
#define pinned_malloc(size)
Definition: malloc_quda.h:55
void * getHostReduceBuffer()
Definition: reduce_quda.cu:75
double3 cDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:572
double caxpyNorm(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:402
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:614
caxpyNorm2(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:395
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:399
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:235
cudaDeviceProp deviceProp
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:689
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
Definition: reduce_quda.cu:165
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:593
char aux_tmp[quda::TuneKey::aux_n]
Definition: reduce_quda.cu:59
__device__ __host__ void cdotNormB_(ReduceType &sum, const InputType &a, const InputType &b)
Definition: reduce_quda.cu:583
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:339
#define errorQuda(...)
Definition: util_quda.h:90
static int streams()
Definition: reduce_quda.cu:341
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:616
#define host_free(ptr)
Definition: malloc_quda.h:59
axpyNorm2(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:318
caxpydotzy(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:536
static int streams()
Definition: reduce_quda.cu:273
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * streams
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:342
__device__ __host__ ReduceType norm1_(const double2 &a)
Definition: reduce_quda.cu:179
Norm2(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:234
double3 xpyHeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &r)
Definition: reduce_quda.cu:742
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:49
Cdot(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:492
double axpyNorm(const double &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:325
const char * aux_str
Definition: reduce_quda.cu:58
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:277
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:358
static int streams()
Definition: reduce_quda.cu:495
void * getMappedHostReduceBuffer()
Definition: reduce_quda.cu:74
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:540
tripleCGReduction_(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:758
double xmyNorm(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:364
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:442
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:596
CdotNormA(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:565
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:518
scalar< ReduceType >::type real
Definition: reduce_quda.cu:681
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:759
static int streams()
Definition: reduce_quda.cu:321
#define QudaSumFloat
Definition: reduce_quda.cu:33
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:739
#define b
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:194
cudaStream_t * getStream()
Definition: blas_quda.cu:75
static cudaEvent_t reduceEnd
Definition: reduce_quda.cu:66
Norm1(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:193
int comm_size(void)
Definition: comm_mpi.cpp:126
#define IMAG(a)
Definition: blas_quda.h:14
double cabxpyAxNorm(const double &a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:449
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:728
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:700
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:764
__host__ __device__ void sum(double &a, double &b)
int int int w
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:322
void initReduce()
Definition: reduce_quda.cu:78
xpyHeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:724
Complex axpyCGNorm(const double &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:654
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:396
double4 quadrupleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:197
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:299
static int flops()
Definition: reduce_quda.cu:302
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:238
double3 HeavyQuarkResidualNorm(ColorSpinorField &x, ColorSpinorField &r)
Definition: reduce_quda.cu:703
Dot(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:270
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
Definition: reduce_quda.cu:172
cudaEvent_t * getReduceEvent()
Definition: reduce_quda.cu:76
double caxpyXmazNormX(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:424
static int streams()
Definition: reduce_quda.cu:237
Complex caxpyDotzy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:544
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:569
CdotNormB(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:592
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
Definition: reduce_quda.cu:733
HeavyQuarkResidualNorm_(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:685
void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
Definition: reduce_quda.cu:40
#define REAL(a)
Definition: blas_quda.h:13
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:515
void * memset(void *__b, int __c, size_t __len)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:537
static int streams()
Definition: reduce_quda.cu:595
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:496
double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, ColorSpinorField &x, const Complex &b, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &u)
Definition: reduce_quda.cu:619
cabxpyaxnorm(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:441
__device__ __host__ void Caxpy_(const double2 &a, const double2 &x, double2 &y)
Definition: reduce_quda.cu:373
double norm1(const ColorSpinorField &b)
Definition: reduce_quda.cu:200
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:271
const char * vol_str
Definition: reduce_quda.cu:57
virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)=0
where the reduction is usually computed and any auxiliary operations
double axpyReDot(const double &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:345
xmyNorm2(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:357
caxpbypzYmbwcDotProductUYNormY_(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:613
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
static QudaSumFloat * h_reduce
Definition: reduce_quda.cu:64
#define MAX_MULTI_BLAS_N
Definition: quda_internal.h:49
xpaycdotzy(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:514
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:418
Complex xpaycDotzy(ColorSpinorField &x, const double &a, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:521
__device__ __host__ void norm2_(ReduceType &sum, const double2 &a)
Definition: reduce_quda.cu:214
static const int aux_n
Definition: tune_key.h:12
void * getDeviceReduceBuffer()
Definition: reduce_quda.cu:73
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:361
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:651
static int streams()
Definition: reduce_quda.cu:196
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
Definition: reduce_quda.cu:687
double fabs(double)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:643
unsigned long long flops
Definition: blas_quda.cu:42
DotNormA(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:298
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:274
#define device_malloc(size)
Definition: malloc_quda.h:52
caxpyxmaznormx(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:417
const void * c
static QudaSumFloat * d_reduce
Definition: reduce_quda.cu:63
axpyCGNorm2(const Float2 &a, const Float2 &b)
Definition: reduce_quda.cu:642
static int streams()
Definition: reduce_quda.cu:360
static int streams()
Definition: reduce_quda.cu:301
static int streams()
Definition: reduce_quda.cu:568
__device__ __host__ void pre()
pre-computation routine called before the "M-loop"
Definition: reduce_quda.cu:726
#define checkCudaError()
Definition: util_quda.h:129
#define mapped_malloc(size)
Definition: malloc_quda.h:56
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:566
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:421
double3 cDotProductNormB(ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:599
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, FloatN &v)
where the reduction is usually computed and any auxiliary operations
Definition: reduce_quda.cu:493
#define a
static int flops()
total number of input and output streams
Definition: reduce_quda.cu:445
static QudaSumFloat * hd_reduce
Definition: reduce_quda.cu:65
double2 reDotProductNormA(ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:305
__device__ __host__ void post(ReduceType &sum)
sum the solution and residual norms, and compute the heavy-quark norm
Definition: reduce_quda.cu:694
void endReduce()
Definition: reduce_quda.cu:134
unsigned long long bytes
Definition: blas_quda.cu:43
double3 tripleCGReduction(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z)
Definition: reduce_quda.cu:767
#define device_free(ptr)
Definition: malloc_quda.h:57